From 147430e0eba53dcfa7821eff538042357918a109 Mon Sep 17 00:00:00 2001
From: Vishak-Bhat30 <vishak.bhat5@gmail.com>
Date: Tue, 17 Feb 2026 10:24:15 +0000
Subject: [PATCH 01/17] weithc GPU

---
 .../game24_stepverifier.py                    | 225 +++----
 interwhen/monitors/__init__.py                |   3 +-
 interwhen/monitors/thinkingPhaseVerifier.py   | 573 ++++++++++++++++++
 3 files changed, 646 insertions(+), 155 deletions(-)
 create mode 100644 interwhen/monitors/thinkingPhaseVerifier.py

diff --git a/examples/TTSwithVerification/game24_stepverifier.py b/examples/TTSwithVerification/game24_stepverifier.py
index a6437ea1..b23c88fb 100644
--- a/examples/TTSwithVerification/game24_stepverifier.py
+++ b/examples/TTSwithVerification/game24_stepverifier.py
@@ -12,7 +12,7 @@
 from transformers import AutoTokenizer
 
 from interwhen import stream_completion
-from interwhen.monitors import KstableAnswerGame24Monitor, StepVerifierGame24Monitor
+from interwhen.monitors import ThinkingPhaseStepVerifierGame24Monitor
 
 # ============== MODEL CONFIGURATION ==============
 # Change these model names to scale experiments easily
@@ -88,148 +88,22 @@ def init_llm_server(modelname, max_tokens=200, port=8000):
     return {"url": url, "payload": payload, "headers": headers}
 
 
-def build_meta_prompt_from_example(nums):
-    """Build the system and user prompts for Game of 24 with step verification format."""
+def build_prompt(nums):
+    """Build a simple prompt for Game of 24."""
     a, b, c, d = nums
-    
-    system_prompt = r"""You are solving the Game of 24.
-
-GAME RULES:
-- You are given four numbers
-- Use ALL four numbers exactly once
-- Use ONLY the operations: +, -, *, /
-- The final expression must evaluate to exactly 24
-
-OUTPUT FORMAT:
-You must follow this EXACT structured format for your solution:
-
->Step1
-available numbers: [a, b, c, d]
-suggested operation: a * b = result1
-remaining numbers: [result1, c, d]
-
->Step2
-available numbers: [result1, c, d]
-suggested operation: result1 + c = result2
-remaining numbers: [result2, d]
-
->Step3
-available numbers: [result2, d]
-suggested operation: result2 - d = 24
-remaining numbers: [24]
-
-> Final expression: \boxed{expression using original numbers}
-
-IMPORTANT RULES:
-1. Each step MUST show the available numbers at the start
-2. Each step MUST show the suggested operation with its result
-3. Each step MUST show the remaining numbers after the operation
-4. Continue until you reach exactly 24
-5. The final expression inside \boxed{} must use the ORIGINAL numbers
-6. If you receive VERIFIER FEEDBACK, immediately provide a corrected step - do NOT restart your thinking
-
-═══════════════════════════════════════════════════════════════════════════════
-EXAMPLE 1: Numbers [2, 3, 4, 5]
-═══════════════════════════════════════════════════════════════════════════════
-
-### Final Answer
-
->Step1
-available numbers: [2, 3, 4, 5]
-suggested operation: 5 + 3 = 8
-remaining numbers: [8, 2, 4]
-
->Step2
-available numbers: [8, 2, 4]
-suggested operation: 8 - 2 = 6
-remaining numbers: [6, 4]
-
->Step3
-available numbers: [6, 4]
-suggested operation: 6 * 4 = 24
-remaining numbers: [24]
-
-> Final expression: \boxed{(5 + 3 - 2) * 4}
-
-═══════════════════════════════════════════════════════════════════════════════
-EXAMPLE 2: Numbers [1, 5, 5, 5]
-═══════════════════════════════════════════════════════════════════════════════
-
-### Final Answer
-
->Step1
-available numbers: [1, 5, 5, 5]
-suggested operation: 1 / 5 = 0.2
-remaining numbers: [0.2, 5, 5]
-
->Step2
-available numbers: [0.2, 5, 5]
-suggested operation: 5 - 0.2 = 4.8
-remaining numbers: [4.8, 5]
-
->Step3
-available numbers: [4.8, 5]
-suggested operation: 4.8 * 5 = 24
-remaining numbers: [24]
-
-> Final expression: \boxed{(5 - 1/5) * 5}
+    boxed = r"\boxed{}"
+    base_prompt = f"""
+    You are solving the Game of 24.
 
-═══════════════════════════════════════════════════════════════════════════════
-EXAMPLE 3: Handling Verifier Feedback - Numbers [1, 2, 6, 8]
-═══════════════════════════════════════════════════════════════════════════════
+    You are given four numbers: {a}, {b}, {c}, {d}
 
-### Final Answer
+    Your job is to produce a valid arithmetic expression using:
+    - ALL four numbers exactly once
+    - ONLY +, -, *, /
+    - The expression must evaluate to exactly 24.
 
->Step1
-available numbers: [1, 2, 6, 8]
-suggested operation: 8 / 2 = 4
-remaining numbers: [4, 1, 6]
-
->Step2
-available numbers: [4, 1, 6]
-suggested operation: 4 - 1 = 3
-remaining numbers: [3, 6]
-
-[VERIFIER FEEDBACK for Step 2:
-  ✗ Cannot reach 24 from remaining numbers [3, 6]. This path is a dead end.
-The previous steps are correct. Please provide a corrected Step 2 and continue.]
-
->Step2
-available numbers: [4, 1, 6]
-suggested operation: 6 - 1 = 5
-remaining numbers: [5, 4]
-
-[VERIFIER FEEDBACK for Step 2:
-  ✗ Cannot reach 24 from remaining numbers [4, 5]. This path is a dead end.
-The previous steps are correct. Please provide a corrected Step 2 and continue.]
-
->Step2
-available numbers: [4, 1, 6]
-suggested operation: 6 * 1 = 6
-remaining numbers: [6, 4]
-
->Step3
-available numbers: [6, 4]
-suggested operation: 6 * 4 = 24
-remaining numbers: [24]
-
-> Final expression: \boxed{(8 / 2) * 6 * 1}
-
-═══════════════════════════════════════════════════════════════════════════════
-
-Now solve the following Game of 24 problem using the EXACT same format."""
-
-    user_prompt = f"""
-Numbers: {a}, {b}, {c}, {d}
-
-Find an arithmetic expression using these four numbers exactly once each with +, -, *, / that equals 24.
-
-Use the structured step-by-step format shown in the examples above."""
-
-    # Combine into a single prompt
-    full_prompt = f"{system_prompt}\n\n{user_prompt}"
-    
-    return full_prompt
+    Please reason step by step, and put your final answer containing only the expression within {boxed}.""".strip()
+    return base_prompt
 
 
 def count_tokens(text: str, tokenizer) -> int:
@@ -342,6 +216,8 @@ def evaluate_game24_answer(answer, nums):
     parser.add_argument("--monitor", "-m", default = True, action="store_true", help="Enable step-by-step monitor")
     parser.add_argument("--num_examples", "-n", type=int, default=1, help="Number of examples to run")
     parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logs")
+    parser.add_argument("--thinking_verify", "-tv", action="store_true", help="Enable thinking-phase step verification (verify during <think> trace)")
+    parser.add_argument("--newline_threshold", type=int, default=15, help="Number of newlines in thinking before forcing step verification (used with --thinking_verify)")
     parser.add_argument("--main_model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
     parser.add_argument("--earlystop_model", type=str, default=EARLYSTOP_MODEL, help="Model to use for early stopping")
     args = parser.parse_args()
@@ -382,9 +258,11 @@ def evaluate_game24_answer(answer, nums):
     logger.info("Tokenizer loaded successfully.")
 
     num_correct = 0
+    num_attempted = 0  # examples where a \boxed{} answer was produced
     N = args.num_examples
     total_reasoning_tokens = 0
     reasoning_token_counts = []
+    per_example_results = []  # list of dicts for CSV
 
     # total = len(dataset)
     indices = np.linspace(0, len(dataset)-1, N, dtype=int)
@@ -393,14 +271,22 @@ def evaluate_game24_answer(answer, nums):
         example = dataset[idx]
         nums = example["numbers"]
 
-        prompt = build_meta_prompt_from_example(nums)
+        prompt = build_prompt(nums)
+        full_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
 
         if args.monitor:
-            # Use StepVerifierGame24Monitor to detect when equation stabilizes k times
-            monitors=(StepVerifierGame24Monitor(
-                name="game24_kstable",
-                answer_start_token = "</think>",
-                original_numbers=nums,  # Validate equations use exactly these numbers
+            # ThinkingPhaseStepVerifierGame24Monitor handles both cases:
+            # - With --thinking_verify: also verifies during the <think> phase
+            # - Without: only injects structured prompt after </think> and verifies steps
+            threshold = args.newline_threshold if args.thinking_verify else 999999
+            monitors=(ThinkingPhaseStepVerifierGame24Monitor(
+                name="game24_verifier",
+                original_numbers=nums,
+                llm_server=llm_server,
+                prompt=full_prompt,
+                newline_threshold=threshold,
+                max_corrections=5,
+                answer_start_token="</think>",
             ),)
         else:
             monitors = ()
@@ -410,7 +296,7 @@ def evaluate_game24_answer(answer, nums):
         logger.info(f"Numbers: {nums}")
 
         answer = asyncio.run(stream_completion(
-            f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n",
+            full_prompt,
             llm_server=llm_server,
             monitors=monitors,
             add_delay=False,
@@ -427,6 +313,10 @@ def evaluate_game24_answer(answer, nums):
         logger.info(f"Generated tokens in this example: {reasoning_tokens}")
 
         is_correct, expr, message = evaluate_game24_answer(answer, nums)
+        # "attempted" = model produced a real \boxed{} answer (not "no solution")
+        attempted = (expr is not None and expr.strip().lower() != "no solution")
+        if attempted:
+            num_attempted += 1
         
         if expr:
             logger.info(f"Extracted expression: {expr}")
@@ -435,33 +325,60 @@ def evaluate_game24_answer(answer, nums):
         if is_correct:
             num_correct += 1
 
+        per_example_results.append({
+            "index": int(idx),
+            "numbers": str(nums),
+            "expression": expr if expr else "",
+            "correct": is_correct,
+            "attempted": attempted,
+            "tokens": reasoning_tokens,
+            "message": message,
+        })
+
     # Calculate final statistics
     avg_reasoning_tokens = total_reasoning_tokens / N if N > 0 else 0
     accuracy = num_correct / N if N > 0 else 0
+    soundness = num_correct / num_attempted if num_attempted > 0 else 0  # correct / attempted
     
     print(f"\nFinal Accuracy: {num_correct}/{N} ({accuracy:.2%})")
+    print(f"Soundness: {num_correct}/{num_attempted} ({soundness:.2%})")
     print(f"Average Reasoning Tokens: {avg_reasoning_tokens:.2f}")
     print(f"Total Reasoning Tokens: {total_reasoning_tokens}")
+
+    # Save per-example CSV
+    csv_file = os.path.join(output_dirs["csv_saved"], f"results_{N}examples.csv")
+    with open(csv_file, 'w', newline='') as f:
+        writer = csv.DictWriter(f, fieldnames=["index", "numbers", "expression", "correct", "attempted", "tokens", "message"])
+        writer.writeheader()
+        writer.writerows(per_example_results)
+    logger.info(f"Per-example CSV saved to {csv_file}")
     
-    # Save results to a text file
+    # Save results summary to a text file
     results_file = logfile.replace('.log', '_results.txt')
     with open(results_file, 'w') as f:
         f.write(f"Game of 24 Evaluation Results\n")
         f.write(f"{'='*50}\n\n")
         f.write(f"Model: {main_model}\n")
         f.write(f"Number of Examples: {N}\n")
-        f.write(f"Monitor Enabled: {args.monitor}\n\n")
+        f.write(f"Monitor Enabled: {args.monitor}\n")
+        f.write(f"Thinking Phase Verify: {args.thinking_verify}\n")
+        if args.thinking_verify:
+            f.write(f"Newline Threshold: {args.newline_threshold}\n")
+        f.write(f"\n")
         f.write(f"Results:\n")
         f.write(f"---------\n")
         f.write(f"Correct: {num_correct}/{N}\n")
-        f.write(f"Accuracy: {accuracy:.2%}\n\n")
-        f.write(f"Reasoning Token Statistics:\n")
+        f.write(f"Accuracy: {accuracy:.2%}\n")
+        f.write(f"Attempted (produced \\boxed answer): {num_attempted}/{N}\n")
+        f.write(f"Soundness (correct/attempted): {soundness:.2%}\n\n")
+        f.write(f"Token Statistics:\n")
         f.write(f"---------------------------\n")
-        f.write(f"Total Reasoning Tokens: {total_reasoning_tokens}\n")
-        f.write(f"Average Reasoning Tokens: {avg_reasoning_tokens:.2f}\n")
+        f.write(f"Total Tokens: {total_reasoning_tokens}\n")
+        f.write(f"Average Tokens: {avg_reasoning_tokens:.2f}\n")
         if reasoning_token_counts:
-            f.write(f"Min Reasoning Tokens: {min(reasoning_token_counts)}\n")
-            f.write(f"Max Reasoning Tokens: {max(reasoning_token_counts)}\n")
+            f.write(f"Median Tokens: {float(np.median(reasoning_token_counts)):.0f}\n")
+            f.write(f"Min Tokens: {min(reasoning_token_counts)}\n")
+            f.write(f"Max Tokens: {max(reasoning_token_counts)}\n")
             f.write(f"Std Dev: {np.std(reasoning_token_counts):.2f}\n")
     
     logger.info(f"Results saved to {results_file}")
diff --git a/interwhen/monitors/__init__.py b/interwhen/monitors/__init__.py
index 59bbe1d0..b198994a 100644
--- a/interwhen/monitors/__init__.py
+++ b/interwhen/monitors/__init__.py
@@ -1,4 +1,5 @@
 from .textReplacement import SimpleTextReplaceMonitor
 from .k_stable import KstableAnswerMCQMonitor, KstableAnswerGame24Monitor
 from .stepVerifier import StepVerifierGame24Monitor, StepVerifierMazeMonitor, StepVerifierSpatialMapMonitor
-from .earlyStopping import EATMonitor, DEERMonitor
\ No newline at end of file
+from .earlyStopping import EATMonitor, DEERMonitor
+from .thinkingPhaseVerifier import ThinkingPhaseStepVerifierGame24Monitor
\ No newline at end of file
diff --git a/interwhen/monitors/thinkingPhaseVerifier.py b/interwhen/monitors/thinkingPhaseVerifier.py
new file mode 100644
index 00000000..2deb0497
--- /dev/null
+++ b/interwhen/monitors/thinkingPhaseVerifier.py
@@ -0,0 +1,573 @@
+"""
+Thinking Phase Step Verifier for Game of 24.
+
+This monitor verifies Game-of-24 solutions by injecting a structured-output
+prompt after ``</think>`` — whether that ``</think>`` was forced by us (during
+the thinking phase) or produced naturally by the model.
+
+Workflow
+--------
+A) **DURING the thinking phase** (inside ``<think>...</think>``):
+   After every *N* newlines in the thinking trace:
+   1. Inject ``\\n</think>\\n`` + a *structured output prompt* that asks the
+      model to list its current solution as verified steps.
+   2. Stream from the vLLM server to collect those steps.
+   3. Verify each step with the existing ``verify_step`` utilities.
+   4. If **wrong** -> remove the injected ``</think>`` + prompt + generated
+      steps, append ``Wait, that approach is wrong. ...`` inside the thinking
+      trace, and let the model keep thinking.
+   5. If **correct** -> keep ``</think>`` + the structured prompt, let
+      ``stream_completion`` recurse so the model finishes (Phase B verifies
+      each step as it streams).
+
+B) **AFTER a natural ``</think>``**:
+   Inject the same structured output prompt so the model outputs its steps
+   in the verifiable format. Then behave identically to
+   ``StepVerifierGame24Monitor`` — verify each step as it appears and give
+   ``[VERIFIER FEEDBACK ...]`` on the first error so the model retries.
+"""
+
+import re
+import json
+import logging
+from typing import List, Optional, Tuple, Dict, Any
+from copy import deepcopy
+
+import httpx
+
+from .base import VerifyMonitor
+from ..utils.game24_verifier import (
+    parse_step, verify_step, format_feedback, can_reach_24, format_number
+)
+
+logger = logging.getLogger(__name__)
+
+# ──────────────────────────────────────────────────────────────────────
+#  The structured-output prompt injected right after </think>.
+#  It tells the model to present its solution in the step format that
+#  our verifier can parse, AND to put the final answer in \boxed{}.
+# ──────────────────────────────────────────────────────────────────────
+STRUCTURED_OUTPUT_PROMPT = (  # NOTE: the >Step1..3 placeholder blocks below are themselves matched by the monitor's step regexes
+    "Now present your solution step-by-step in the following format. "
+    "Use ALL four numbers exactly once with +, -, *, / to reach 24.\n"
+    "\n"
+    ">Step1\n"
+    "available numbers: [a, b, c, d]\n"
+    "suggested operation: a * b = result1\n"
+    "remaining numbers: [result1, c, d]\n"
+    "\n"
+    ">Step2\n"
+    "available numbers: [result1, c, d]\n"
+    "suggested operation: result1 + c = result2\n"
+    "remaining numbers: [result2, d]\n"
+    "\n"
+    ">Step3\n"
+    "available numbers: [result2, d]\n"
+    "suggested operation: result2 - d = result3\n"
+    "remaining numbers: [result3]\n"
+    "\n"
+    "> Final expression: \\boxed{expression using original numbers}"
+)
+
+
+class ThinkingPhaseStepVerifierGame24Monitor(VerifyMonitor):
+    """
+    Monitor that adds thinking-phase verification on top of the standard
+    StepVerifierGame24Monitor behaviour.
+
+    During thinking: every N newlines -> force ``</think>`` + structured
+        prompt, stream from vLLM, verify.  Roll back on error, commit on
+        success.
+
+    After natural ``</think>``: inject the same structured prompt, then
+        verify each step (identical to StepVerifierGame24Monitor).
+    """
+
+    def __init__(
+        self,
+        name: str,
+        original_numbers: List[int],
+        llm_server: dict,
+        prompt: str,
+        newline_threshold: int = 15,
+        max_corrections: int = 5,
+        answer_start_token: str = "</think>",
+        async_execution: bool = True,
+    ):
+        super().__init__(name)
+        # Puzzle definition, answer marker, and the server/prompt used for side-streams.
+        self.answer_start_token = answer_start_token
+        self.original_numbers = list(map(float, original_numbers))
+        self.prompt = prompt
+        self.llm_server = llm_server
+        # Verification cadence and correction budget.
+        self.async_execution = async_execution
+        self.max_corrections = max_corrections
+        self.newline_threshold = newline_threshold
+        self._think_phase_corrections = 0  # compared against max_corrections in step_extractor
+
+    # ------------------------------------------------------------------
+    #  helpers
+    # ------------------------------------------------------------------
+    def _count_feedback_blocks(self, text: str) -> int:
+        return sum(1 for _ in re.finditer(r'\[VERIFIER FEEDBACK[^\]]*\]', text))  # one per feedback block
+
+    def _is_in_thinking_phase(self, generated_text: str) -> bool:
+        return generated_text.find(self.answer_start_token) == -1  # True until the answer marker shows up
+
+    # ------------------------------------------------------------------
+    #  _get_current_available / _extract_last_step_info
+    #  (identical to StepVerifierGame24Monitor)
+    # ------------------------------------------------------------------
+    def _get_current_available(self, generated_text: str) -> List[float]:  # numbers the step under verification may use
+        if self.answer_start_token not in generated_text:
+            return self.original_numbers.copy()
+
+        text_after_think = generated_text.split(self.answer_start_token)[-1]  # text after the LAST marker
+
+        step_pattern = re.compile(  # groups: 1=step number, 2=available, 3=operation, 4=remaining
+            r'>\s*Step\s*(\d+)\s*\n'
+            r'available\s+numbers?\s*:\s*\[([^\]]+)\]\s*\n'
+            r'suggested\s+operation\s*:\s*([^\n]+?)\s*\n'
+            r'remaining\s+numbers?\s*:\s*\[([^\]]+)\]',
+            re.IGNORECASE,
+        )
+
+        sections = re.split(  # cut the answer text at every [VERIFIER FEEDBACK ...] block
+            r'\[VERIFIER FEEDBACK[^\]]*\]\s*', text_after_think, flags=re.DOTALL
+        )
+        last_section = sections[-1]  # text after the most recent feedback block
+        steps_in_last_section = list(step_pattern.finditer(last_section))
+
+        if not steps_in_last_section:
+            return self.original_numbers.copy()
+
+        last_step = steps_in_last_section[-1]
+        step_num_to_verify = int(last_step.group(1))
+
+        if step_num_to_verify == 1:  # step 1 always starts from the original numbers
+            return self.original_numbers.copy()
+
+        target_step = step_num_to_verify - 1  # we need the previous step's "remaining numbers"
+
+        for step_match in steps_in_last_section[:-1]:  # first: earlier steps of the current section
+            if int(step_match.group(1)) == target_step:
+                try:
+                    return [
+                        float(x.strip())
+                        for x in step_match.group(4).strip().split(',')
+                        if x.strip()
+                    ]
+                except Exception:  # non-numeric list entry: ignore this match and keep scanning
+                    pass
+
+        for section in reversed(sections[:-1]):  # then: sections before earlier feedback, newest first
+            for step_match in reversed(list(step_pattern.finditer(section))):
+                if int(step_match.group(1)) == target_step:
+                    try:
+                        return [
+                            float(x.strip())
+                            for x in step_match.group(4).strip().split(',')
+                            if x.strip()
+                        ]
+                    except Exception:  # same best-effort skip as above
+                        pass
+
+        return self.original_numbers.copy()  # no parsable previous step found
+
+    def _extract_last_step_info(self, generated_text: str):  # returns (step_num, parse_step(...) result) or (None, None)
+        if self.answer_start_token not in generated_text:
+            return None, None
+
+        text_after_think = generated_text.split(self.answer_start_token)[-1]  # text after the LAST marker
+        sections = re.split(
+            r'\[VERIFIER FEEDBACK[^\]]*\]\s*', text_after_think, flags=re.DOTALL
+        )
+        text = sections[-1]  # only steps written after the most recent feedback block are considered
+
+        step_pattern = re.compile(  # groups: 1=whole block, 2=step number, 3=available, 4=operation, 5=remaining
+            r'(>\s*Step\s*(\d+)\s*\n'
+            r'available\s+numbers?\s*:\s*\[([^\]]+)\]\s*\n'
+            r'suggested\s+operation\s*:\s*([^\n]+?)\s*\n'
+            r'remaining\s+numbers?\s*:\s*\[([^\]]+)\])',
+            re.IGNORECASE,
+        )
+        all_steps = list(step_pattern.finditer(text))
+        if not all_steps:
+            return None, None
+
+        last_step = all_steps[-1]
+        step_num = int(last_step.group(2))
+        step_text = (  # rebuild the block in a fixed layout before handing it to parse_step
+            f">Step{step_num}\n"
+            f"available numbers: [{last_step.group(3).strip()}]\n"
+            f"suggested operation: {last_step.group(4).strip()}\n"
+            f"remaining numbers: [{last_step.group(5).strip()}]"
+        )
+        return step_num, parse_step(step_text)
+
+    def _count_complete_steps(self, text: str) -> int:
+        """Count the fully formed step blocks (header plus its three fields) in ``text``."""
+        complete_step = re.compile(
+            r'>\s*Step\s*\d+\s*\n'
+            r'available\s+numbers?\s*:\s*\[([^\]]+)\]\s*\n'
+            r'suggested\s+operation\s*:\s*([^\n]+?)\s*\n'
+            r'remaining\s+numbers?\s*:\s*\[([^\]]+)\]',
+            re.IGNORECASE,
+        )
+        return sum(1 for _ in complete_step.finditer(text))
+
+    # ------------------------------------------------------------------
+    #  _stream_and_verify_steps
+    # ------------------------------------------------------------------
+    async def _stream_and_verify_steps(self, text_so_far: str):
+        """
+        Stream from the vLLM server with ``prompt + text_so_far`` (which
+        already ends with the structured output prompt).
+
+        As each complete step block appears, verify it immediately.
+        - If a step is WRONG -> stop streaming, return the error info.
+        - If no step fails before the stream ends -> return full text (valid).
+
+        Returns:
+            (full_text, is_all_valid, error_info_or_None); error_info has step_num/errors/available.
+        """
+        payload = deepcopy(self.llm_server["payload"])  # copy so the shared server config is not mutated
+        payload["prompt"] = self.prompt + text_so_far
+        payload["max_tokens"] = min(payload.get("max_tokens", 2048), 2048)  # cap the side-stream at 2048 tokens
+
+        generated = ""
+        last_verified_step_count = 0
+
+        async with httpx.AsyncClient(timeout=None) as client:  # timeout=None: no httpx timeout is applied
+            async with client.stream(  # NOTE(review): HTTP status is never checked; a body without "data: " lines yields no steps and reaches the "valid" return below
+                "POST",
+                self.llm_server["url"],
+                headers=self.llm_server["headers"],
+                json=payload,
+            ) as response:
+                async for line in response.aiter_lines():
+                    if line.startswith("data: "):  # lines without this prefix are ignored
+                        data = line[len("data: "):].strip()
+                        if data == "[DONE]":
+                            break
+                        chunk = json.loads(data)["choices"][0]["text"]  # reads choices[0].text from each event
+                        generated += chunk
+                        logger.debug(f"[vLLM side-stream] chunk: {chunk!r}")
+
+                        # Check if a new complete step appeared
+                        current_step_count = self._count_complete_steps(generated)  # counts only side-streamed text, not text_so_far
+                        if current_step_count > last_verified_step_count:
+                            full_text = text_so_far + generated
+                            step_num, parsed = self._extract_last_step_info(full_text)
+
+                            if (step_num is not None
+                                    and parsed is not None
+                                    and parsed.get('available_numbers') is not None):
+                                current_available = self._get_current_available(full_text)
+                                is_valid, errors, new_available = verify_step(
+                                    parsed, current_available,
+                                    self.original_numbers, step_num,
+                                )
+
+                                if not is_valid:
+                                    logger.info(
+                                        f"[ThinkingPhaseVerifier] Side-stream: "
+                                        f"Step {step_num} FAILED: {errors}"
+                                    )
+                                    return (  # early exit: the rest of the stream is not consumed
+                                        full_text,
+                                        False,
+                                        {"step_num": step_num,
+                                         "errors": errors,
+                                         "available": current_available},
+                                    )
+                                else:
+                                    logger.info(
+                                        f"[ThinkingPhaseVerifier] Side-stream: "
+                                        f"Step {step_num} verified OK"
+                                    )
+
+                            last_verified_step_count = current_step_count  # advanced even when the step could not be parsed
+
+        full_text = text_so_far + generated
+        logger.info(
+            f"[ThinkingPhaseVerifier] Side-stream finished. "
+            f"Generated {len(generated)} chars, "
+            f"{last_verified_step_count} steps verified."
+        )
+        return full_text, True, None  # also reached when zero complete steps were produced
+
+    # ------------------------------------------------------------------
+    #  step_extractor
+    # ------------------------------------------------------------------
+    def step_extractor(self, chunk: str, generated_text: str):
+        """Decide whether verify() should run; returns ``(trigger, text_or_None)``.
+        Phase 1 (thinking): trigger when a chunk ends with a newline and the
+            total newline count is a positive multiple of ``newline_threshold``
+            (a non-positive threshold or an exhausted correction budget disables it).
+        Phase 2 (after </think>): trigger when the structured prompt has not been
+            injected yet, or when the newest complete step block has no later step header.
+        """
+        # ===== PHASE 1: still inside <think> =====
+        if self._is_in_thinking_phase(generated_text):
+            # A non-positive threshold disables thinking-phase checks (and avoids a modulo by zero).
+            if self.newline_threshold <= 0 or self._think_phase_corrections >= self.max_corrections:
+                return False, None
+            total_newlines = generated_text.count('\n')
+
+            if chunk.endswith('\n') and total_newlines > 0 and total_newlines % self.newline_threshold == 0:
+                logger.info(
+                    f"[ThinkingPhaseVerifier] Total newlines={total_newlines}, "
+                    f"hit multiple of N={self.newline_threshold}. "
+                    f"Forcing step generation."
+                )
+                return True, generated_text
+
+            return False, None
+
+        # ===== PHASE 2: after </think> =====
+
+        # Sub-case 2a: </think> is present but structured prompt is not
+        # -> trigger so verify() can inject it.
+        if STRUCTURED_OUTPUT_PROMPT not in generated_text:
+            logger.info(
+                "[ThinkingPhaseVerifier] </think> present but structured "
+                "output prompt missing - will inject it."
+            )
+            return True, generated_text
+
+        # Sub-case 2b: structured prompt already injected -> detect
+        # complete steps for verification (same as StepVerifierGame24Monitor).
+        think_end_pos = generated_text.find(self.answer_start_token) + len(self.answer_start_token)  # just past the FIRST marker
+        text_after_think = generated_text[think_end_pos:]
+
+        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
+        last_feedback_end = 0
+        for match in feedback_pattern.finditer(text_after_think):
+            last_feedback_end = match.end()  # ends up at the end of the last feedback block
+
+        text = text_after_think[last_feedback_end:]  # only text after the most recent feedback is scanned
+        text_start_in_generated = think_end_pos + last_feedback_end
+
+        step_pattern = re.compile(
+            r'(>\s*Step\s*(\d+)\s*\n'
+            r'available\s+numbers?\s*:\s*\[([^\]]+)\]\s*\n'
+            r'suggested\s+operation\s*:\s*([^\n]+?)\s*\n'
+            r'remaining\s+numbers?\s*:\s*\[([^\]]+)\])',
+            re.IGNORECASE,
+        )
+        all_steps = list(step_pattern.finditer(text))
+        if not all_steps:
+            return False, None
+
+        last_complete_step = all_steps[-1]
+
+        # Already moved past this step?
+        text_after_last_step = text[last_complete_step.end():]
+        if re.search(r'>\s*Step\s*\d+', text_after_last_step, re.IGNORECASE):
+            return False, None
+
+        end_pos = text_start_in_generated + last_complete_step.end()
+        return True, generated_text[:end_pos]  # text truncated right after the newest complete step
+
+    # ------------------------------------------------------------------
+    #  verify
+    # ------------------------------------------------------------------
+    async def verify(self, step: str, token_index: int, event, event_info):
+        """
+        Case 1 - still in thinking (no </think> in step):
+            Inject </think> + structured prompt, stream from vLLM to get
+            steps, verify them, then either rollback (wrong) or commit
+            (correct).
+
+        Case 2a - natural </think> just appeared, structured prompt not
+            yet injected:
+            Signal fix() to append the structured output prompt.
+
+        Case 2b - after </think> + structured prompt already injected:
+            Identical to StepVerifierGame24Monitor - verify each step.
+        """
+
+        # ==================================================================
+        # CASE 1: Thinking phase
+        # ==================================================================
+        if self.answer_start_token not in step:
+            logger.info(
+                "[ThinkingPhaseVerifier] Injecting </think> + structured "
+                "prompt and streaming steps from vLLM inside verify()"
+            )
+
+            # Build text with injected </think> + structured prompt
+            text_with_think_end = (
+                step + "\n" + self.answer_start_token + "\n"
+                + STRUCTURED_OUTPUT_PROMPT + "\n"
+            )
+
+            # Stream from vLLM, verifying each step as it appears
+            full_text, is_all_valid, error_info = await self._stream_and_verify_steps(
+                text_with_think_end
+            )
+
+            if is_all_valid:
+                # All steps correct -> inject </think> + structured prompt
+                # and let stream_completion recurse so the model generates
+                # verified steps that Phase 2b checks.
+                logger.info(
+                    "[ThinkingPhaseVerifier] All side-streamed steps verified OK "
+                    "- injecting </think> + structured prompt"
+                )
+                if not event.is_set():
+                    event_info["generated_text"] = step
+                    event_info["feedback"] = self.answer_start_token
+                    event_info["correction_index"] = token_index
+                    event_info["phase"] = "inject_think_end"
+                    event.set()
+                return step, self.answer_start_token
+
+            else:
+                # Step is WRONG -> rollback into thinking
+                errors = error_info["errors"]
+                step_num = error_info["step_num"]
+                logger.info(
+                    f"[ThinkingPhaseVerifier] Step {step_num} FAILED: {errors}"
+                )
+                error_summary = "; ".join(errors)
+                thinking_feedback = (
+                    f"\n\nWait, that approach is wrong. {error_summary}. "
+                    f"Let me reconsider and try a different approach.\n"
+                )
+                if not event.is_set():
+                    event_info["generated_text"] = step
+                    event_info["feedback"] = thinking_feedback
+                    event_info["correction_index"] = token_index
+                    event_info["errors"] = errors
+                    event_info["failed_step"] = step_num
+                    event_info["phase"] = "rollback_to_thinking"
+                    event.set()
+                return step, thinking_feedback
+
+        # ==================================================================
+        # CASE 2a: </think> present but structured prompt missing
+        # ==================================================================
+        if STRUCTURED_OUTPUT_PROMPT not in step:
+            logger.info(
+                "[ThinkingPhaseVerifier] </think> present but structured "
+                "prompt missing -> injecting it"
+            )
+            structured_prompt_text = "\n" + STRUCTURED_OUTPUT_PROMPT + "\n"
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = structured_prompt_text
+                event_info["correction_index"] = token_index
+                event_info["phase"] = "inject_structured_prompt"
+                event.set()
+            return step, structured_prompt_text
+
+        # ==================================================================
+        # CASE 2b: After </think> + structured prompt - standard verify
+        # ==================================================================
+
+        # ---- max-corrections guard ----
+        num_corrections = (
+            self._count_feedback_blocks(step)
+            + self._think_phase_corrections
+        )
+        if num_corrections >= self.max_corrections:
+            fb = "\nthe answer is \\boxed{no solution}"
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = fb
+                event_info["correction_index"] = token_index
+                event_info["errors"] = ["Max corrections reached"]
+                event_info["failed_step"] = None
+                event.set()
+            return step, fb
+
+        # ---- extract & verify step ----
+        step_num, parsed = self._extract_last_step_info(step)
+        if step_num is None or parsed is None or parsed.get('available_numbers') is None:
+            return step, None
+
+        current_available = self._get_current_available(step)
+        is_valid, errors, new_available = verify_step(
+            parsed, current_available, self.original_numbers, step_num
+        )
+
+        if is_valid:
+            return step, None
+
+        # ---- step has errors -> standard feedback ----
+        logger.info(f"[ThinkingPhaseVerifier] Step {step_num} FAILED: {errors}")
+        feedback = format_feedback(errors, step_num, current_available)
+        if not event.is_set():
+            event_info["generated_text"] = step
+            event_info["feedback"] = feedback
+            event_info["correction_index"] = token_index
+            event_info["errors"] = errors
+            event_info["failed_step"] = step_num
+            event_info["phase"] = "standard_verify"
+            event.set()
+        return step, feedback
+
+    # ------------------------------------------------------------------
+    #  fix
+    # ------------------------------------------------------------------
+    async def fix(self, generated_text: str, event_info: dict, fix_method=None):
+        """
+        Applies the appropriate fix depending on the phase:
+
+        inject_think_end
+            Append ``</think>`` + structured output prompt so the model
+            regenerates the steps naturally.
+
+        rollback_to_thinking
+            Keep only the thinking text from before the inject point
+            (right-stripped) and append the ``Wait ...`` feedback to it.
+
+        inject_structured_prompt
+            Append the structured output prompt after a natural
+            ``</think>`` (no rollback needed).
+
+        standard_verify
+            Append ``[VERIFIER FEEDBACK ...]`` (same as
+            StepVerifierGame24Monitor).
+        """
+        phase = event_info.get("phase", "standard_verify")
+
+        if phase == "inject_think_end":
+            logger.info(
+                "[ThinkingPhaseVerifier] fix(): injecting </think> + "
+                "structured prompt"
+            )
+            return (
+                event_info["generated_text"]
+                + "\n" + self.answer_start_token + "\n"
+                + STRUCTURED_OUTPUT_PROMPT + "\n"
+            )
+
+        if phase == "rollback_to_thinking":
+            logger.info("[ThinkingPhaseVerifier] fix(): rolling back into thinking")
+
+            base_text = event_info["generated_text"]
+            result = base_text.rstrip() + event_info["feedback"]
+
+            # Count this rollback towards the max_corrections budget
+            self._think_phase_corrections += 1
+
+            logger.info(
+                f"[ThinkingPhaseVerifier] Rolled back. "
+                f"Think-phase corrections: {self._think_phase_corrections}/{self.max_corrections}"
+            )
+            return result
+
+        if phase == "inject_structured_prompt":
+            logger.info(
+                "[ThinkingPhaseVerifier] fix(): appending structured "
+                "output prompt after natural </think>"
+            )
+            return event_info["generated_text"] + event_info["feedback"]
+
+        # standard_verify
+        logger.info("[ThinkingPhaseVerifier] fix(): standard step feedback")
+        return event_info["generated_text"] + event_info["feedback"]

From 9a22a65fd794540d291c59059e94e849c31e124b Mon Sep 17 00:00:00 2001
From: root
 <root@hp-reasoning-t-vishakbhat-interwhen-8gpus-trainer-0.hp-reasoning-t-vishakbhat-interwhen-8gpus.bonete11.svc.cluster.local>
Date: Thu, 19 Feb 2026 03:10:47 +0000
Subject: [PATCH 02/17] thinking trace verification

---
 .../game24_stepverifier.py                    |   12 +-
 .../TTSwithVerification/maze_stepverifier.py  |  258 +--
 interwhen/interject.py                        |    2 +
 interwhen/monitors/__init__.py                |    2 +-
 interwhen/monitors/thinkingPhaseVerifier.py   | 1638 +++++++++++++----
 interwhen/utils/maze_verifier.py              |   58 +-
 6 files changed, 1498 insertions(+), 472 deletions(-)

diff --git a/examples/TTSwithVerification/game24_stepverifier.py b/examples/TTSwithVerification/game24_stepverifier.py
index b23c88fb..94653f5c 100644
--- a/examples/TTSwithVerification/game24_stepverifier.py
+++ b/examples/TTSwithVerification/game24_stepverifier.py
@@ -1,14 +1,12 @@
 import argparse
 import asyncio
 import csv
-import json
 import logging
 import os
 import re
 import numpy as np
 
 from datasets import load_dataset
-from openai import OpenAI
 from transformers import AutoTokenizer
 
 from interwhen import stream_completion
@@ -214,10 +212,11 @@ def evaluate_game24_answer(answer, nums):
     parser = argparse.ArgumentParser(description="Game of 24 step-by-step solver with monitors")
     parser.add_argument("--thinking", "-t", action="store_true", help="Enable chain-of-thought output")
     parser.add_argument("--monitor", "-m", default = True, action="store_true", help="Enable step-by-step monitor")
-    parser.add_argument("--num_examples", "-n", type=int, default=1, help="Number of examples to run")
+    parser.add_argument("--num_examples", "-n", type=int, default=1362, help="Number of examples to run")
     parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logs")
-    parser.add_argument("--thinking_verify", "-tv", action="store_true", help="Enable thinking-phase step verification (verify during <think> trace)")
-    parser.add_argument("--newline_threshold", type=int, default=15, help="Number of newlines in thinking before forcing step verification (used with --thinking_verify)")
+    parser.add_argument("--thinking_verify", "-tv", action="store_true", default = True, help="Enable thinking-phase step verification (verify during <think> trace)")
+    parser.add_argument("--newline_threshold", type=int, default=10, help="Number of newlines in thinking before forcing step verification (used with --thinking_verify)")
+    parser.add_argument("--warmup", type=int, default=0, help="Number of \\n\\n to skip before starting side-chain verification (warmup period)")
     parser.add_argument("--main_model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
     parser.add_argument("--earlystop_model", type=str, default=EARLYSTOP_MODEL, help="Model to use for early stopping")
     args = parser.parse_args()
@@ -267,7 +266,7 @@ def evaluate_game24_answer(answer, nums):
     # total = len(dataset)
     indices = np.linspace(0, len(dataset)-1, N, dtype=int)
 
-    for idx in indices: #for idx in indices:
+    for idx in indices:
         example = dataset[idx]
         nums = example["numbers"]
 
@@ -287,6 +286,7 @@ def evaluate_game24_answer(answer, nums):
                 newline_threshold=threshold,
                 max_corrections=5,
                 answer_start_token="</think>",
+                warmup_newlines=args.warmup,
             ),)
         else:
             monitors = ()
diff --git a/examples/TTSwithVerification/maze_stepverifier.py b/examples/TTSwithVerification/maze_stepverifier.py
index ebeb1d9b..cb9f386d 100644
--- a/examples/TTSwithVerification/maze_stepverifier.py
+++ b/examples/TTSwithVerification/maze_stepverifier.py
@@ -1,11 +1,15 @@
 """
-Maze experiment with step-by-step verification using StepVerifierMazeMonitor.
+Maze experiment with thinking-phase step verification.
 
-Uses the new monitor-based architecture that integrates with stream_completion.
+Uses ThinkingPhaseStepVerifierMazeMonitor which:
+  - Verifies the model's traced path during <think> via side-streams
+  - Injects a structured step format after </think> (no meta-prompt needed)
+  - Verifies each step as the model fills in the structured template
 """
 
 import argparse
 import asyncio
+import csv
 import json
 import logging
 import os
@@ -17,7 +21,7 @@
 from transformers import AutoTokenizer
 
 from interwhen import stream_completion
-from interwhen.monitors import StepVerifierMazeMonitor
+from interwhen.monitors import ThinkingPhaseStepVerifierMazeMonitor
 from interwhen.utils.maze_verifier import parse_maze_from_prompt
 
 logging.basicConfig(level=logging.INFO, format='%(message)s')
@@ -43,6 +47,7 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/MazeResu
     dirs = {
         "base": output_base,
         "reasoning": os.path.join(output_base, "Reasoning_output"),
+        "csv_saved": os.path.join(output_base, "csv_saved"),
     }
     
     for dir_path in dirs.values():
@@ -51,108 +56,16 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/MazeResu
     return dirs
 
 
-def build_meta_prompt_from_example(example):
-    """Build prompt for maze example."""
-    system_prompt = """You are a maze-solving AI. Given a maze in ASCII format, analyze it step by step.
-
-COORDINATE SYSTEM:
-- Rows are numbered from top (row 0) to bottom
-- Columns are numbered from left (col 0) to right
-- Movement: UP (row decreases), DOWN (row increases), LEFT (col decreases), RIGHT (col increases)
-
-TURN DEFINITIONS:
-- RIGHT_TURN = 90° clockwise change (e.g., DOWN→LEFT, LEFT→UP, UP→RIGHT, RIGHT→DOWN)
-- LEFT_TURN = 90° counterclockwise change (e.g., DOWN→RIGHT, RIGHT→UP, UP→LEFT, LEFT→DOWN)
-
-RELATIVE POSITION DEFINITIONS:
-- "directly to the left" = same row, E has smaller column than S
-- "directly to the right" = same row, E has larger column than S
-- "directly above" = same column, E has smaller row than S
-- "directly below" = same column, E has larger row than S
-- "top left" = E has smaller row AND smaller column than S
-- "top right" = E has smaller row AND larger column than S
-- "bottom left" = E has larger row AND smaller column than S
-- "bottom right" = E has larger row AND larger column than S
-
-IMPORTANT: Follow the EXACT output format below. Do NOT use <think> tags.
-
-EXAMPLE 1: Counting Right Turns
-Question: How many right turns are there in the path from S to E?
-
->>> LOCATE START AND EXIT:
-    S position: (3,5)
-    E position: (1,1)
-
->>> STEP 1: Move DOWN from (3,5) to (4,5)
-    Current position: (4,5)
-    Previous direction: —
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 2: Move DOWN from (4,5) to (5,5)
-    Current position: (5,5)
-    Previous direction: DOWN
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 3: Move LEFT from (5,5) to (5,4)
-    Current position: (5,4)
-    Previous direction: DOWN
-    Current direction: LEFT
-    Turn type: RIGHT_TURN
-    Running count: Right=1, Left=0
-
->>> FINAL ANSWER: Right turns = 2
-    \\boxed{C}
-
-EXAMPLE 2: Counting Total Turns
-Question: How many total turns are there in the path from S to E?
-
->>> LOCATE START AND EXIT:
-    S position: (3,5)
-    E position: (1,1)
-
->>> STEP 1: Move DOWN from (3,5) to (4,5)
-    Current position: (4,5)
-    Previous direction: —
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0, Total=0
-
-[... continue for all steps ...]
-
->>> FINAL ANSWER: Total turns = 2
-    \\boxed{C}
-
-EXAMPLE 3: Relative Position
-Question: Is the exit (E) to the top left of the starting point (S)?
-
->>> LOCATE START AND EXIT:
-    S position: (3,5)
-    E position: (1,1)
-
->>> COMPARE POSITIONS:
-    Row comparison: E row (1) < S row (3) → E is ABOVE S ✓
-    Col comparison: E col (1) < S col (5) → E is LEFT of S ✓
-
->>> ANALYSIS:
-    E is above S (smaller row): YES
-    E is left of S (smaller col): YES
-    Therefore E is at TOP LEFT of S.
-
->>> ANSWER: YES, E is to the top left of S.
-    \\boxed{A}
-
-════════════════════════════════════════════════════════════════════════════════
-Now solve the following maze using the EXACT same format. First locate S and E, then trace the path step by step."""
-
-    # Get the maze description (trimmed to remove trailing instructions)
+def build_simple_prompt(example):
+    """Build a simple user prompt from the maze example.
+
+    No system / meta prompt is used — the structured step format is
+    injected by the monitor after ``</think>``.
+    """
     description = str(example.get("prompt", ""))
+    # Trim trailing boiler-plate instructions that the dataset appends
     description_trimmed = description[:-143] if len(description) > 143 else description
-    
-    return system_prompt, description_trimmed
+    return description_trimmed
 
 
 def extract_solution(text: str) -> str:
@@ -223,6 +136,31 @@ def save_output(idx: int, output: str, output_dir: str):
         f.write(output)
     logger.info(f"Saved output to {filepath}")
 
+
+def save_prompt(idx, prompt_with_answer, reason_dir):
+    """Save reasoning trace to file."""
+    os.makedirs(reason_dir, exist_ok=True)
+    filename = os.path.join(reason_dir, f"reason_{idx}.txt")
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write(prompt_with_answer)
+    logger.info(f"Saved reasoning trace to {filename}")
+
+
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS/MazeResults") -> str:
+    """Build the log file path from model name and example count (creates the directory)."""
+    model_short_name = get_model_short_name(main_model)
+    output_base = os.path.join(base_dir, model_short_name)
+    os.makedirs(output_base, exist_ok=True)
+    return os.path.join(output_base, f"EAT_{num_examples}examples.log")
+
+
+def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS/MazeResults") -> str:
+    """Build the token CSV path from model name and example count (creates the directory)."""
+    model_short_name = get_model_short_name(main_model)
+    output_base = os.path.join(base_dir, model_short_name)
+    os.makedirs(output_base, exist_ok=True)
+    return os.path.join(output_base, f"EAT_{num_examples}examples.csv")
+
 def evaluate_maze_answer(answer, options, ground_truth):
     """
     Evaluate a Maze MCQ answer and return (is_correct, extracted_answer, message).
@@ -266,7 +204,7 @@ def evaluate_maze_answer(answer, options, ground_truth):
     return False, sol, f"Solution '{sol}' not found in options or ground truth"
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run maze experiments with StepVerifierMazeMonitor")
+    parser = argparse.ArgumentParser(description="Run maze experiments with step verification")
     parser.add_argument("--model", type=str, default=MAIN_MODEL,
                         help="Model name for generation")
     parser.add_argument("--indices", type=str, default=None,
@@ -279,7 +217,15 @@ def evaluate_maze_answer(answer, options, ground_truth):
                         help="Maximum number of correction attempts per example")
     parser.add_argument("--port", type=int, default=8000, help="vLLM server port")
     parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logging")
+    parser.add_argument("--newline_threshold", type=int, default=20,
+                        help="Number of \\n\\n in thinking before triggering side verification")
+    parser.add_argument("--warmup", type=int, default=0,
+                        help="Number of \\n\\n to skip before starting side-chain verification (warmup period)")
     args = parser.parse_args()
+
+    logger.info(f"Thinking-phase verification: always on")
+    logger.info(f"  Newline threshold: {args.newline_threshold}")
+    logger.info(f"  Warmup: {args.warmup}")
     
     if args.debug:
         logging.getLogger().setLevel(logging.DEBUG)
@@ -313,10 +259,13 @@ def evaluate_maze_answer(answer, options, ground_truth):
     total_correct = 0
     total_examples = 0
     total_reasoning_tokens = 0
+    num_attempted = 0  # examples where a \boxed{} answer was produced
+    reasoning_token_counts = []
+    per_example_results = []  # list of dicts for CSV
     
     for idx in indices:
         example = dataset[idx]
-        system_prompt, user_prompt = build_meta_prompt_from_example(example)
+        user_prompt = build_simple_prompt(example)
         if str(example.get("ground_truth", "")).strip() == "Q4":
             target_options = ["A", "B"]
         else:
@@ -325,8 +274,8 @@ def evaluate_maze_answer(answer, options, ground_truth):
         pattern = rf'\b({keys})\.\s*([A-Za-z0-9]+)\b'
         options = dict(re.findall(pattern, user_prompt))
         
-        # Build full prompt
-        full_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n<think>\n"
+        # Build simple prompt — no system/meta prompt; structure injected by monitor
+        full_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n<think>\n"
         
         # Parse maze from prompt
         grid, start_pos, exit_pos = parse_maze_from_prompt(user_prompt)
@@ -336,23 +285,29 @@ def evaluate_maze_answer(answer, options, ground_truth):
             continue
         
         # Detect question type from prompt (auto-detection)
-        # Falls back to index-based if no turn keywords found
-        question_type = StepVerifierMazeMonitor.detect_question_type(user_prompt)
+        question_type = ThinkingPhaseStepVerifierMazeMonitor.detect_question_type(user_prompt)
         
         logger.info(f"\n{'='*60}")
         logger.info(f"Example {idx} ({question_type})")
         logger.info(f"Maze: S={start_pos}, E={exit_pos}, grid={len(grid)}x{len(grid[0]) if grid else 0}")
         logger.info(f"{'='*60}")
         
-        # Create the monitor
-        monitor = StepVerifierMazeMonitor(
-            name="maze_step_verifier",
-            answer_start_token="</think>",
+        # Always use ThinkingPhaseStepVerifierMazeMonitor:
+        # Phase 1 — verifies during <think> via side-streams
+        # Phase 2a — injects structured step format after </think>
+        # Phase 2b — verifies structured output as model fills it in
+        monitor = ThinkingPhaseStepVerifierMazeMonitor(
+            name="maze_thinking_verifier",
             grid=grid,
             start_pos=start_pos,
             exit_pos=exit_pos,
-            max_corrections=args.max_corrections,
+            llm_server=llm_server,
+            prompt=full_prompt,
             question_type=question_type,
+            newline_threshold=args.newline_threshold,
+            max_corrections=args.max_corrections,
+            answer_start_token="</think>",
+            warmup_newlines=args.warmup,
         )
         
         # Run with stream_completion
@@ -371,13 +326,24 @@ def evaluate_maze_answer(answer, options, ground_truth):
             traceback.print_exc()
             continue
         
+        # Save reasoning trace
+        save_prompt(int(idx), answer, reason_dir)
+        logger.info(f"Raw final output:\n{answer}")
+
         # Count generated tokens
         reasoning_tokens = count_tokens(answer, tokenizer)
         total_reasoning_tokens += reasoning_tokens
+        reasoning_token_counts.append(reasoning_tokens)
+        logger.info(f"Generated tokens in this example: {reasoning_tokens}")
         
         gt_sol = str(example.get("ground_truth", "")).strip()
         is_correct, extracted_answer, message = evaluate_maze_answer(answer, options, gt_sol)
         
+        # "attempted" = model produced a real \boxed{} answer (not "no solution")
+        attempted = (extracted_answer is not None and extracted_answer.strip().lower() != "no solution")
+        if attempted:
+            num_attempted += 1
+        
         if extracted_answer:
             logger.info(f"Extracted answer: {extracted_answer}")
         logger.info(message)
@@ -391,17 +357,30 @@ def evaluate_maze_answer(answer, options, ground_truth):
             'idx': int(idx),  # Convert numpy int64 to Python int
             'question_type': question_type,
             'correct': is_correct,
+            'attempted': attempted,
             'sol': extracted_answer,
             'gt': gt_sol,
             'reasoning_tokens': reasoning_tokens,
         }
         results.append(result)
         
-        logger.info(f"Result: sol={extracted_answer}, gt={gt_sol}, correct={is_correct}")
+        per_example_results.append({
+            "index": int(idx),
+            "question_type": question_type,
+            "correct": is_correct,
+            "attempted": attempted,
+            "sol": extracted_answer if extracted_answer else "",
+            "gt": gt_sol,
+            "tokens": reasoning_tokens,
+            "message": message,
+        })
+        
+        logger.info(f"Result: sol={extracted_answer}, gt={gt_sol}, correct={is_correct}, attempted={attempted}")
         logger.info(f"Reasoning tokens: {reasoning_tokens}")
     
     # Compute final metrics
     accuracy = total_correct / total_examples if total_examples > 0 else 0
+    soundness = total_correct / num_attempted if num_attempted > 0 else 0  # correct / attempted
     avg_reasoning_tokens = total_reasoning_tokens / total_examples if total_examples > 0 else 0
     
     logger.info(f"\n{'='*60}")
@@ -409,16 +388,33 @@ def evaluate_maze_answer(answer, options, ground_truth):
     logger.info(f"{'='*60}")
     logger.info(f"Total examples: {total_examples}")
     logger.info(f"Correct: {total_correct}")
+    logger.info(f"Attempted (produced \\boxed answer): {num_attempted}/{total_examples}")
     logger.info(f"Accuracy: {accuracy:.4f} ({total_correct}/{total_examples})")
+    logger.info(f"Soundness: {soundness:.4f} ({total_correct}/{num_attempted})")
     logger.info(f"Total reasoning tokens: {total_reasoning_tokens}")
     logger.info(f"Avg reasoning tokens: {avg_reasoning_tokens:.1f}")
     
+    print(f"\nFinal Accuracy: {total_correct}/{total_examples} ({accuracy:.2%})")
+    print(f"Soundness: {total_correct}/{num_attempted} ({soundness:.2%})")
+    print(f"Average Reasoning Tokens: {avg_reasoning_tokens:.2f}")
+    print(f"Total Reasoning Tokens: {total_reasoning_tokens}")
+    
+    # Save per-example CSV
+    csv_file = os.path.join(output_dirs["csv_saved"], f"results_{total_examples}examples.csv")
+    with open(csv_file, 'w', newline='') as f:
+        writer = csv.DictWriter(f, fieldnames=["index", "question_type", "correct", "attempted", "sol", "gt", "tokens", "message"])
+        writer.writeheader()
+        writer.writerows(per_example_results)
+    logger.info(f"Per-example CSV saved to {csv_file}")
+    
     # Save summary
     summary = {
         'model': args.model,
         'total_examples': total_examples,
         'correct': total_correct,
+        'attempted': num_attempted,
         'accuracy': accuracy,
+        'soundness': soundness,
         'total_reasoning_tokens': total_reasoning_tokens,
         'avg_reasoning_tokens': avg_reasoning_tokens,
         'max_corrections': args.max_corrections,
@@ -428,4 +424,34 @@ def evaluate_maze_answer(answer, options, ground_truth):
     summary_path = os.path.join(output_dirs["base"], "summary.json")
     with open(summary_path, 'w') as f:
         json.dump(summary, f, indent=2)
-    logger.info(f"\nSaved summary to {summary_path}")
\ No newline at end of file
+    logger.info(f"\nSaved summary to {summary_path}")
+    
+    # Save results summary to a text file
+    results_file = os.path.join(output_dirs["base"], f"EAT_{total_examples}examples_results.txt")
+    with open(results_file, 'w') as f:
+        f.write(f"Maze Step Verification Results\n")
+        f.write(f"{'='*50}\n\n")
+        f.write(f"Model: {args.model}\n")
+        f.write(f"Number of Examples: {total_examples}\n")
+        f.write(f"Max Corrections: {args.max_corrections}\n")
+        f.write(f"Newline Threshold: {args.newline_threshold}\n")
+        f.write(f"Warmup: {args.warmup}\n")
+        f.write(f"\n")
+        f.write(f"Results:\n")
+        f.write(f"---------\n")
+        f.write(f"Correct: {total_correct}/{total_examples}\n")
+        f.write(f"Accuracy: {accuracy:.2%}\n")
+        f.write(f"Attempted (produced \\boxed answer): {num_attempted}/{total_examples}\n")
+        f.write(f"Soundness (correct/attempted): {soundness:.2%}\n\n")
+        f.write(f"Token Statistics:\n")
+        f.write(f"---------------------------\n")
+        f.write(f"Total Tokens: {total_reasoning_tokens}\n")
+        f.write(f"Average Tokens: {avg_reasoning_tokens:.2f}\n")
+        if reasoning_token_counts:
+            f.write(f"Median Tokens: {float(np.median(reasoning_token_counts)):.0f}\n")
+            f.write(f"Min Tokens: {min(reasoning_token_counts)}\n")
+            f.write(f"Max Tokens: {max(reasoning_token_counts)}\n")
+            f.write(f"Std Dev: {np.std(reasoning_token_counts):.2f}\n")
+    
+    logger.info(f"Results saved to {results_file}")
+    print(f"Results saved to {results_file}")
\ No newline at end of file
diff --git a/interwhen/interject.py b/interwhen/interject.py
index c8e4fa64..cb4f8e60 100644
--- a/interwhen/interject.py
+++ b/interwhen/interject.py
@@ -71,6 +71,8 @@ async def stream_completion(prompt, prev_text = "", llm_server=None, monitors=[]
         corrected_text = await monitors[0].fix(generated_text, stop_info)
         if stop_info["feedback"] == "\nthe answer is \\boxed{no solution}":
             return corrected_text # No solution found, return no solution ie soundness is 100% is it doesnt pass the verifer
+        if stop_info.get("phase") == "final_answer_correct":
+            return corrected_text  # Expression verified correct, stop generation
         return await stream_completion(prompt, prev_text=corrected_text, llm_server=llm_server, monitors=monitors, add_delay=add_delay, num_calls_index=num_calls_index+1, termination_requires_validation=termination_requires_validation, async_execution=async_execution)
 
     return generated_text
\ No newline at end of file
diff --git a/interwhen/monitors/__init__.py b/interwhen/monitors/__init__.py
index b198994a..daf42436 100644
--- a/interwhen/monitors/__init__.py
+++ b/interwhen/monitors/__init__.py
@@ -2,4 +2,4 @@
 from .k_stable import KstableAnswerMCQMonitor, KstableAnswerGame24Monitor
 from .stepVerifier import StepVerifierGame24Monitor, StepVerifierMazeMonitor, StepVerifierSpatialMapMonitor
 from .earlyStopping import EATMonitor, DEERMonitor
-from .thinkingPhaseVerifier import ThinkingPhaseStepVerifierGame24Monitor
\ No newline at end of file
+from .thinkingPhaseVerifier import ThinkingPhaseStepVerifierGame24Monitor, ThinkingPhaseStepVerifierMazeMonitor
\ No newline at end of file
diff --git a/interwhen/monitors/thinkingPhaseVerifier.py b/interwhen/monitors/thinkingPhaseVerifier.py
index 2deb0497..c1117374 100644
--- a/interwhen/monitors/thinkingPhaseVerifier.py
+++ b/interwhen/monitors/thinkingPhaseVerifier.py
@@ -1,86 +1,209 @@
 """
-Thinking Phase Step Verifier for Game of 24.
+Thinking Phase Verifiers for Game of 24 and Maze.
 
-This monitor verifies Game-of-24 solutions by injecting a structured-output
-prompt after ``</think>`` — whether that ``</think>`` was forced by us (during
-the thinking phase) or produced naturally by the model.
+These monitors verify solutions by forking a side-stream during the thinking
+phase to ask the model about its current progress.
 
-Workflow
---------
+Game of 24 Workflow
+-------------------
 A) **DURING the thinking phase** (inside ``<think>...</think>``):
-   After every *N* newlines in the thinking trace:
-   1. Inject ``\\n</think>\\n`` + a *structured output prompt* that asks the
-      model to list its current solution as verified steps.
-   2. Stream from the vLLM server to collect those steps.
-   3. Verify each step with the existing ``verify_step`` utilities.
-   4. If **wrong** -> remove the injected ``</think>`` + prompt + generated
-      steps, append ``Wait, that approach is wrong. ...`` inside the thinking
-      trace, and let the model keep thinking.
-   5. If **correct** -> keep ``</think>`` + the structured prompt, let
-      ``stream_completion`` recurse so the model finishes (Phase B verifies
-      each step as it streams).
+   After a warmup period, every *N* double-newlines in the thinking trace:
+   1. Inject ``</think> The expression that I found till now is {`` and
+      stream ~20 tokens to extract the expression the model outputs.
+   2. Verify the expression:
+      a. Extract numbers used in the expression.
+      b. Check each number appears in the original numbers (at most once).
+      c. If ALL four numbers are used: evaluate and check == 24.
+      d. If partial: evaluate the sub-expression, collect unused original
+         numbers, check ``can_reach_24([result] + unused)``.
+   3. If **wrong** -> strip the injected text, append
+      ``Wait, <error description>.`` inside the thinking trace and let
+      the model keep thinking.
+   4. If **correct AND complete** (all 4 numbers, equals 24) -> inject
+      ``Wait, the expression <expr> has been verified to equal 24 using
+      all the given numbers. This will be my final answer.`` followed by
+      ``</think>`` and then let the model output the final answer.
+   5. If **correct AND partial** -> no feedback, let the model keep
+      thinking undisturbed.
 
 B) **AFTER a natural ``</think>``**:
-   Inject the same structured output prompt so the model outputs its steps
-   in the verifiable format. Then behave identically to
-   ``StepVerifierGame24Monitor`` — verify each step as it appears and give
-   ``[VERIFIER FEEDBACK ...]`` on the first error so the model retries.
+   Inject ``The final expression is \\boxed`` so the model completes it
+   with its answer expression, then verify it (all four numbers must be
+   used and the value must be 24).  Give feedback on errors so it retries.
+
+Maze Workflow
+-------------
+A) **DURING the thinking phase** (inside ``<think>...</think>``):
+   After a warmup period, every *N* double-newlines in the thinking trace:
+   1. Inject a first-person prompt in the LLM's own voice:
+      ``Let me output the current steps I have traced so far through
+      the maze in the following format:`` + ``<format>...</format>``
+      + ``>>> LOCATE START AND EXIT:``.  Stream ~300 tokens to
+      extract the model's current traced path steps.
+   2. Parse the structured steps and verify each against the maze grid:
+      a. Is the move direction correct (delta matches)?
+      b. Is from_pos the expected position?
+      c. Is to_pos walkable (not a wall)?
+      d. Is the turn type correct?
+      e. Are running counts correct?
+   3. If **errors found** -> strip the injected text, append
+      ``Wait, <error description>.`` and let the model keep thinking.
+   4. If **path reaches E with all steps correct** -> inject early-stop
+      message + ``</think>`` followed by the structured format prompt
+      so the model gives the final answer in the specified format.
+   5. If **partial but correct so far** -> no feedback, keep thinking.
+
+B) **AFTER ``</think>`` (natural or early-stop)**:
+   Phase 2a: Inject the same structured step format template (in the
+   LLM's own voice: ``Let me trace the step by step solution...`` +
+   ``<format>...</format>`` + ``>>> LOCATE START AND EXIT:``) so the
+   model fills it in.
+
+   Phase 2b: Verify each step as the model fills in the template.
+   Once ``\\boxed{}`` appears, stop generation.
 """
 
 import re
 import json
 import logging
-from typing import List, Optional, Tuple, Dict, Any
+from typing import List, Tuple, Optional
 from copy import deepcopy
 
 import httpx
 
 from .base import VerifyMonitor
 from ..utils.game24_verifier import (
-    parse_step, verify_step, format_feedback, can_reach_24, format_number
+    can_reach_24, is_close, format_number, safe_eval,
+)
+from ..utils.maze_verifier import (
+    Direction, parse_direction, get_expected_turn_type,
+    parse_maze_from_prompt, parse_maze_step, verify_maze_step,
+    verify_locate_section, format_maze_feedback, format_locate_feedback,
+    DIRECTION_DELTAS,
 )
 
 logger = logging.getLogger(__name__)
 
-# ──────────────────────────────────────────────────────────────────────
-#  The structured-output prompt injected right after </think>.
-#  It tells the model to present its solution in the step format that
-#  our verifier can parse, AND to put the final answer in \boxed{}.
-# ──────────────────────────────────────────────────────────────────────
-STRUCTURED_OUTPUT_PROMPT = (
-    "Now present your solution step-by-step in the following format. "
-    "Use ALL four numbers exactly once with +, -, *, / to reach 24.\n"
-    "\n"
-    ">Step1\n"
-    "available numbers: [a, b, c, d]\n"
-    "suggested operation: a * b = result1\n"
-    "remaining numbers: [result1, c, d]\n"
-    "\n"
-    ">Step2\n"
-    "available numbers: [result1, c, d]\n"
-    "suggested operation: result1 + c = result2\n"
-    "remaining numbers: [result2, d]\n"
-    "\n"
-    ">Step3\n"
-    "available numbers: [result2, d]\n"
-    "suggested operation: result2 - d = result3\n"
-    "remaining numbers: [result3]\n"
-    "\n"
-    "> Final expression: \\boxed{expression using original numbers}"
+# ---------------------------------------------------------------------------
+#  Prompts injected to elicit an expression from the model.
+# ---------------------------------------------------------------------------
+
+# Appended (side-stream only) during thinking; the leading </think> ends the think block for that request
+THINKING_PHASE_EXPRESSION_PROMPT = (
+    "</think>\nThe expression that I found till now is {"
 )
 
+# Injected after a natural </think> to force the model to emit \boxed{expr}
+FINAL_EXPRESSION_PROMPT = (
+    "\nThe final expression is \\boxed"
+)
+
+
+def _extract_numbers_from_expr(expr: str) -> List[float]:
+    """Return every numeric literal found in *expr*; whole values come back as ``int``."""
+    values = map(float, re.findall(r'\d+\.?\d*', expr))
+    return [int(v) if v == int(v) else v for v in values]
+
+
+def _normalize_number(n) -> float:
+    """Convert *n* to ``float`` so every value is compared as the same type."""
+    return float(n)
+
+
+def verify_expression(expr_str: str, original_numbers: List[float]) -> Tuple[str, bool, List[str], Optional[List[float]]]:
+    """
+    Verify an expression against the Game of 24 rules.
+
+    Args:
+        expr_str: Arithmetic expression (e.g. "1*2", "(3+5)*7/11"); only numbers, + - * / and parentheses are accepted.
+        original_numbers: The four original numbers.
+
+    Returns:
+        (status, is_valid, errors, unused_numbers_or_None)
+        - status: "complete" | "partial" | "error"
+        - is_valid: True if the expression is valid (no errors)
+        - errors: List of error messages
+        - unused_numbers: Numbers from original not used in expr (None if errors)
+    """
+    errors = []
+    fmt = format_number
+
+    # 1. Extract numbers used in the expression
+    used_numbers = _extract_numbers_from_expr(expr_str)
+    if not used_numbers:
+        errors.append(f"No numbers found in expression: {expr_str}")
+        return "error", False, errors, None
+
+    # 2. Check each used number appears in original (at most once)
+    original_copy = [_normalize_number(n) for n in original_numbers]
+    matched_indices = []
+    for used_n in used_numbers:
+        used_norm = _normalize_number(used_n)
+        found = False
+        for i, orig_n in enumerate(original_copy):
+            if i not in matched_indices and is_close(used_norm, orig_n):
+                matched_indices.append(i)
+                found = True
+                break
+        if not found:
+            errors.append(
+                f"Number {fmt(used_norm)} in expression is not available in "
+                f"original numbers {[fmt(n) for n in original_numbers]} "
+                f"(or was already used)"
+            )
+
+    if errors:
+        return "error", False, errors, None
+
+    # 3. Compute unused original numbers
+    unused = [original_copy[i] for i in range(len(original_copy)) if i not in matched_indices]
+
+    # 4. Evaluate the expression.  expr_str is model output, so only plain
+    #    arithmetic may reach eval(); '**', '//' etc. are not Game-of-24 moves.
+    try:
+        if re.search(r"[^\d\s.+\-*/()]|\*\*|//", expr_str):
+            raise ValueError("only numbers, + - * / and parentheses are allowed")
+        value = float(eval(expr_str, {"__builtins__": None}, {}))
+    except Exception as e:
+        errors.append(f"Cannot evaluate expression '{expr_str}': {e}")
+        return "error", False, errors, None
+
+    # 5. Check based on whether all numbers are used
+    if not unused:
+        # Full expression: must equal 24
+        if not is_close(value, 24):
+            errors.append(
+                f"Expression '{expr_str}' evaluates to {fmt(value)}, not 24."
+            )
+            return "error", False, errors, None
+        # Valid complete solution!
+        return "complete", True, [], []
+    else:
+        # Partial expression: check if remaining numbers + result can reach 24
+        remaining = [value] + unused
+        can_reach, _ = can_reach_24(remaining)
+        if not can_reach:
+            remaining_str = [fmt(n) for n in remaining]
+            errors.append(
+                f"Expression '{expr_str}' evaluates to {fmt(value)}. "
+                f"Remaining numbers (including result) are {remaining_str}. "
+                f"Cannot reach 24 from these numbers. This is a dead end."
+            )
+            return "error", False, errors, None
+        # Partial but reachable -- valid
+        return "partial", True, [], unused
+
 
 class ThinkingPhaseStepVerifierGame24Monitor(VerifyMonitor):
     """
-    Monitor that adds thinking-phase verification on top of the standard
-    StepVerifierGame24Monitor behaviour.
+    Monitor that verifies Game-of-24 expressions during and after thinking.
 
-    During thinking: every N newlines -> force ``</think>`` + structured
-        prompt, stream from vLLM, verify.  Roll back on error, commit on
-        success.
+    During thinking: every N double-newlines (after warmup) -> fork a
+        side-stream asking for the current expression, verify it, and
+        give appropriate feedback.
 
-    After natural ``</think>``: inject the same structured prompt, then
-        verify each step (identical to StepVerifierGame24Monitor).
+    After natural ``</think>``: inject expression prompt, verify the
+        final answer.
     """
 
     def __init__(
@@ -93,6 +216,7 @@ def __init__(
         max_corrections: int = 5,
         answer_start_token: str = "</think>",
         async_execution: bool = True,
+        warmup_newlines: int = 0,
     ):
         super().__init__(name)
         self.original_numbers = [float(x) for x in original_numbers]
@@ -102,142 +226,761 @@ def __init__(
         self.max_corrections = max_corrections
         self.answer_start_token = answer_start_token
         self.async_execution = async_execution
+        self.warmup_newlines = warmup_newlines
 
-        # ---- thinking-phase state ----
+        # ---- state ----
         self._think_phase_corrections = 0
+        self._verified_expression = None  # set by Phase 1 early-stop
 
     # ------------------------------------------------------------------
     #  helpers
     # ------------------------------------------------------------------
+    @staticmethod
+    def _fmt(n: float) -> str:
+        """Render *n* as an integer when within 1e-9 of one, else with up to 4 decimals."""
+        nearest = round(n)
+        return str(int(nearest)) if abs(n - nearest) < 1e-9 else f"{n:.4f}".rstrip('0').rstrip('.')
+
     def _count_feedback_blocks(self, text: str) -> int:
         return len(re.findall(r'\[VERIFIER FEEDBACK[^\]]*\]', text))
 
     def _is_in_thinking_phase(self, generated_text: str) -> bool:
         return self.answer_start_token not in generated_text
 
+    @staticmethod
+    def _extract_braced_expression(text: str) -> Optional[str]:
+        """Return the body of the first balanced ``{...}`` group in *text*.
+
+        Brace depth is tracked, so nested groups stay inside the body.  LaTeX
+        operators and implicit multiplication are rewritten to Python syntax.
+        Returns None when no balanced group exists or the body is empty.
+        """
+        opening = text.find('{')
+        if opening == -1:
+            return None
+        depth, closing = 0, -1
+        for pos in range(opening, len(text)):
+            if text[pos] == '{':
+                depth += 1
+            elif text[pos] == '}':
+                depth -= 1
+                if depth == 0:
+                    closing = pos
+                    break
+        body = text[opening + 1:closing].strip() if closing != -1 else ""
+        if not body:
+            return None
+        # Operators are mapped before the LaTeX spacing macros are dropped.
+        for latex, plain in ((r'\times', '*'), (r'\cdot', '*'), (r'\div', '/'), (r'\,', ''), (r'\ ', '')):
+            body = body.replace(latex, plain)
+        frac_pattern = r"\\frac\{([^{}]+)\}\{([^{}]+)\}"
+        # Innermost fractions are rewritten first; repeat until none remain.
+        while re.search(frac_pattern, body):
+            body = re.sub(frac_pattern, r"(\1/\2)", body)
+        # Implicit multiplication: ")(", ")3" and "3(" each gain a "*".
+        implicit_mul = ((r'\)\s*\(', ')*('), (r'\)\s*(\d)', r')*\1'), (r'(\d)\s*\(', r'\1*('))
+        for pattern, replacement in implicit_mul:
+            body = re.sub(pattern, replacement, body)
+        return body
+
+    @staticmethod
+    def _extract_boxed_expression(text: str) -> Optional[str]:
+        """Return the body of the last \\boxed{...} in text; None if absent or unclosed."""
+        start = text.rfind("\\boxed{")
+        if start == -1:
+            return None
+        start += len("\\boxed{")
+        brace_count, end = 1, start
+        while end < len(text) and brace_count > 0:
+            if text[end] == '{':
+                brace_count += 1
+            elif text[end] == '}':
+                brace_count -= 1
+            end += 1
+        if brace_count != 0:
+            return None  # closing brace not generated yet; slicing would drop a real character
+        expr = text[start:end - 1].strip()
+        # Translate LaTeX operators / spacing, then make implicit products explicit.
+        expr = expr.replace(r'\times', '*').replace(r'\cdot', '*').replace(r'\div', '/')
+        expr = expr.replace(r'\,', '').replace(r'\ ', '')
+        frac_pattern = r"\\frac\{([^{}]+)\}\{([^{}]+)\}"
+        while re.search(frac_pattern, expr):
+            expr = re.sub(frac_pattern, r"(\1/\2)", expr)
+        expr = re.sub(r'\)\s*\(', ')*(', expr)
+        expr = re.sub(r'\)\s*(\d)', r')*\1', expr)
+        expr = re.sub(r'(\d)\s*\(', r'\1*(', expr)
+        return expr
+
     # ------------------------------------------------------------------
-    #  _get_current_available / _extract_last_step_info
-    #  (identical to StepVerifierGame24Monitor)
+    #  _side_stream_expression  (streams ~20 tokens to get {expr})
     # ------------------------------------------------------------------
-    def _get_current_available(self, generated_text: str) -> List[float]:
-        if self.answer_start_token not in generated_text:
-            return self.original_numbers.copy()
+    async def _side_stream_expression(self, text_so_far: str, max_new_tokens: int = 20) -> Optional[str]:
+        """
+        Send ``prompt + text_so_far`` to vLLM, stream at most
+        *max_new_tokens* tokens, and try to extract an expression from
+        the output that appears inside ``{ }`` (nested braces allowed).
 
-        text_after_think = generated_text.split(self.answer_start_token)[-1]
+        ``text_so_far`` is expected to end with something like
+        ``</think>\\nThe expression that I found till now is {``
+        so the model just needs to output the expression body and ``}``.
 
-        step_pattern = re.compile(
-            r'>\s*Step\s*(\d+)\s*\n'
-            r'available\s+numbers?\s*:\s*\[([^\]]+)\]\s*\n'
-            r'suggested\s+operation\s*:\s*([^\n]+?)\s*\n'
-            r'remaining\s+numbers?\s*:\s*\[([^\]]+)\]',
-            re.IGNORECASE,
+        Returns the extracted expression string, or None if no balanced, non-empty group was produced.
+        """
+        fmt = self._fmt
+        nums_str = ", ".join(fmt(n) for n in self.original_numbers)
+        logger.info(
+            f"[Side-stream] Starting expression extraction\n"
+            f"  Original numbers : [{nums_str}]\n"
+            f"  Max new tokens   : {max_new_tokens}"
         )
 
-        sections = re.split(
-            r'\[VERIFIER FEEDBACK[^\]]*\]\s*', text_after_think, flags=re.DOTALL
-        )
-        last_section = sections[-1]
-        steps_in_last_section = list(step_pattern.finditer(last_section))
-
-        if not steps_in_last_section:
-            return self.original_numbers.copy()
-
-        last_step = steps_in_last_section[-1]
-        step_num_to_verify = int(last_step.group(1))
-
-        if step_num_to_verify == 1:
-            return self.original_numbers.copy()
-
-        target_step = step_num_to_verify - 1
-
-        for step_match in steps_in_last_section[:-1]:
-            if int(step_match.group(1)) == target_step:
-                try:
-                    return [
-                        float(x.strip())
-                        for x in step_match.group(4).strip().split(',')
-                        if x.strip()
-                    ]
-                except Exception:
-                    pass
-
-        for section in reversed(sections[:-1]):
-            for step_match in reversed(list(step_pattern.finditer(section))):
-                if int(step_match.group(1)) == target_step:
-                    try:
-                        return [
-                            float(x.strip())
-                            for x in step_match.group(4).strip().split(',')
-                            if x.strip()
-                        ]
-                    except Exception:
-                        pass
-
-        return self.original_numbers.copy()
-
-    def _extract_last_step_info(self, generated_text: str):
-        if self.answer_start_token not in generated_text:
-            return None, None
+        payload = deepcopy(self.llm_server["payload"])
+        payload["prompt"] = self.prompt + text_so_far
+        payload["max_tokens"] = max_new_tokens
+        # We don't need logprobs for the side-stream
+        payload.pop("logprobs", None)
+
+        generated = ""
+
+        async with httpx.AsyncClient(timeout=None) as client:
+            async with client.stream(
+                "POST",
+                self.llm_server["url"],
+                headers=self.llm_server["headers"],
+                json=payload,
+            ) as response:
+                async for line in response.aiter_lines():
+                    if line.startswith("data: "):
+                        data = line[len("data: "):].strip()
+                        if data == "[DONE]":
+                            break
+                        chunk = json.loads(data)["choices"][0]["text"]
+                        generated += chunk
+                        logger.debug(f"[Side-stream] chunk: {chunk!r}")
+
+                        # Stop once the '{' opened by the prompt is balanced; a bare '}' may only close a nested group
+                        if self._extract_braced_expression("{" + generated) is not None:
+                            break
+
+        # The model was prompted with a trailing "{" so its output completes the brace.
+        # We wrap it back so _extract_braced_expression can parse it.
+        full_text = "{" + generated
+        expr = self._extract_braced_expression(full_text)
+        if expr:
+            logger.info(f"[Side-stream] Extracted expression: {expr}")
+        else:
+            logger.info(
+                f"[Side-stream] No expression found in side-stream "
+                f"(generated {len(generated)} chars: {generated!r})"
+            )
+        return expr
+
+    # ------------------------------------------------------------------
+    #  step_extractor -- decides WHEN to trigger verification
+    # ------------------------------------------------------------------
+    def step_extractor(self, chunk: str, generated_text: str):
+        """
+        Decide whether verification should run on the text generated so far.
+
+        Thinking phase: fire when the text ends with a double newline and the
+            double-newline count past ``warmup_newlines`` is a multiple of
+            ``newline_threshold`` (never once ``max_corrections`` is used up).
+        After </think>: fire to get the expression prompt injected, then
+            fire again once a closed ``\\boxed{...}`` follows the last
+            verifier feedback block.
+
+        Returns ``(True, generated_text)`` to trigger, else ``(False, None)``.
+        """
+        # ===== PHASE 1: still inside <think> =====
+        if self._is_in_thinking_phase(generated_text):
+            # Correction budget spent: stop probing the thinking trace.
+            if self._think_phase_corrections >= self.max_corrections:
+                return False, None
+
+            total_double_newlines = generated_text.count('\n\n')
+            # Not past warmup yet, or not sitting on a paragraph boundary.
+            if (total_double_newlines < self.warmup_newlines
+                    or not generated_text.endswith('\n\n')):
+                return False, None
+
+            past_warmup = total_double_newlines - self.warmup_newlines
+            if past_warmup % self.newline_threshold != 0:
+                return False, None
+
+            logger.info(
+                f"[step_extractor] Phase 1 trigger: \\n\\n count={total_double_newlines} "
+                f"(warmup={self.warmup_newlines}, past_warmup={past_warmup}, "
+                f"threshold={self.newline_threshold})"
+            )
+            return True, generated_text
+
+        # ===== PHASE 2: after </think> =====
+
+        # 2a: the expression prompt has not been injected yet.
+        prompt_injected = FINAL_EXPRESSION_PROMPT.strip() in generated_text
+        if not prompt_injected:
+            logger.info(
+                "[step_extractor] Phase 2a: </think> detected, "
+                "expression prompt not yet injected."
+            )
+            return True, generated_text
+
+        # 2b: the prompt ends with "\boxed"; wait for the model to close
+        #     the braces before triggering.
+        token = self.answer_start_token
+        tail = generated_text[generated_text.find(token) + len(token):]
+
+        # Only text after the last feedback block counts as a new attempt.
+        feedback_ends = [
+            m.end() for m in re.finditer(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', tail, re.DOTALL)
+        ]
+        if feedback_ends:
+            tail = tail[feedback_ends[-1]:]
+
+        return (True, generated_text) if re.search(r'\\boxed\{[^}]+\}', tail) else (False, None)
+
+    # ------------------------------------------------------------------
+    #  verify
+    # ------------------------------------------------------------------
+    async def verify(self, step: str, token_index: int, event, event_info):
+        """
+        Case 1 -- still in thinking (no </think> in step):
+            Inject ``</think> The expression that I found till now is {``,
+            stream ~20 tokens, extract the expression, verify it.
+            - Error -> feedback ``Wait, <error>.``
+            - Correct & complete -> inject early-stop message
+            - Correct & partial -> do nothing, let model keep thinking
+
+        Case 2a -- natural </think> just appeared, expression prompt not
+            yet injected:
+            Signal fix() to append the expression prompt.
+
+        Case 2b -- after </think> + expression prompt already injected:
+            Verify the expression from the model's output.
+        """
+
+        # ==================================================================
+        # CASE 1: Thinking phase -- side-stream expression verification
+        # ==================================================================
+        if self.answer_start_token not in step:
+            total_dn = step.count('\n\n')
+            logger.info(
+                f"[Phase 1] Thinking-phase verification triggered\n"
+                f"  \\n\\n count  : {total_dn}\n"
+                f"  Thinking len : {len(step)} chars"
+            )
+
+            # Build text with injected prompt for expression extraction
+            text_with_prompt = step + "\n" + THINKING_PHASE_EXPRESSION_PROMPT
+
+            # Side-stream: get expression from the model (~20 tokens)
+            expr_str = await self._side_stream_expression(text_with_prompt, max_new_tokens=20)
+
+            if expr_str is None:
+                # Model didn't produce a parseable expression -- let it keep thinking
+                logger.info(
+                    "[Phase 1] No expression extracted from side-stream. "
+                    "Letting model continue thinking."
+                )
+                return step, None
+
+            # Verify the extracted expression
+            status, is_valid, errors, unused = verify_expression(
+                expr_str, self.original_numbers
+            )
+
+            if not is_valid:
+                # ---- WRONG: inject error feedback into thinking trace ----
+                error_summary = "; ".join(errors)
+                self._think_phase_corrections += 1
+                logger.info(
+                    f"[Phase 1] INVALID expression '{expr_str}'\n"
+                    f"  Error(s) : {error_summary}\n"
+                    f"  Action   : Inject feedback into thinking trace\n"
+                    f"  Corrections: {self._think_phase_corrections}/{self.max_corrections}"
+                )
+                thinking_feedback = (
+                    f"\n\nWait, the expression {expr_str} does not work. "
+                    f"{error_summary} "
+                    f"I must NOT reuse {expr_str} or any expression I have already tried. "
+                    f"Let me try a completely different combination of "
+                    f"operations and grouping of numbers.\n"
+                )
+                if not event.is_set():
+                    event_info["generated_text"] = step
+                    event_info["feedback"] = thinking_feedback
+                    event_info["correction_index"] = token_index
+                    event_info["errors"] = errors
+                    event_info["phase"] = "rollback_to_thinking"
+                    event.set()
+                return step, thinking_feedback
 
-        text_after_think = generated_text.split(self.answer_start_token)[-1]
-        sections = re.split(
-            r'\[VERIFIER FEEDBACK[^\]]*\]\s*', text_after_think, flags=re.DOTALL
+            elif status == "complete":
+                # ---- CORRECT & COMPLETE: early-stop, push to answer ----
+                self._verified_expression = expr_str
+                logger.info(
+                    f"[Phase 1] VALID COMPLETE expression '{expr_str}' == 24\n"
+                    f"  Action: Inject early-stop message and transition to answer."
+                )
+                early_stop_msg = (
+                    f"\n\nWait, the expression {expr_str} has been verified "
+                    f"to equal 24 using all the given numbers. This will be "
+                    f"my final answer.\n{self.answer_start_token}\n"
+                )
+                if not event.is_set():
+                    event_info["generated_text"] = step
+                    event_info["feedback"] = early_stop_msg
+                    event_info["correction_index"] = token_index
+                    event_info["phase"] = "early_stop_answer"
+                    event_info["verified_expression"] = expr_str
+                    event.set()
+                return step, early_stop_msg
+
+            else:
+                # ---- CORRECT & PARTIAL: let model keep thinking ----
+                unused_str = (
+                    "[" + ", ".join(self._fmt(n) for n in unused) + "]"
+                    if unused else "[]"
+                )
+                logger.info(
+                    f"[Phase 1] VALID PARTIAL expression '{expr_str}'\n"
+                    f"  Unused numbers: {unused_str}\n"
+                    f"  Action: No error, let model keep thinking."
+                )
+                return step, None
+
+        # ==================================================================
+        # CASE 2a: </think> present but expression prompt not yet injected
+        # ==================================================================
+        if FINAL_EXPRESSION_PROMPT.strip() not in step:
+            logger.info(
+                "[Phase 2a] Natural </think> detected. "
+                "Injecting expression extraction prompt."
+            )
+            prompt_text = FINAL_EXPRESSION_PROMPT
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = prompt_text
+                event_info["correction_index"] = token_index
+                event_info["phase"] = "inject_expression_prompt"
+                event.set()
+            return step, prompt_text
+
+        # ==================================================================
+        # CASE 2b: After </think> + expression prompt -- verify final answer
+        # ==================================================================
+
+        # Max-corrections guard
+        num_corrections = self._count_feedback_blocks(step)
+        if num_corrections >= self.max_corrections:
+            fb = "\nthe answer is \\boxed{no solution}"
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = fb
+                event_info["correction_index"] = token_index
+                event_info["errors"] = ["Max corrections reached"]
+                event_info["phase"] = "standard_verify"
+                event.set()
+            return step, fb
+
+        # Extract expression from \boxed{...} — only look at text after
+        # the last feedback block to avoid re-extracting old expressions.
+        think_end_pos = step.find(self.answer_start_token) + len(self.answer_start_token)
+        text_after_think = step[think_end_pos:]
+        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
+        last_feedback_end = 0
+        for match in feedback_pattern.finditer(text_after_think):
+            last_feedback_end = match.end()
+        recent_text = text_after_think[last_feedback_end:]
+
+        expr_str = self._extract_boxed_expression(recent_text)
+        if expr_str is not None:
+            logger.info(f"[Phase 2b] Extracted expression from \\boxed: '{expr_str}'")
+
+        if expr_str is None:
+            return step, None
+
+        # Verify the final expression (must use all 4 numbers and equal 24)
+        status, is_valid, errors, unused = verify_expression(
+            expr_str, self.original_numbers
         )
-        text = sections[-1]
 
-        step_pattern = re.compile(
-            r'(>\s*Step\s*(\d+)\s*\n'
-            r'available\s+numbers?\s*:\s*\[([^\]]+)\]\s*\n'
-            r'suggested\s+operation\s*:\s*([^\n]+?)\s*\n'
-            r'remaining\s+numbers?\s*:\s*\[([^\]]+)\])',
-            re.IGNORECASE,
+        if is_valid and status == "complete":
+            logger.info(f"[Phase 2b] Final expression '{expr_str}' is correct (= 24)")
+            # Signal STOP so the model doesn't keep generating
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = ""  # nothing to append
+                event_info["correction_index"] = token_index
+                event_info["phase"] = "final_answer_correct"
+                event_info["verified_expression"] = expr_str
+                event.set()
+            return step, None
+
+        # Build error messages for partial/wrong answers in phase 2
+        if is_valid and status == "partial":
+            # In phase 2 (after </think>) we need ALL numbers used
+            used_numbers = _extract_numbers_from_expr(expr_str)
+            errors = [
+                f"Expression '{expr_str}' only uses {len(used_numbers)} of "
+                f"{len(self.original_numbers)} numbers. After </think>, "
+                f"a COMPLETE expression using ALL numbers is required."
+            ]
+
+        if not errors:
+            errors = [f"Expression '{expr_str}' is not a valid solution."]
+
+        error_summary = "; ".join(errors)
+        logger.info(f"[Phase 2b] Final expression FAILED: {error_summary}")
+
+        orig_display = [int(n) if n == int(n) else n for n in self.original_numbers]
+        nums_str = ", ".join(str(n) for n in orig_display)
+        feedback = (
+            f"\n[VERIFIER FEEDBACK:\n"
+            f"  The expression {expr_str} is incorrect. {error_summary}\n"
+            f"  Do NOT reuse {expr_str} or any previously tried expression.\n"
+            f"  Try a completely different approach. Use ALL four numbers "
+            f"{nums_str} exactly once, "
+            f"evaluating to 24. Wrap in \\boxed{{}}. ]\n"
         )
-        all_steps = list(step_pattern.finditer(text))
-        if not all_steps:
-            return None, None
-
-        last_step = all_steps[-1]
-        step_num = int(last_step.group(2))
-        step_text = (
-            f">Step{step_num}\n"
-            f"available numbers: [{last_step.group(3).strip()}]\n"
-            f"suggested operation: {last_step.group(4).strip()}\n"
-            f"remaining numbers: [{last_step.group(5).strip()}]"
+        if not event.is_set():
+            event_info["generated_text"] = step
+            event_info["feedback"] = feedback
+            event_info["correction_index"] = token_index
+            event_info["errors"] = errors
+            event_info["phase"] = "standard_verify"
+            event.set()
+        return step, feedback
+
+    # ------------------------------------------------------------------
+    #  fix
+    # ------------------------------------------------------------------
+    async def fix(self, generated_text: str, event_info: dict, fix_method=None):
+        """
+        Return the text generation resumes from, built from
+        ``event_info["generated_text"]`` according to ``event_info["phase"]``:
+
+        - ``rollback_to_thinking`` / ``early_stop_answer``: right-strip the text,
+          then append ``event_info["feedback"]``.
+        - ``final_answer_correct``: return the text unchanged (nothing appended).
+        - ``inject_expression_prompt`` / ``standard_verify`` (default): append feedback.
+        """
+        phase = event_info.get("phase", "standard_verify")
+
+        if phase == "rollback_to_thinking":
+            base_text = event_info["generated_text"]
+            result = base_text.rstrip() + event_info["feedback"]
+            logger.info(
+                f"[fix] Phase: rollback_to_thinking\n"
+                f"  -> Appended error feedback into <think> trace.\n"
+                f"  -> Think-phase corrections: {self._think_phase_corrections}/{self.max_corrections}"
+            )
+            return result
+
+        if phase == "early_stop_answer":
+            base_text = event_info["generated_text"]
+            result = base_text.rstrip() + event_info["feedback"]
+            logger.info(
+                f"[fix] Phase: early_stop_answer\n"
+                f"  -> Verified expression passed. Injecting early-stop + </think>.\n"
+                f"  -> Model will now generate the final answer."
+            )
+            return result
+
+        if phase == "final_answer_correct":
+            expr = event_info.get("verified_expression", "?")
+            logger.info(
+                f"[fix] Phase: final_answer_correct\n"
+                f"  -> Final expression '{expr}' verified correct. Stopping generation."
+            )
+            return event_info["generated_text"]
+
+        if phase == "inject_expression_prompt":
+            logger.info(
+                f"[fix] Phase: inject_expression_prompt\n"
+                f"  -> Natural </think> detected.\n"
+                f"  -> Appending expression extraction prompt."
+            )
+            return event_info["generated_text"] + event_info["feedback"]
+
+        # standard_verify
+        errors = event_info.get("errors", [])
+        error_summary = "; ".join(errors) if errors else "unknown"
+        logger.info(
+            f"[fix] Phase: standard_verify\n"
+            f"  -> Expression failed: {error_summary}\n"
+            f"  -> Appending [VERIFIER FEEDBACK] so model retries."
         )
-        return step_num, parse_step(step_text)
+        return event_info["generated_text"] + event_info["feedback"]
 
-    def _count_complete_steps(self, text: str) -> int:
-        """Return how many complete step blocks are in the text."""
-        step_pattern = re.compile(
-            r'>\s*Step\s*\d+\s*\n'
-            r'available\s+numbers?\s*:\s*\[([^\]]+)\]\s*\n'
-            r'suggested\s+operation\s*:\s*([^\n]+?)\s*\n'
-            r'remaining\s+numbers?\s*:\s*\[([^\]]+)\]',
-            re.IGNORECASE,
+
+# =====================================================================
+#  Maze Thinking-Phase Prompts
+# =====================================================================
+
+
+def _build_maze_format_block(question_type: str) -> str:
+    """
+    Return the ``<format>...</format>`` template for *question_type*.
+
+    ``"relative_position"`` yields a LOCATE / COMPARE / FINAL ANSWER
+    layout; every other value yields a LOCATE / STEP 1 / FINAL ANSWER
+    layout whose running count also carries ``Total=0`` when the type
+    is ``"total_turns"``.
+    """
+    # Opening and closing lines are identical in both templates.
+    opening = (
+        "<format>\n"
+        ">>> LOCATE START AND EXIT:\n"
+        "    S position: (row, col)\n"
+        "    E position: (row, col)\n"
+        "\n"
+    )
+    closing = (
+        ">>> FINAL ANSWER:\n"
+        "    \\boxed{LETTER}\n"
+        "</format>"
+    )
+    if question_type != "relative_position":
+        totals = ", Total=0" if question_type == "total_turns" else ""
+        walk = "\n".join([
+            ">>> STEP 1: Move DOWN from (r1, c1) to (r2, c2)",
+            "    Current position: (r2, c2)",
+            "    Previous direction: —",
+            "    Current direction: DOWN",
+            "    Turn type: STRAIGHT",
+            f"    Running count: Right=0, Left=0{totals}",
+            "",
+            "[... continue for all steps until reaching E ...]",
+            "",
+            "",
+        ])
+        return opening + walk + closing
+    else:
+        return (
+            opening
+            + ">>> COMPARE POSITIONS:\n"
+            + "    Row comparison: E row (r) vs S row (r) → E is ABOVE/BELOW S\n"
+            + "    Col comparison: E col (c) vs S col (c) → E is LEFT/RIGHT of S\n"
+            + "\n" + closing
         )
-        return len(step_pattern.findall(text))
+
+
+def _build_maze_thinking_phase_prompt(question_type: str) -> str:
+    """
+    Return the side-stream prompt appended to an unfinished thinking trace.
+
+    The text is phrased in the first person so it reads as part of the
+    ``<think>`` trace, embeds the ``<format>`` template for
+    *question_type*, and ends with the LOCATE header for the model to
+    continue from.
+    """
+    lead_in = (
+        "\n\nLet me output the current steps I have traced so far "
+        "through the maze in the following format:\n"
+    )
+    template = _build_maze_format_block(question_type)
+    return lead_in + template + "\n" + ">>> LOCATE START AND EXIT:\n"
+
+
+def _build_maze_structured_prompt(question_type: str) -> str:
+    """
+    Return the structured-format prompt injected after ``</think>``.
+
+    It consists of a first-person lead-in, the ``<format>`` template for
+    *question_type*, and the LOCATE header the model continues from.
+    """
+    pieces = [
+        "\nLet me trace the step by step solution through the maze ",
+        "in the following format:\n",
+        _build_maze_format_block(question_type),
+        "\n",
+        ">>> LOCATE START AND EXIT:\n",
+    ]
+    return "".join(pieces)
+
+
+# =====================================================================
+#  ThinkingPhaseStepVerifierMazeMonitor
+# =====================================================================
+
+class ThinkingPhaseStepVerifierMazeMonitor(VerifyMonitor):
+    """
+    Monitor that verifies maze path-tracing during and after thinking.
+
+    **No meta-prompt required** — works with a plain user prompt containing
+    just the maze and question.  Structure is injected by this monitor
+    after ``</think>`` (natural or early-stop), exactly like Game24
+    injects its step format.
+
+    Phase 1 – During ``<think>...</think>``:
+        Every N double-newlines (after warmup), fork a side-stream that
+        injects ``</think>`` + a structured step prompt, stream ~300
+        tokens, parse and verify each step against the maze grid.
+
+    Phase 2a – ``</think>`` detected, structured prompt not yet injected:
+        Inject the structured step-by-step format template so the model
+        fills it in (LOCATE → STEPs → FINAL ANSWER → ``\\boxed{}``).
+
+    Phase 2b – Structured prompt injected, model is generating:
+        Verify each completed step as it appears.  Once ``\\boxed{}``
+        appears, signal completion.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        grid: list,
+        start_pos: tuple,
+        exit_pos: tuple,
+        llm_server: dict,
+        prompt: str,
+        question_type: str = "right_turns",
+        newline_threshold: int = 10,
+        max_corrections: int = 5,
+        answer_start_token: str = "</think>",
+        async_execution: bool = True,
+        warmup_newlines: int = 0,
+    ):
+        super().__init__(name)
+        # ---- maze definition and model endpoint ----
+        self.grid, self.start_pos, self.exit_pos = grid, start_pos, exit_pos
+        self.llm_server, self.prompt = llm_server, prompt
+        self.question_type = question_type
+
+        # ---- verification schedule and limits ----
+        self.newline_threshold = newline_threshold
+        self.warmup_newlines = warmup_newlines
+        self.max_corrections = max_corrections
+        self.answer_start_token = answer_start_token
+        self.async_execution = async_execution
+
+        # Template appended once </think> has been seen.
+        self._structured_prompt = _build_maze_structured_prompt(question_type)
+        # First-person variant used by the thinking-phase side-stream.
+        self._thinking_phase_prompt = _build_maze_thinking_phase_prompt(question_type)
+        # Marker searched for after </think> to tell whether the template is present.
+        self._structured_marker = ">>> LOCATE START AND EXIT:"
+
+        # ---- mutable state ----
+        self._think_phase_corrections = 0
+        self._verified_path_complete = False  # True if path reaches E
 
     # ------------------------------------------------------------------
-    #  _stream_and_verify_steps
+    #  helpers
     # ------------------------------------------------------------------
-    async def _stream_and_verify_steps(self, text_so_far: str):
+    def _count_feedback_blocks(self, text: str) -> int:  # number of [VERIFIER FEEDBACK ...] blocks in text
+        return sum(1 for _ in re.finditer(r'\[VERIFIER FEEDBACK[^\]]*\]', text))
+
+    def _is_in_thinking_phase(self, generated_text: str) -> bool:  # True until the answer token shows up
+        return generated_text.find(self.answer_start_token) == -1
+
+    def _structured_prompt_injected(self, generated_text: str) -> bool:
+        """Return True when the structured marker occurs after the first answer token."""
+        _before, token, tail = generated_text.partition(self.answer_start_token)
+        # partition() yields an empty separator when the token is absent.
+        found = bool(token) and self._structured_marker in tail
+        return found
+
+    @staticmethod
+    def detect_question_type(prompt: str) -> str:
+        """Guess the question type from keywords in *prompt* (case-insensitive)."""
+        text = prompt.lower()
+        asks_right = any(k in text for k in ("right turn", "right-turn"))
+        asks_left = any(k in text for k in ("left turn", "left-turn"))
+        # Rules are checked in priority order; explicit right-turn wording wins.
+        if asks_right:
+            return "right_turns"
+        if asks_left or ("total" in text and "turn" in text):
+            return "total_turns"
+        # Any other mention of turns falls back to right-turn counting.
+        return "right_turns" if "turn" in text else "relative_position"
+
+    # ------------------------------------------------------------------
+    #  _parse_steps_from_text – parse structured steps from side-stream
+    # ------------------------------------------------------------------
+    def _parse_steps_from_text(self, text: str) -> list:
         """
-        Stream from the vLLM server with ``prompt + text_so_far`` (which
-        already ends with the structured output prompt).
+        Parse every structured ``>>> STEP n: Move ...`` block found in *text*.
+
+        Returns the truthy ``parse_maze_step`` results, in order of appearance.
+        """
+        steps = []
+        # NOTE: with DOTALL the lazy ``.*?`` runs to the next "Running count: Right=.., Left=.." line, even one in a later step.
+        step_pattern = re.compile(
+            r'>>>\s*STEP\s+(\d+):\s*Move\s+\w+\s+from\s+\([^)]+\)\s+to\s+\([^)]+\).*?'
+            r'Running count:\s*Right\s*=\s*\d+\s*,\s*Left\s*=\s*\d+[^\n]*',
+            re.IGNORECASE | re.DOTALL
+        )
+        # Keep only the matches that parse_maze_step returns a truthy value for.
+        for match in step_pattern.finditer(text):
+            parsed = parse_maze_step(match.group(0))
+            if parsed:
+                steps.append(parsed)
 
-        As each complete step block appears, verify it immediately.
-        - If a step is WRONG -> stop streaming, return the error info.
-        - If all steps pass and the model finishes -> return full text.
+        return steps
+
+    def _verify_all_steps(self, steps):
+        """
+        Verify parsed maze steps in order, starting at ``self.start_pos`` with no direction and zero counts.
 
         Returns:
-            (full_text, is_all_valid, error_info_or_None)
+            (all_valid, first_error_step_num, errors, final_pos, final_dir,
+             right_count, left_count, total_count)
+        """
+        pos = self.start_pos
+        direction = Direction.NONE
+        right_count = 0
+        left_count = 0
+        total_count = 0
+        # Feed each step the state accumulated from the steps before it.
+        for step in steps:
+            is_valid, errors, state = verify_maze_step(
+                step=step,
+                grid=self.grid,
+                expected_from_pos=pos,
+                prev_direction=direction,
+                expected_right_count=right_count,
+                expected_left_count=left_count,
+                expected_total_count=total_count,
+            )
+            # Stop at the first failure; the returned state is the one before this step.
+            if not is_valid:
+                return (False, step.get('step_num', 0), errors,
+                        pos, direction, right_count, left_count, total_count)
+            # Advance the running state to what the verifier reported.
+            pos = state['new_pos']
+            direction = state['new_direction']
+            right_count = state['new_right']
+            left_count = state['new_left']
+            total_count = state['new_total']
+        # No step failed (this is also the result for an empty list).
+        return (True, None, [], pos, direction,
+                right_count, left_count, total_count)
+
+    # ------------------------------------------------------------------
+    #  _side_stream_maze_steps – streams tokens to get traced path
+    # ------------------------------------------------------------------
+    async def _side_stream_maze_steps(self, text_so_far: str, max_new_tokens: int = 300) -> str:
+        """
+        Send ``prompt + text_so_far`` to vLLM, stream at most
+        *max_new_tokens* tokens, and return the generated text.
+
+        ``text_so_far`` is expected to end with the structured maze step
+        prompt so the model outputs its traced steps.
         """
+        logger.info(
+            f"[Maze Side-stream] Starting path extraction\n"
+            f"  Maze: S={self.start_pos}, E={self.exit_pos}\n"
+            f"  Max new tokens: {max_new_tokens}"
+        )
+
         payload = deepcopy(self.llm_server["payload"])
         payload["prompt"] = self.prompt + text_so_far
-        payload["max_tokens"] = min(payload.get("max_tokens", 2048), 2048)
+        payload["max_tokens"] = max_new_tokens
+        payload.pop("logprobs", None)
 
         generated = ""
-        last_verified_step_count = 0
 
         async with httpx.AsyncClient(timeout=None) as client:
             async with client.stream(
@@ -253,74 +996,66 @@ async def _stream_and_verify_steps(self, text_so_far: str):
                             break
                         chunk = json.loads(data)["choices"][0]["text"]
                         generated += chunk
-                        logger.debug(f"[vLLM side-stream] chunk: {chunk!r}")
-
-                        # Check if a new complete step appeared
-                        current_step_count = self._count_complete_steps(generated)
-                        if current_step_count > last_verified_step_count:
-                            full_text = text_so_far + generated
-                            step_num, parsed = self._extract_last_step_info(full_text)
-
-                            if (step_num is not None
-                                    and parsed is not None
-                                    and parsed.get('available_numbers') is not None):
-                                current_available = self._get_current_available(full_text)
-                                is_valid, errors, new_available = verify_step(
-                                    parsed, current_available,
-                                    self.original_numbers, step_num,
-                                )
-
-                                if not is_valid:
-                                    logger.info(
-                                        f"[ThinkingPhaseVerifier] Side-stream: "
-                                        f"Step {step_num} FAILED: {errors}"
-                                    )
-                                    return (
-                                        full_text,
-                                        False,
-                                        {"step_num": step_num,
-                                         "errors": errors,
-                                         "available": current_available},
-                                    )
-                                else:
-                                    logger.info(
-                                        f"[ThinkingPhaseVerifier] Side-stream: "
-                                        f"Step {step_num} verified OK"
-                                    )
-
-                            last_verified_step_count = current_step_count
-
-        full_text = text_so_far + generated
+                        logger.debug(f"[Maze Side-stream] chunk: {chunk!r}")
+
+                        # Stop if we see FINAL ANSWER or \boxed
+                        if '\\boxed' in generated or '>>> FINAL ANSWER' in generated:
+                            break
+
         logger.info(
-            f"[ThinkingPhaseVerifier] Side-stream finished. "
-            f"Generated {len(generated)} chars, "
-            f"{last_verified_step_count} steps verified."
+            f"[Maze Side-stream] Generated {len(generated)} chars"
         )
-        return full_text, True, None
+        return generated
+
+    # ------------------------------------------------------------------
+    #  _extract_boxed_answer
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _extract_boxed_answer(text: str) -> Optional[str]:
+        """
+        Return the stripped content of the last ``\\boxed{...}`` in *text*, or None if absent.
+
+        Nested braces are balanced; an unclosed box yields all text after its ``{``.
+        """
+        start = text.rfind('\\boxed{')
+        if start < 0:
+            return None
+        start += len('\\boxed{')
+        depth = 1
+        for end, ch in enumerate(text[start:], start):
+            depth += (ch == '{') - (ch == '}')
+            if depth == 0:
+                return text[start:end].strip()
+        return text[start:].strip()  # never closed: keep the last character too
 
     # ------------------------------------------------------------------
     #  step_extractor
     # ------------------------------------------------------------------
     def step_extractor(self, chunk: str, generated_text: str):
         """
-        Phase 1 (thinking): trigger when total newlines cross the next
-            multiple of ``newline_threshold``.
-        Phase 2 (after </think>): trigger when a natural ``</think>``
-            is detected (to inject the structured prompt), or when a
-            complete step block appears (for verification).
+        Phase 1 (thinking): trigger at every newline_threshold multiple
+            (after warmup).
+        Phase 2 (after </think>): trigger on structured steps or boxed
+            answer.
         """
         # ===== PHASE 1: still inside <think> =====
         if self._is_in_thinking_phase(generated_text):
             if self._think_phase_corrections >= self.max_corrections:
                 return False, None
 
-            total_newlines = generated_text.count('\n')
+            total_double_newlines = generated_text.count('\n\n')
+
+            if total_double_newlines < self.warmup_newlines:
+                return False, None
 
-            if chunk.endswith('\n') and total_newlines > 0 and total_newlines % self.newline_threshold == 0:
+            past_warmup = total_double_newlines - self.warmup_newlines
+            if (generated_text.endswith('\n\n')
+                    and past_warmup >= 0
+                    and past_warmup % self.newline_threshold == 0):
                 logger.info(
-                    f"[ThinkingPhaseVerifier] Total newlines={total_newlines}, "
-                    f"hit multiple of N={self.newline_threshold}. "
-                    f"Forcing step generation."
+                    f"[Maze step_extractor] Phase 1 trigger: \\n\\n count={total_double_newlines} "
+                    f"(warmup={self.warmup_newlines}, past_warmup={past_warmup}, "
+                    f"threshold={self.newline_threshold})"
                 )
                 return True, generated_text
 
@@ -328,151 +1063,250 @@ def step_extractor(self, chunk: str, generated_text: str):
 
         # ===== PHASE 2: after </think> =====
 
-        # Sub-case 2a: </think> is present but structured prompt is not
-        # -> trigger so verify() can inject it.
-        if STRUCTURED_OUTPUT_PROMPT not in generated_text:
+        # 2a: structured prompt not yet injected → trigger immediately
+        if not self._structured_prompt_injected(generated_text):
             logger.info(
-                "[ThinkingPhaseVerifier] </think> present but structured "
-                "output prompt missing - will inject it."
+                "[Maze step_extractor] Phase 2a: </think> detected, "
+                "structured prompt not yet injected."
             )
             return True, generated_text
 
-        # Sub-case 2b: structured prompt already injected -> detect
-        # complete steps for verification (same as StepVerifierGame24Monitor).
+        # 2b: structured prompt injected — verify steps / boxed answer
         think_end_pos = generated_text.find(self.answer_start_token) + len(self.answer_start_token)
         text_after_think = generated_text[think_end_pos:]
 
+        # Strip out the injected <format>...</format> template so we only
+        # look at actual model output (which starts after the last
+        # ">>> LOCATE START AND EXIT:\n" line that ends the injected prompt).
+        last_marker_pos = text_after_think.rfind(self._structured_marker)
+        if last_marker_pos >= 0:
+            # Model output starts right after the marker line
+            model_output_start = last_marker_pos + len(self._structured_marker)
+            text_after_think = text_after_think[model_output_start:]
+            text_start_offset = think_end_pos + model_output_start
+        else:
+            text_start_offset = think_end_pos
+
+        # Skip past feedback blocks
         feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
         last_feedback_end = 0
         for match in feedback_pattern.finditer(text_after_think):
             last_feedback_end = match.end()
-
         text = text_after_think[last_feedback_end:]
-        text_start_in_generated = think_end_pos + last_feedback_end
-
-        step_pattern = re.compile(
-            r'(>\s*Step\s*(\d+)\s*\n'
-            r'available\s+numbers?\s*:\s*\[([^\]]+)\]\s*\n'
-            r'suggested\s+operation\s*:\s*([^\n]+?)\s*\n'
-            r'remaining\s+numbers?\s*:\s*\[([^\]]+)\])',
-            re.IGNORECASE,
-        )
-        all_steps = list(step_pattern.finditer(text))
-        if not all_steps:
-            return False, None
+        text_start = text_start_offset + last_feedback_end
+
+        # For turn-counting questions, check for structured steps
+        if self.question_type in ("right_turns", "total_turns"):
+            # Check for complete step (with Running count including Right=N, Left=N)
+            step_pattern = re.compile(
+                r'(>>>\s*STEP\s+(\d+):\s*Move\s+\w+\s+from\s+\([^)]+\)\s+to\s+\([^)]+\).*?'
+                r'Running count:\s*Right\s*=\s*\d+\s*,\s*Left\s*=\s*\d+[^\n]*)',
+                re.IGNORECASE | re.DOTALL
+            )
+            all_steps = list(step_pattern.finditer(text))
+
+            if all_steps:
+                last_step = all_steps[-1]
+                # Check if next step started (current already verified)
+                text_after = text[last_step.end():]
+                next_step = re.search(r'>>>\s*STEP\s+\d+', text_after, re.IGNORECASE)
+                if not next_step:
+                    end_pos = text_start + last_step.end()
+                    return True, generated_text[:end_pos]
+                return False, None
 
-        last_complete_step = all_steps[-1]
+            # Check LOCATE section
+            locate_pattern = re.compile(
+                r'(LOCATE START AND EXIT.*?E position:\s*\([^)]+\))',
+                re.IGNORECASE | re.DOTALL
+            )
+            locate_match = locate_pattern.search(text)
+            if locate_match:
+                step1_start = re.search(r'>>>\s*STEP\s+1', text[locate_match.end():], re.IGNORECASE)
+                if step1_start:
+                    end_pos = text_start + locate_match.end()
+                    return True, generated_text[:end_pos]
 
-        # Already moved past this step?
-        text_after_last_step = text[last_complete_step.end():]
-        if re.search(r'>\s*Step\s*\d+', text_after_last_step, re.IGNORECASE):
-            return False, None
+        # Check for boxed answer (any question type)
+        boxed = re.search(r'\\boxed\{[^}]+\}', text)
+        if boxed:
+            end_pos = text_start + boxed.end()
+            return True, generated_text[:end_pos]
 
-        end_pos = text_start_in_generated + last_complete_step.end()
-        return True, generated_text[:end_pos]
+        return False, None
 
     # ------------------------------------------------------------------
     #  verify
     # ------------------------------------------------------------------
     async def verify(self, step: str, token_index: int, event, event_info):
         """
-        Case 1 - still in thinking (no </think> in step):
-            Inject </think> + structured prompt, stream from vLLM to get
-            steps, verify them, then either rollback (wrong) or commit
-            (correct).
-
-        Case 2a - natural </think> just appeared, structured prompt not
-            yet injected:
-            Signal fix() to append the structured output prompt.
-
-        Case 2b - after </think> + structured prompt already injected:
-            Identical to StepVerifierGame24Monitor - verify each step.
+        Case 1 -- still in thinking (no </think>):
+            Fork side-stream to get traced path steps, verify each.
+        Case 2 -- after </think>:
+            Verify structured steps and/or final answer.
         """
 
         # ==================================================================
-        # CASE 1: Thinking phase
+        # CASE 1: Thinking phase – side-stream path verification
         # ==================================================================
         if self.answer_start_token not in step:
+            total_dn = step.count('\n\n')
             logger.info(
-                "[ThinkingPhaseVerifier] Injecting </think> + structured "
-                "prompt and streaming steps from vLLM inside verify()"
+                f"[Maze Phase 1] Thinking-phase verification triggered\n"
+                f"  \\n\\n count  : {total_dn}\n"
+                f"  Thinking len : {len(step)} chars"
             )
 
-            # Build text with injected </think> + structured prompt
-            text_with_think_end = (
-                step + "\n" + self.answer_start_token + "\n"
-                + STRUCTURED_OUTPUT_PROMPT + "\n"
+            # Build text with injected prompt for step extraction
+            # Uses the LLM's own voice: "Let me output the current steps..."
+            text_with_prompt = step + self._thinking_phase_prompt
+
+            # Side-stream: get path steps from the model
+            side_output = await self._side_stream_maze_steps(
+                text_with_prompt, max_new_tokens=300
             )
 
-            # Stream from vLLM, verifying each step as it appears
-            full_text, is_all_valid, error_info = await self._stream_and_verify_steps(
-                text_with_think_end
+            if not side_output or len(side_output.strip()) < 20:
+                logger.info(
+                    "[Maze Phase 1] Insufficient output from side-stream. "
+                    "Letting model continue thinking."
+                )
+                return step, None
+
+            # Combine the prompt header with side output for parsing
+            full_side_text = (
+                ">>> LOCATE START AND EXIT:\n" + side_output
+            )
+
+            # First verify LOCATE section
+            locate_valid, locate_errors = verify_locate_section(
+                full_side_text, self.start_pos, self.exit_pos
             )
 
-            if is_all_valid:
-                # All steps correct -> inject </think> + structured prompt
-                # and let stream_completion recurse so the model generates
-                # verified steps that Phase 2b checks.
+            if not locate_valid:
+                self._think_phase_corrections += 1
+                error_summary = "; ".join(locate_errors)
                 logger.info(
-                    "[ThinkingPhaseVerifier] All side-streamed steps verified OK "
-                    "- injecting </think> + structured prompt"
+                    f"[Maze Phase 1] LOCATE section errors: {error_summary}\n"
+                    f"  Action: Inject feedback into thinking trace\n"
+                    f"  Corrections: {self._think_phase_corrections}/{self.max_corrections}"
+                )
+                thinking_feedback = (
+                    f"\n\nWait, I think I have the wrong positions. "
+                    f"{error_summary}. "
+                    f"Let me re-examine the maze grid carefully to find S and E.\n"
                 )
                 if not event.is_set():
                     event_info["generated_text"] = step
-                    event_info["feedback"] = self.answer_start_token
+                    event_info["feedback"] = thinking_feedback
                     event_info["correction_index"] = token_index
-                    event_info["phase"] = "inject_think_end"
+                    event_info["errors"] = locate_errors
+                    event_info["phase"] = "rollback_to_thinking"
                     event.set()
-                return step, self.answer_start_token
+                return step, thinking_feedback
 
-            else:
-                # Step is WRONG -> rollback into thinking
-                errors = error_info["errors"]
-                step_num = error_info["step_num"]
+            # Parse and verify steps
+            steps = self._parse_steps_from_text(full_side_text)
+
+            if not steps:
                 logger.info(
-                    f"[ThinkingPhaseVerifier] Step {step_num} FAILED: {errors}"
+                    "[Maze Phase 1] No structured steps found in side-stream. "
+                    "Letting model continue thinking."
                 )
+                return step, None
+
+            (all_valid, err_step_num, errors, final_pos,
+             final_dir, r_count, l_count, t_count) = self._verify_all_steps(steps)
+
+            if not all_valid:
                 error_summary = "; ".join(errors)
+                self._think_phase_corrections += 1
+                logger.info(
+                    f"[Maze Phase 1] INVALID step {err_step_num}\n"
+                    f"  Error(s) : {error_summary}\n"
+                    f"  Action   : Inject feedback into thinking trace\n"
+                    f"  Corrections: {self._think_phase_corrections}/{self.max_corrections}"
+                )
                 thinking_feedback = (
-                    f"\n\nWait, that approach is wrong. {error_summary}. "
-                    f"Let me reconsider and try a different approach.\n"
+                    f"\n\nWait, I made an error at Step {err_step_num}. "
+                    f"{error_summary}. "
+                    f"Let me re-trace the path more carefully from the correct position.\n"
                 )
                 if not event.is_set():
                     event_info["generated_text"] = step
                     event_info["feedback"] = thinking_feedback
                     event_info["correction_index"] = token_index
                     event_info["errors"] = errors
-                    event_info["failed_step"] = step_num
                     event_info["phase"] = "rollback_to_thinking"
                     event.set()
                 return step, thinking_feedback
 
+            # All steps valid — check if path is complete (reached E)
+            if final_pos == self.exit_pos:
+                self._verified_path_complete = True
+                logger.info(
+                    f"[Maze Phase 1] VALID COMPLETE path to E={self.exit_pos}\n"
+                    f"  Steps: {len(steps)}, Right={r_count}, Left={l_count}, Total={t_count}\n"
+                    f"  Action: Inject early-stop + </think> + structured format."
+                )
+                # Include the structured prompt directly after </think>
+                # so the model immediately starts filling in the answer format
+                # (skips the separate Phase 2a injection round-trip).
+                early_stop_msg = (
+                    f"\n\nWait, I have successfully traced the path from "
+                    f"S={self.start_pos} to E={self.exit_pos} with "
+                    f"{len(steps)} steps. "
+                    f"Right turns={r_count}, Left turns={l_count}, "
+                    f"Total turns={t_count}. "
+                    f"This path has been verified as correct. "
+                    f"Let me give the final answer.\n"
+                    f"{self.answer_start_token}"
+                    f"{self._structured_prompt}"
+                )
+                if not event.is_set():
+                    event_info["generated_text"] = step
+                    event_info["feedback"] = early_stop_msg
+                    event_info["correction_index"] = token_index
+                    event_info["phase"] = "early_stop_answer"
+                    event_info["verified_counts"] = {
+                        "right": r_count,
+                        "left": l_count,
+                        "total": t_count,
+                        "steps": len(steps),
+                    }
+                    event.set()
+                return step, early_stop_msg
+
+            else:
+                logger.info(
+                    f"[Maze Phase 1] VALID PARTIAL path\n"
+                    f"  Current pos: {final_pos}, Target: {self.exit_pos}\n"
+                    f"  Steps so far: {len(steps)}\n"
+                    f"  Action: No error, let model keep thinking."
+                )
+                return step, None
+
         # ==================================================================
-        # CASE 2a: </think> present but structured prompt missing
+        # CASE 2a: </think> present but structured prompt not yet injected
         # ==================================================================
-        if STRUCTURED_OUTPUT_PROMPT not in step:
+        if not self._structured_prompt_injected(step):
             logger.info(
-                "[ThinkingPhaseVerifier] </think> present but structured "
-                "prompt missing -> injecting it"
+                "[Maze Phase 2a] </think> detected. "
+                "Injecting structured step format."
             )
-            structured_prompt_text = "\n" + STRUCTURED_OUTPUT_PROMPT + "\n"
             if not event.is_set():
                 event_info["generated_text"] = step
-                event_info["feedback"] = structured_prompt_text
+                event_info["feedback"] = self._structured_prompt
                 event_info["correction_index"] = token_index
                 event_info["phase"] = "inject_structured_prompt"
                 event.set()
-            return step, structured_prompt_text
+            return step, self._structured_prompt
 
         # ==================================================================
-        # CASE 2b: After </think> + structured prompt - standard verify
+        # CASE 2b: Structured prompt injected — verify output
         # ==================================================================
 
-        # ---- max-corrections guard ----
-        num_corrections = (
-            self._count_feedback_blocks(step)
-            + self._think_phase_corrections
-        )
+        num_corrections = self._count_feedback_blocks(step)
         if num_corrections >= self.max_corrections:
             fb = "\nthe answer is \\boxed{no solution}"
             if not event.is_set():
@@ -480,94 +1314,208 @@ async def verify(self, step: str, token_index: int, event, event_info):
                 event_info["feedback"] = fb
                 event_info["correction_index"] = token_index
                 event_info["errors"] = ["Max corrections reached"]
-                event_info["failed_step"] = None
+                event_info["phase"] = "standard_verify"
                 event.set()
             return step, fb
 
-        # ---- extract & verify step ----
-        step_num, parsed = self._extract_last_step_info(step)
-        if step_num is None or parsed is None or parsed.get('available_numbers') is None:
-            return step, None
+        think_end_pos = step.find(self.answer_start_token) + len(self.answer_start_token)
+        text_after_think = step[think_end_pos:]
 
-        current_available = self._get_current_available(step)
-        is_valid, errors, new_available = verify_step(
-            parsed, current_available, self.original_numbers, step_num
-        )
+        # Strip the injected <format>...</format> template — only look at
+        # actual model output starting from the last ">>> LOCATE START AND EXIT:" marker.
+        last_marker_pos = text_after_think.rfind(self._structured_marker)
+        if last_marker_pos >= 0:
+            text_after_think = text_after_think[last_marker_pos:]
 
-        if is_valid:
+        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
+        last_feedback_end = 0
+        for match in feedback_pattern.finditer(text_after_think):
+            last_feedback_end = match.end()
+        recent_text = text_after_think[last_feedback_end:]
+
+        # --- Verify LOCATE section ---
+        locate_match = re.search(r'LOCATE START AND EXIT', recent_text, re.IGNORECASE)
+        if locate_match:
+            step1_start = re.search(r'>>>\s*STEP\s+1', recent_text, re.IGNORECASE)
+            if step1_start or '\\boxed' in recent_text:
+                if step1_start:
+                    locate_text = recent_text[locate_match.start():step1_start.start()]
+                else:
+                    locate_text = recent_text[locate_match.start():]
+                is_valid, loc_errors = verify_locate_section(
+                    locate_text, self.start_pos, self.exit_pos
+                )
+                if not is_valid:
+                    feedback = format_locate_feedback(loc_errors)
+                    if not event.is_set():
+                        event_info["generated_text"] = step
+                        event_info["feedback"] = feedback
+                        event_info["correction_index"] = token_index
+                        event_info["errors"] = loc_errors
+                        event_info["phase"] = "standard_verify"
+                        event.set()
+                    return step, feedback
+
+        # --- Verify structured steps ---
+        if self.question_type in ("right_turns", "total_turns"):
+            step_pattern = re.compile(
+                r'(>>>\s*STEP\s+(\d+):\s*Move\s+\w+\s+from\s+\([^)]+\)\s+to\s+\([^)]+\).*?'
+                r'Running count:[^\n]+)',
+                re.IGNORECASE | re.DOTALL
+            )
+            # Find steps in recent_text (after last feedback) to know what to verify
+            recent_step_matches = list(step_pattern.finditer(recent_text))
+
+            if recent_step_matches:
+                last_match = recent_step_matches[-1]
+                last_step_text = last_match.group(0)
+                last_step_num = int(last_match.group(2))
+                parsed = parse_maze_step(last_step_text)
+
+                if parsed:
+                    # For state reconstruction, gather ALL steps from the
+                    # full text (not just recent_text).  When a step number
+                    # appears multiple times (original + corrections), only
+                    # the LAST occurrence before the target step is used.
+                    all_full_matches = list(step_pattern.finditer(text_after_think))
+                    state = self._get_state_before_step_phase2(
+                        text_after_think, last_step_num, all_full_matches
+                    )
+
+                    is_valid, errors, new_state = verify_maze_step(
+                        step=parsed,
+                        grid=self.grid,
+                        expected_from_pos=state['position'],
+                        prev_direction=state['direction'],
+                        expected_right_count=state['right_count'],
+                        expected_left_count=state['left_count'],
+                        expected_total_count=state['total_count'],
+                    )
+
+                    if not is_valid:
+                        feedback = format_maze_feedback(errors, last_step_num)
+                        if not event.is_set():
+                            event_info["generated_text"] = step
+                            event_info["feedback"] = feedback
+                            event_info["correction_index"] = token_index
+                            event_info["errors"] = errors
+                            event_info["phase"] = "standard_verify"
+                            event.set()
+                        return step, feedback
+
+        # --- Check for boxed answer ---
+        boxed_answer = self._extract_boxed_answer(recent_text)
+        if boxed_answer is not None:
+            logger.info(f"[Maze Phase 2b] Extracted boxed answer: {boxed_answer}")
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = ""
+                event_info["correction_index"] = token_index
+                event_info["phase"] = "final_answer_correct"
+                event.set()
             return step, None
 
-        # ---- step has errors -> standard feedback ----
-        logger.info(f"[ThinkingPhaseVerifier] Step {step_num} FAILED: {errors}")
-        feedback = format_feedback(errors, step_num, current_available)
-        if not event.is_set():
-            event_info["generated_text"] = step
-            event_info["feedback"] = feedback
-            event_info["correction_index"] = token_index
-            event_info["errors"] = errors
-            event_info["failed_step"] = step_num
-            event_info["phase"] = "standard_verify"
-            event.set()
-        return step, feedback
+        return step, None
 
     # ------------------------------------------------------------------
-    #  fix
+    #  _get_state_before_step_phase2 – reconstruct state for Phase 2
     # ------------------------------------------------------------------
-    async def fix(self, generated_text: str, event_info: dict, fix_method=None):
+    def _get_state_before_step_phase2(self, text: str, target_step_num: int,
+                                       all_step_matches: list) -> dict:
+        """Reconstruct state before a given step from Phase 2 structured output.
+        
+        When a step number appears multiple times (original + corrections after
+        verifier feedback), only the LAST occurrence of each step number is used,
+        so that corrected steps override earlier invalid ones.
         """
-        Applies the appropriate fix depending on the phase:
-
-        inject_think_end
-            Append ``</think>`` + structured output prompt so the model
-            regenerates the steps naturally.
+        state = {
+            'position': self.start_pos,
+            'direction': Direction.NONE,
+            'right_count': 0,
+            'left_count': 0,
+            'total_count': 0,
+        }
+
+        # Collect the last occurrence of each step number before the target
+        last_by_num = {}
+        for match in all_step_matches:
+            step_num = int(match.group(2))
+            if step_num >= target_step_num:
+                continue
+            last_by_num[step_num] = match  # later occurrences overwrite earlier
+
+        # Replay in step-number order
+        for step_num in sorted(last_by_num.keys()):
+            parsed = parse_maze_step(last_by_num[step_num].group(0))
+            if not parsed:
+                continue
+
+            direction = parsed['direction']
+            to_pos = parsed['to_pos']
+
+            turn_type = get_expected_turn_type(state['direction'], direction)
+            if turn_type == 'RIGHT_TURN':
+                state['right_count'] += 1
+                state['total_count'] += 1
+            elif turn_type == 'LEFT_TURN':
+                state['left_count'] += 1
+                state['total_count'] += 1
+
+            state['position'] = to_pos
+            state['direction'] = direction
+
+        return state
 
-        rollback_to_thinking
-            Strip everything from the inject point, append ``Wait ...``
-            feedback inside the thinking trace.
-
-        inject_structured_prompt
-            Append the structured output prompt after a natural
-            ``</think>`` (no rollback needed).
-
-        standard_verify
-            Append ``[VERIFIER FEEDBACK ...]`` (same as
-            StepVerifierGame24Monitor).
-        """
+    # ------------------------------------------------------------------
+    #  fix
+    # ------------------------------------------------------------------
+    async def fix(self, generated_text: str, event_info: dict, fix_method=None):
+        """Apply the appropriate fix depending on the phase."""
         phase = event_info.get("phase", "standard_verify")
 
-        if phase == "inject_think_end":
+        if phase == "rollback_to_thinking":
+            base_text = event_info["generated_text"]
+            result = base_text.rstrip() + event_info["feedback"]
             logger.info(
-                "[ThinkingPhaseVerifier] fix(): injecting </think> + "
-                "structured prompt"
-            )
-            return (
-                event_info["generated_text"]
-                + "\n" + self.answer_start_token + "\n"
-                + STRUCTURED_OUTPUT_PROMPT + "\n"
+                f"[Maze fix] Phase: rollback_to_thinking\n"
+                f"  -> Appended error feedback into <think> trace.\n"
+                f"  -> Think-phase corrections: {self._think_phase_corrections}/{self.max_corrections}"
             )
+            return result
 
-        if phase == "rollback_to_thinking":
-            logger.info("[ThinkingPhaseVerifier] fix(): rolling back into thinking")
-
+        if phase == "early_stop_answer":
             base_text = event_info["generated_text"]
             result = base_text.rstrip() + event_info["feedback"]
-
-            # Reset thinking-phase state for the next cycle
-            self._think_phase_corrections += 1
-
+            counts = event_info.get("verified_counts", {})
             logger.info(
-                f"[ThinkingPhaseVerifier] Rolled back. "
-                f"Think-phase corrections: {self._think_phase_corrections}/{self.max_corrections}"
+                f"[Maze fix] Phase: early_stop_answer\n"
+                f"  -> Path verified: {counts.get('steps', '?')} steps, "
+                f"R={counts.get('right', '?')}, L={counts.get('left', '?')}, "
+                f"T={counts.get('total', '?')}\n"
+                f"  -> Injecting early-stop + </think> + structured format."
             )
             return result
 
         if phase == "inject_structured_prompt":
             logger.info(
-                "[ThinkingPhaseVerifier] fix(): appending structured "
-                "output prompt after natural </think>"
+                "[Maze fix] Phase: inject_structured_prompt\n"
+                "  -> Appending structured step format after </think>."
             )
             return event_info["generated_text"] + event_info["feedback"]
 
+        if phase == "final_answer_correct":
+            logger.info(
+                f"[Maze fix] Phase: final_answer_correct\n"
+                f"  -> Stopping generation."
+            )
+            return event_info["generated_text"]
+
         # standard_verify
-        logger.info("[ThinkingPhaseVerifier] fix(): standard step feedback")
+        errors = event_info.get("errors", [])
+        error_summary = "; ".join(errors) if errors else "unknown"
+        logger.info(
+            f"[Maze fix] Phase: standard_verify\n"
+            f"  -> Error: {error_summary}\n"
+            f"  -> Appending [VERIFIER FEEDBACK] so model retries."
+        )
         return event_info["generated_text"] + event_info["feedback"]
diff --git a/interwhen/utils/maze_verifier.py b/interwhen/utils/maze_verifier.py
index 19226c90..438e1724 100644
--- a/interwhen/utils/maze_verifier.py
+++ b/interwhen/utils/maze_verifier.py
@@ -46,11 +46,26 @@ class Direction(Enum):
 }
 
 
+# Map alternative direction names (cardinal) to the canonical enum names
+_DIRECTION_ALIASES = {
+    'NORTH': 'UP',
+    'SOUTH': 'DOWN',
+    'EAST': 'RIGHT',
+    'WEST': 'LEFT',
+}
+
+
 def parse_direction(dir_str: str) -> Direction:
-    """Parse direction string to Direction enum."""
+    """Parse direction string to Direction enum.
+
+    Accepts canonical names (UP/DOWN/LEFT/RIGHT) **and** cardinal names
+    (NORTH/SOUTH/EAST/WEST).
+    """
     dir_str = dir_str.strip().upper()
     if dir_str in ['—', '-', 'NONE', '']:
         return Direction.NONE
+    # Resolve cardinal aliases
+    dir_str = _DIRECTION_ALIASES.get(dir_str, dir_str)
     try:
         return Direction[dir_str]
     except KeyError:
@@ -68,6 +83,35 @@ def get_expected_turn_type(prev_dir: Direction, curr_dir: Direction) -> str:
     return 'UNKNOWN'
 
 
+# Map common turn type variations to canonical names
+_TURN_TYPE_ALIASES = {
+    'RIGHT': 'RIGHT_TURN',
+    'RIGHT TURN': 'RIGHT_TURN',
+    'RIGHT_TURN': 'RIGHT_TURN',
+    'RIGHTTURN': 'RIGHT_TURN',
+    'LEFT': 'LEFT_TURN',
+    'LEFT TURN': 'LEFT_TURN',
+    'LEFT_TURN': 'LEFT_TURN',
+    'LEFTTURN': 'LEFT_TURN',
+    'STRAIGHT': 'STRAIGHT',
+    'NONE': 'STRAIGHT',
+    'NO TURN': 'STRAIGHT',
+    'NO_TURN': 'STRAIGHT',
+    'NOTURN': 'STRAIGHT',
+}
+
+
+def normalize_turn_type(turn_str: str) -> str:
+    """Normalize a claimed turn type string to canonical form.
+
+    Accepts common variations such as ``RIGHT``, ``RIGHT TURN``,
+    ``RIGHT_TURN``, ``RIGHTTURN`` (case-insensitive) and maps them to
+    the canonical ``RIGHT_TURN`` / ``LEFT_TURN`` / ``STRAIGHT``.
+    """
+    turn_str = turn_str.strip().upper()
+    return _TURN_TYPE_ALIASES.get(turn_str, turn_str)
+
+
 def parse_maze_from_prompt(prompt: str) -> Tuple[List[List[str]], Optional[Tuple[int, int]], Optional[Tuple[int, int]]]:
     """
     Parse maze from prompt. Returns (grid, start_pos, exit_pos).
@@ -162,10 +206,16 @@ def parse_maze_step(step_text: str) -> Optional[Dict[str, Any]]:
     else:
         result['claimed_curr_dir'] = None
     
-    # Extract turn type
-    turn_match = re.search(r'Turn type:\s*(\S+)', step_text)
+    # Extract turn type (handle multi-word like 'RIGHT TURN', 'LEFT_TURN', etc.)
+    # Strip parenthetical comments like 'RIGHT (DOWN → LEFT is a RIGHT turn)'
+    turn_match = re.search(r'Turn type:\s*(.+)', step_text)
     if turn_match:
-        result['claimed_turn'] = turn_match.group(1).upper()
+        turn_raw = turn_match.group(1).strip()
+        # Remove parenthetical comments: "RIGHT (DOWN → LEFT ...)" → "RIGHT"
+        turn_raw = re.sub(r'\s*\(.*', '', turn_raw)
+        # Also strip surrounding whitespace and any trailing colon
+        turn_raw = turn_raw.strip().rstrip(':')
+        result['claimed_turn'] = normalize_turn_type(turn_raw)
     else:
         result['claimed_turn'] = None
     

From 2477c522f1cc384a8aab4540eb3b7ac55b521db3 Mon Sep 17 00:00:00 2001
From: root
 <root@hp-reasoning-t-vishakbhat-interwhen-8gpus-trainer-0.hp-reasoning-t-vishakbhat-interwhen-8gpus.bonete11.svc.cluster.local>
Date: Thu, 19 Feb 2026 06:22:26 +0000
Subject: [PATCH 03/17] Fix maze verifier

---
 interwhen/monitors/thinkingPhaseVerifier.py | 95 ++++++++++++++++++++-
 interwhen/utils/maze_verifier.py            | 49 +++++++++--
 2 files changed, 137 insertions(+), 7 deletions(-)

diff --git a/interwhen/monitors/thinkingPhaseVerifier.py b/interwhen/monitors/thinkingPhaseVerifier.py
index c1117374..a1b9c8d6 100644
--- a/interwhen/monitors/thinkingPhaseVerifier.py
+++ b/interwhen/monitors/thinkingPhaseVerifier.py
@@ -79,7 +79,7 @@
     Direction, parse_direction, get_expected_turn_type,
     parse_maze_from_prompt, parse_maze_step, verify_maze_step,
     verify_locate_section, format_maze_feedback, format_locate_feedback,
-    DIRECTION_DELTAS,
+    DIRECTION_DELTAS, compute_relative_direction,
 )
 
 logger = logging.getLogger(__name__)
@@ -896,6 +896,82 @@ def detect_question_type(prompt: str) -> str:
             return "right_turns"
         return "relative_position"
 
+    def _verify_relative_position_answer(self, boxed_answer: str) -> Tuple[bool, Optional[str]]:
+        """Verify a relative-position boxed answer (A=Yes / B=No).
+
+        Parses the question from ``self.prompt`` to determine the asked
+        direction, computes the true relative direction of E from S,
+        and checks whether the model's Yes/No answer is correct.
+
+        Returns ``(is_correct, feedback_or_None)``.
+        """
+        # Map boxed letter → Yes / No
+        answer_map = {"A": "Yes", "B": "No"}
+        model_yn = answer_map.get(boxed_answer.strip().upper())
+        if model_yn is None:
+            # Not A or B – can't verify
+            return True, None
+
+        # --- Parse the asked direction from the prompt ---
+        # Patterns: "directly to the left of the starting point (S)"
+        #           "directly below the starting point (S)"
+        #           "to the top right of the starting point (S)"
+        m = re.search(
+            r'Is the exit \(E\)\s+(.*?)\s+(?:of\s+)?the starting point \(S\)',
+            self.prompt, re.IGNORECASE,
+        )
+        if not m:
+            return True, None  # can't parse question, skip verification
+
+        asked_raw = m.group(1).strip().lower()
+        # Remove trailing comma and extra clauses like ", with no ..."
+        asked_raw = re.sub(r',.*', '', asked_raw).strip()
+
+        # --- Compute actual relative direction ---
+        actual = compute_relative_direction(self.start_pos, self.exit_pos)
+
+        # --- Determine expected Yes / No ---
+        # "directly to the left … with no vertical displacement"
+        #  → same row, E col < S col  → actual in {"west"}
+        # "directly below … with no horizontal displacement"
+        #  → same col, E row > S row  → actual in {"south"}
+        # "to the top right" → E north-east of S → actual == "northeast"
+        direction_keywords = {
+            "directly to the left":   {"west"},
+            "directly to the right":  {"east"},
+            "directly above":         {"north"},
+            "directly below":         {"south"},
+            "to the top left":        {"northwest"},
+            "to the top right":       {"northeast"},
+            "to the bottom left":     {"southwest"},
+            "to the bottom right":    {"southeast"},
+        }
+
+        expected_dirs = direction_keywords.get(asked_raw)
+        if expected_dirs is None:
+            return True, None  # unrecognised pattern, skip
+
+        expected_yn = "Yes" if actual in expected_dirs else "No"
+
+        if model_yn == expected_yn:
+            return True, None
+
+        # --- Build feedback ---
+        sr, sc = self.start_pos
+        er, ec = self.exit_pos
+        feedback = (
+            f"\n\n[VERIFIER FEEDBACK for relative position:\n"
+            f"  ✗ Your answer {boxed_answer} ({model_yn}) is incorrect.\n"
+            f"  S is at row={sr}, col={sc}. E is at row={er}, col={ec}.\n"
+            f"  Row difference (E-S): {er - sr} ({'same row' if er == sr else ('E is below S' if er > sr else 'E is above S')}).\n"
+            f"  Col difference (E-S): {ec - sc} ({'same col' if ec == sc else ('E is right of S' if ec > sc else 'E is left of S')}).\n"
+            f"  Therefore E is {actual} of S → the correct answer to "
+            f"\"{asked_raw}\" is {expected_yn}.\n"
+            f"  Please output \\boxed{{{('A' if expected_yn == 'Yes' else 'B')}}} "
+            f"for {expected_yn}.]\n\n"
+        )
+        return False, feedback
+
     # ------------------------------------------------------------------
     #  _parse_steps_from_text – parse structured steps from side-stream
     # ------------------------------------------------------------------
@@ -1407,6 +1483,23 @@ async def verify(self, step: str, token_index: int, event, event_info):
         boxed_answer = self._extract_boxed_answer(recent_text)
         if boxed_answer is not None:
             logger.info(f"[Maze Phase 2b] Extracted boxed answer: {boxed_answer}")
+
+            # For relative_position questions, verify the Yes/No answer
+            if self.question_type == "relative_position":
+                is_correct, rp_feedback = self._verify_relative_position_answer(boxed_answer)
+                if not is_correct and rp_feedback:
+                    logger.info(
+                        f"[Maze Phase 2b] Relative position answer '{boxed_answer}' is INCORRECT."
+                    )
+                    if not event.is_set():
+                        event_info["generated_text"] = step
+                        event_info["feedback"] = rp_feedback
+                        event_info["correction_index"] = token_index
+                        event_info["errors"] = [f"Wrong relative position answer: {boxed_answer}"]
+                        event_info["phase"] = "standard_verify"
+                        event.set()
+                    return step, rp_feedback
+
             if not event.is_set():
                 event_info["generated_text"] = step
                 event_info["feedback"] = ""
diff --git a/interwhen/utils/maze_verifier.py b/interwhen/utils/maze_verifier.py
index 438e1724..c0873295 100644
--- a/interwhen/utils/maze_verifier.py
+++ b/interwhen/utils/maze_verifier.py
@@ -54,6 +54,24 @@ class Direction(Enum):
     'WEST': 'LEFT',
 }
 
+# Reverse mapping: Direction member -> cardinal name (for feedback messages)
+_CARDINAL_NAMES = {
+    Direction.UP: 'NORTH',
+    Direction.DOWN: 'SOUTH',
+    Direction.LEFT: 'WEST',
+    Direction.RIGHT: 'EAST',
+    Direction.NONE: 'NONE',
+}
+
+
+def cardinal_name(d: Direction) -> str:
+    """Return the cardinal compass name for a Direction enum value.
+
+    Used in feedback messages so that the model (which often thinks in
+    NORTH/SOUTH/EAST/WEST terms) can understand corrections.
+    """
+    return _CARDINAL_NAMES.get(d, d.name)
+
 
 def parse_direction(dir_str: str) -> Direction:
     """Parse direction string to Direction enum.
@@ -289,7 +307,10 @@ def verify_maze_step(
     if expected_delta:
         actual_delta = (to_pos[0] - from_pos[0], to_pos[1] - from_pos[1])
         if actual_delta != expected_delta:
-            errors.append(f"Move {direction.name} doesn't match delta {actual_delta}, expected {expected_delta}")
+            errors.append(
+                f"Move {cardinal_name(direction)} from {from_pos} to {to_pos} has delta {actual_delta}, "
+                f"but {cardinal_name(direction)} should have delta {expected_delta} (row_change, col_change)"
+            )
     
     # 3. Verify to_pos is walkable (not a wall)
     if 0 <= to_pos[0] < len(grid) and 0 <= to_pos[1] < len(grid[0]):
@@ -306,7 +327,18 @@ def verify_maze_step(
     # 5. Verify turn type
     expected_turn = get_expected_turn_type(prev_direction, direction)
     if claimed_turn is not None and claimed_turn != expected_turn:
-        errors.append(f"Turn type {claimed_turn} should be {expected_turn} (prev={prev_direction.name}, curr={direction.name})")
+        prev_card = cardinal_name(prev_direction)
+        curr_card = cardinal_name(direction)
+        if expected_turn == 'RIGHT_TURN':
+            clock_desc = "clockwise (RIGHT turn)"
+        elif expected_turn == 'LEFT_TURN':
+            clock_desc = "counterclockwise (LEFT turn)"
+        else:
+            clock_desc = "no turn (STRAIGHT)"
+        errors.append(
+            f"Turn type {claimed_turn} should be {expected_turn}. "
+            f"Going from {prev_card} to {curr_card} is a {clock_desc} rotation."
+        )
     
     # 6. Calculate expected counts after this step
     new_right = expected_right_count
@@ -322,11 +354,11 @@ def verify_maze_step(
     
     # 7. Verify running counts
     if claimed_right is not None and claimed_right != new_right:
-        errors.append(f"Right count {claimed_right} should be {new_right}")
+        errors.append(f"Right turn count {claimed_right} should be {new_right}")
     if claimed_left is not None and claimed_left != new_left:
-        errors.append(f"Left count {claimed_left} should be {new_left}")
+        errors.append(f"Left turn count {claimed_left} should be {new_left}")
     if claimed_total is not None and claimed_total != new_total:
-        errors.append(f"Total count {claimed_total} should be {new_total}")
+        errors.append(f"Total turn count {claimed_total} should be {new_total}")
     
     # Update state for next step
     state['new_pos'] = to_pos
@@ -371,7 +403,12 @@ def format_maze_feedback(errors: List[str], step_num: int) -> str:
     feedback = f"\n\n[VERIFIER FEEDBACK for Step {step_num}:\n"
     for err in errors:
         feedback += f"  ✗ {err}\n"
-    feedback += "Please correct this step and continue.]\n\n"
+    feedback += (
+        "IMPORTANT: Clockwise on a compass is NORTH→EAST→SOUTH→WEST→NORTH. "
+        "A RIGHT turn = 90° clockwise; a LEFT turn = 90° counterclockwise. "
+        "For example: SOUTH→WEST is RIGHT (clockwise), SOUTH→EAST is LEFT (counterclockwise). "
+        "Please correct this step and continue.]\n\n"
+    )
     return feedback
 
 

From 4be7d407306d88e3587ac99b6c264361cb5489a0 Mon Sep 17 00:00:00 2001
From: root
 <root@hp-reasoning-t-vishakbhat-interwhen-8gpus-trainer-0.hp-reasoning-t-vishakbhat-interwhen-8gpus.bonete11.svc.cluster.local>
Date: Thu, 19 Feb 2026 06:41:52 +0000
Subject: [PATCH 04/17] Improve RP feedback: clarify compass direction vs
 adjacency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The model was interpreting 'directly to the left' as 'immediately
adjacent', arguing with the verifier across all 5 correction attempts.
The dataset defines it as general compass direction (west = same row,
lower column, regardless of distance/walls).

Updated feedback to explicitly state:
- This is about GENERAL COMPASS DIRECTION, not adjacency
- Do NOT consider adjacency or walls
- Just compare row/col coordinates
- This is the verified correct answer — do not argue

Result: 3/3 previously-failing RP examples now correct on first correction.
---
 interwhen/monitors/thinkingPhaseVerifier.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/interwhen/monitors/thinkingPhaseVerifier.py b/interwhen/monitors/thinkingPhaseVerifier.py
index a1b9c8d6..6e09bc69 100644
--- a/interwhen/monitors/thinkingPhaseVerifier.py
+++ b/interwhen/monitors/thinkingPhaseVerifier.py
@@ -959,16 +959,23 @@ def _verify_relative_position_answer(self, boxed_answer: str) -> Tuple[bool, Opt
         # --- Build feedback ---
         sr, sc = self.start_pos
         er, ec = self.exit_pos
+        correct_letter = 'A' if expected_yn == 'Yes' else 'B'
         feedback = (
             f"\n\n[VERIFIER FEEDBACK for relative position:\n"
             f"  ✗ Your answer {boxed_answer} ({model_yn}) is incorrect.\n"
+            f"  IMPORTANT: In this task, \"{asked_raw}\" means the GENERAL "
+            f"COMPASS DIRECTION, NOT immediate adjacency. It asks whether E "
+            f"is in the {actual} direction from S, regardless of distance or "
+            f"walls between them.\n"
             f"  S is at row={sr}, col={sc}. E is at row={er}, col={ec}.\n"
             f"  Row difference (E-S): {er - sr} ({'same row' if er == sr else ('E is below S' if er > sr else 'E is above S')}).\n"
             f"  Col difference (E-S): {ec - sc} ({'same col' if ec == sc else ('E is right of S' if ec > sc else 'E is left of S')}).\n"
             f"  Therefore E is {actual} of S → the correct answer to "
             f"\"{asked_raw}\" is {expected_yn}.\n"
-            f"  Please output \\boxed{{{('A' if expected_yn == 'Yes' else 'B')}}} "
-            f"for {expected_yn}.]\n\n"
+            f"  Do NOT consider adjacency or walls. Just compare the row/col "
+            f"coordinates of S and E.\n"
+            f"  Output \\boxed{{{correct_letter}}} for {expected_yn}. "
+            f"This is the verified correct answer — do not argue.]\n\n"
         )
         return False, feedback
 

From 658de1b498cccae6d5ddfa6e44fa5d599efa52f2 Mon Sep 17 00:00:00 2001
From: root
 <root@hp-reasoning-t-vishakbhat-interwhen-8gpus-trainer-0.hp-reasoning-t-vishakbhat-interwhen-8gpus.bonete11.svc.cluster.local>
Date: Thu, 19 Feb 2026 10:00:05 +0000
Subject: [PATCH 05/17] Fix spatialmap side-stream verification: update prompt
 and regex
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Rewrite thinking-phase prompt to instruct model to use full entity
  names and full direction words (no abbreviations) in the exact
  'X is to the [direction] of Y' format
- Pre-fill STEP 1 from parsed relations so model jumps to STEP 2
- Add abbreviation expansion (NE/NW/SE/SW → full words) in
  parse_directional_claims_from_text before regex matching
- Strip square brackets around entity names ([Foo] → Foo)
- Verified claims now 11-17 per example (was 0 before)
---
 .../EAT_10examples_results.txt                |   24 +
 .../EAT_3examples_results.txt                 |   24 +
 .../Reasoning_output/reason_3013.txt          | 1521 +++++++++++++++++
 .../Reasoning_output/reason_3016.txt          |  582 +++++++
 .../Reasoning_output/reason_3017.txt          |  760 ++++++++
 .../Reasoning_output/reason_3036.txt          |  579 +++++++
 .../Reasoning_output/reason_3047.txt          |  598 +++++++
 .../Reasoning_output/reason_4005.txt          |  265 +++
 .../Reasoning_output/reason_4007.txt          |  281 +++
 .../Reasoning_output/reason_4011.txt          |  240 +++
 .../Reasoning_output/reason_4012.txt          |  184 ++
 .../Reasoning_output/reason_4014.txt          |  380 ++++
 .../csv_saved/results_10examples.csv          |   11 +
 .../csv_saved/results_3examples.csv           |    4 +
 .../Qwen3-30B-A3B-Thinking-2507/summary.json  |   40 +
 .../EAT_2examples_results.txt                 |   29 +
 .../EAT_5examples_results.txt                 |   30 +
 .../Reasoning_output/reason_0.txt             |  229 +++
 .../Reasoning_output/reason_1124.txt          |  782 +++++++++
 .../Reasoning_output/reason_1499.txt          |  843 +++++++++
 .../Reasoning_output/reason_374.txt           |  296 ++++
 .../Reasoning_output/reason_749.txt           |  622 +++++++
 .../csv_saved/results_2examples.csv           |    3 +
 .../csv_saved/results_5examples.csv           |    6 +
 .../Qwen3-30B-A3B-Thinking-2507/summary.json  |   49 +
 .../TTSwithVerification/maze_stepverifier.py  |   26 +-
 .../spatialmap_stepverifier.py                |  318 ++--
 interwhen/monitors/__init__.py                |    2 +-
 interwhen/monitors/thinkingPhaseVerifier.py   |  595 ++++++-
 interwhen/utils/spatialmap_verifier.py        |   21 +-
 30 files changed, 9142 insertions(+), 202 deletions(-)
 create mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_10examples_results.txt
 create mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_3examples_results.txt
 create mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3013.txt
 create mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3016.txt
 create mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3017.txt
 create mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3036.txt
 create mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3047.txt
 create mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4005.txt
 create mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4007.txt
 create mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4011.txt
 create mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4012.txt
 create mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4014.txt
 create mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_10examples.csv
 create mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_3examples.csv
 create mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/summary.json
 create mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_2examples_results.txt
 create mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_5examples_results.txt
 create mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_0.txt
 create mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1124.txt
 create mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1499.txt
 create mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_374.txt
 create mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_749.txt
 create mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_2examples.csv
 create mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_5examples.csv
 create mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/summary.json

diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_10examples_results.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_10examples_results.txt
new file mode 100644
index 00000000..e3642117
--- /dev/null
+++ b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_10examples_results.txt
@@ -0,0 +1,24 @@
+Maze Step Verification Results
+==================================================
+
+Model: Qwen/Qwen3-30B-A3B-Thinking-2507
+Number of Examples: 10
+Max Corrections: 5
+Newline Threshold: 20
+Warmup: 0
+
+Results:
+---------
+Correct: 7/10
+Accuracy: 70.00%
+Attempted (produced \boxed answer): 7/10
+Soundness (correct/attempted): 100.00%
+
+Token Statistics:
+---------------------------
+Total Tokens: 66043
+Average Tokens: 6604.30
+Median Tokens: 5706
+Min Tokens: 3826
+Max Tokens: 15431
+Std Dev: 3255.06
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_3examples_results.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_3examples_results.txt
new file mode 100644
index 00000000..9f2bef8d
--- /dev/null
+++ b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_3examples_results.txt
@@ -0,0 +1,24 @@
+Maze Step Verification Results
+==================================================
+
+Model: Qwen/Qwen3-30B-A3B-Thinking-2507
+Number of Examples: 3
+Max Corrections: 5
+Newline Threshold: 20
+Warmup: 0
+
+Results:
+---------
+Correct: 3/3
+Accuracy: 100.00%
+Attempted (produced \boxed answer): 3/3
+Soundness (correct/attempted): 100.00%
+
+Token Statistics:
+---------------------------
+Total Tokens: 8168
+Average Tokens: 2722.67
+Median Tokens: 2896
+Min Tokens: 2152
+Max Tokens: 3120
+Std Dev: 413.75
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3013.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3013.txt
new file mode 100644
index 00000000..de214ba3
--- /dev/null
+++ b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3013.txt
@@ -0,0 +1,1521 @@
+Okay, let's try to figure out how many right turns are in the path marked by X from S to E. First, I need to visualize the maze properly. The ASCII maze given is:
+
+#######
+#XXS  #
+#X### #
+#X# # #
+#X# # #
+#E#   #
+#######
+
+Hmm, let me parse this. Each line represents a row. Let's count the rows and columns to get a better sense.
+
+Rows:
+1: #######
+2: #XXS  #
+3: #X### #
+4: #X# # #
+5: #X# # #
+6: #E#   #
+7: #######
+
+Wait, the maze is 7 rows high. Let's write them out with row numbers:
+
+Row 1: # # # # # # # (7 columns)
+Row 2: # X X S   # (assuming the spaces are there)
+Wait, the second line is "#XXS  #", so breaking it down:
+
+Row 2: positions 1-7:
+1: #
+2: X
+3: X
+4: S
+5: (space)
+6: (space)
+7: #
+
+Wait, but the maze might have the S in the middle. Let's list all rows properly.
+
+Let me write each row with their characters:
+
+Row 1: # # # # # # #
+Row 2: # X X S   #
+Row 3: # X # # # #
+Row 4: # X # # #
+Wait, no. Wait the third line is "#X### #", which would be:
+
+Row 3: # X # # # # ?
+
+Wait, the user wrote:
+
+#######
+#XXS  #
+#X### #
+#X# # #
+#X# # #
+#E#   #
+#######
+
+Wait, each line should have the same number of characters. Let's check:
+
+First line: 7 #s, so 7 characters.
+
+Second line: #XXS  # → that's 7 characters: #, X, X, S, space, space, #.
+
+Third line: #X### # → # X # # # # ? Wait, #X### # would be 7 characters: #, X, #, #, #, #, ?
+
+Wait, no. Let's count:
+
+# X # # # # → no, #X### # → positions 1: #, 2: X, 3: #, 4: #, 5: #, 6: #, 7: ? Wait, the line is written as "#X### #", which might be a typo. Wait, maybe it's #X### #, but that's 7 characters. Let's count:
+
+# X # # # # → that's 6, but the line should be 7. Hmm. Maybe the user made a typo, but assuming it's correctly formatted.
+
+Wait, maybe the third line is "#X### #" → so 7 characters: #, X, #, #, #, #, ? Wait, no, the last character would be #. Wait:
+
+Wait, the third line: #X### #. Let's split into characters:
+
+Character 1: #
+2: X
+3: #
+4: #
+5: #
+6: #
+7: ?
+
+Wait, no, the line is written as "#X### #", which would be 7 characters if the last # is the 7th. Wait, #X### # would be 7 characters: #, X, #, #, #, space, #? No, that's not right. Maybe the user missed a character. Wait, perhaps the maze is:
+
+Row 1: ####### (7)
+Row 2: #XXS  # (7)
+Row 3: #X### # (7)
+Row 4: #X# # # (7)
+Row 5: #X# # # (7)
+Row 6: #E#   # (7)
+Row 7: ####### (7)
+
+Wait, let's check each row's length:
+
+Row 2: #XXS  # → 7 characters (positions 1-7)
+Row 3: #X### # → Let's count: # X # # # # → but that's 6. Wait, maybe it's #X### # with a space? Hmm, maybe the third row is #X### # meaning # X # # # #, but that's six characters. Wait, maybe the user made a typo. Alternatively, perhaps it's #X### #, which would be seven characters: #, X, #, #, #, #, #? No, that would be #X#####. Hmm.
+
+Wait, maybe the ASCII art is:
+
+Row 1: #######
+Row 2: #XXS  #
+Row 3: #X### #
+Row 4: #X# # #
+Row 5: #X# # #
+Row 6: #E#   #
+Row 7: #######
+
+Ah, okay, maybe the third row is #X### #, but written as #X### #, which would be 7 characters. Let's see:
+
+Row 3: # X # # # # → but no, #X### # → positions:
+
+1: #
+2: X
+3: #
+4: #
+5: #
+6: #
+7: # ?
+
+Wait, that would make it #X#####. But the user wrote #X### #, which might have a space in there. Wait, maybe it's a typo and it's supposed to be #X### # with a space, but that's confusing. Let's assume that the maze is correctly formatted as per the user's input. Let's try to reconstruct it.
+
+Let me write all rows as given:
+
+Line 1: ####### → 7 #
+Line 2: #XXS  # → # X X S [space] [space] #
+Line 3: #X### # → # X # # # # [space] ? Wait, no. Wait, the user wrote "#X### #", which would be 7 characters: #, X, #, #, #, space, #? But that's a bit odd. Alternatively, maybe it's a typo and should be #X### #, meaning # X # # # # #, but that's 7 characters. Wait, #X### # would be:
+
+Character 1: #
+2: X
+3: #
+4: #
+5: #
+6: #
+7: # → but that's #X#####. But the user wrote "#X### #", so maybe there's a space. Let's think differently.
+
+Perhaps the maze is:
+
+Row 1: #######
+Row 2: #XXS  #
+Row 3: #X### #
+Row 4: #X# # #
+Row 5: #X# # #
+Row 6: #E#   #
+Row 7: #######
+
+Assuming each row has 7 characters. Let's count Row 3: #X### # → if it's 7 characters, then:
+
+1: #, 2: X, 3: #, 4: #, 5: #, 6: #, 7: #? No, that's #X#####. But the user wrote #X### #, which might have a space in position 6. So maybe:
+
+Row 3: # X # # # [space] # → but that would be 7 characters. Wait, #X### # would be:
+
+Position: 1: #, 2: X, 3: #, 4: #, 5: #, 6: [space], 7: #. So the third row has a space in position 6. Hmm, but that might not make sense. Alternatively, maybe the user made a typo and it's supposed to be #X### # as in # X # # # #, but missing a character. This is a bit confusing. Let's try to map it out.
+
+Let's create a grid. Let's index rows from top to bottom (row 1 to row 7), columns left to right (col 1 to col 7).
+
+Row 1: 1-7: all #
+Row 2: # X X S [space] [space] #
+So columns 1: #, 2: X, 3: X, 4: S, 5: space, 6: space, 7: #
+Row 3: # X # # # # # → but user wrote #X### #, so maybe columns 1: #, 2: X, 3: #, 4: #, 5: #, 6: #, 7: #? Not sure. Wait, if it's #X### #, then maybe columns 1: #, 2: X, 3: #, 4: #, 5: #, 6: space, 7: #. Let's assume that. So row 3: # X # # # [space] #.
+
+Row 4: #X# # # → columns: # X # space # space # ? Or # X # # space # ?
+
+Wait, the user wrote "#X# # #", which would be 7 characters. Let's split:
+
+# X # [space] # [space] # → but that's 7 characters. So positions:
+
+1: #, 2: X, 3: #, 4: space, 5: #, 6: space, 7: #.
+
+Similarly, row 5: same as row 4: #X# # #.
+
+Row 6: #E#   # → # E # [space] [space] [space] #.
+
+Okay, this is getting complicated. Maybe I should sketch it out.
+
+Let me try to write the grid:
+
+Row 1:  # # # # # # #
+Row 2:  # X X S   #
+Row 3:  # X # # # #
+Row 4:  # X # # #
+Row 5:  # X # # #
+Row 6:  # E #   #
+Row 7:  # # # # # # #
+
+Wait, no. Let's try to get each row's characters.
+
+Row 1: 7 #s.
+
+Row 2: # X X S [space] [space] # → so columns 1: #, 2:X, 3:X,4:S,5: ,6: ,7:#.
+
+Row 3: # X # # # # → but user wrote #X### #. If it's #X### #, perhaps it's # X # # # [space] #, but that's not matching. Alternatively, maybe it's a typo and should be #X###, but then it's 6 characters. Hmm.
+
+Wait, maybe the maze is:
+
+Row 1: #######
+Row 2: #XXS  #
+Row 3: #X### #
+Row 4: #X# # #
+Row 5: #X# # #
+Row 6: #E#   #
+Row 7: #######
+
+Assuming that each row has 7 characters. Let's check row 3: #X### # → if it's 7 characters, then:
+
+Characters: #, X, #, #, #, #, # → but that's #X#####. But user wrote #X### #, so maybe the sixth character is a space. So row 3: # X # # # [space] #.
+
+Similarly, row 4: #X# # # → # X # [space] # [space] #.
+
+Row 5: same as row 4.
+
+Row 6: #E#   # → # E # [space][space][space]#.
+
+Okay, let's proceed with that.
+
+Now, the path marked by X is the route from S to E. So S is at row 2, column 4 (assuming row 2, columns 1-7: # X X S ... so S is at (2,4)).
+
+We need to find the path from S (2,4) to E (row 6, column 2, since row 6 starts with # E # ... so E is at (6,2)).
+
+But the path is marked by X's. So the X's are the correct route. So we need to trace the X's from S to E.
+
+Wait, S is a separate symbol, not X. So the path starts at S, and the X's are the path. Wait, the problem says: "the path marked by X". So the X's are the route from S to E. So S is the start, and the X's lead to E. Wait, but S is not an X. So the path starts at S, then follows X's.
+
+Wait, looking at row 2: #XXS  #. So the S is at position (2,4), and to the left of S are two X's (positions 2 and 3). So the path from S would go left along the X's? But S is at (2,4), X's at (2,2), (2,3). So maybe the path is going left from S to the X's, but S is the starting point. Wait, but the problem states: "navigate from S to E by following the path marked by X". So the X's form the path from S to E. So perhaps S is connected to the X's. Let's see.
+
+Wait, row 2: columns 2 and 3 are X, column 4 is S. So S is to the right of the X's. So maybe the path starts at S (2,4), then moves left to (2,3) which is X, then left to (2,2) which is X. Then where does it go next?
+
+Row 3: column 2 is X. So (3,2) is X. So from (2,2), moving down to (3,2), which is X.
+
+Row 4: column 2 is X. (4,2) is X.
+
+Row 5: column 2 is X. (5,2) is X.
+
+Row 6: column 2 is E. Wait, row 6: #E#   #, so (6,2) is E. So the path goes down column 2 from row 2 down to row 6.
+
+Wait, but row 2, column 2 is X, row 3 column 2 is X, row 4 column 2 X, row 5 column 2 X, row 6 column 2 E. So the path is moving down column 2 from row 2 to row 6.
+
+Wait, but S is at (2,4). So to get to the X at (2,2), you have to move left from S (2,4) to (2,3), which is X, then left to (2,2), which is X. Then down to row 3, column 2 (X), then down to row 4, etc.
+
+So the path would be: S (2,4) → (2,3) X → (2,2) X → (3,2) X → (4,2) X → (5,2) X → E (6,2).
+
+Wait, but row 6, column 2 is E. So the path from S to E would go left two steps, then down four steps.
+
+But let's confirm the coordinates.
+
+Let's map all X positions:
+
+Row 2: columns 2,3 are X.
+
+Row 3: column 2 is X.
+
+Row 4: column 2 is X.
+
+Row 5: column 2 is X.
+
+Row 6: column 2 is E.
+
+So the path starts at S (2,4), moves left to (2,3) [X], left to (2,2) [X], then down to (3,2) [X], down to (4,2) [X], down to (5,2) [X], down to (6,2) [E].
+
+So the movement sequence:
+
+Start at S (2,4). First move: left to (2,3). Direction: left (from S, facing left? Or initial direction? Wait, movement direction: when moving from S to (2,3), direction is left. Then next move from (2,3) to (2,2) is also left. Then from (2,2) to (3,2) is down.
+
+Wait, the path is made of X's, so the X's are the path. So the path is S → X → X → X → X → X → E. So the X's are the path, with S being the start and E the end.
+
+So the actual path segments are:
+
+From S (2,4) to (2,3): move left.
+
+(2,3) to (2,2): move left.
+
+(2,2) to (3,2): move down.
+
+(3,2) to (4,2): move down.
+
+(4,2) to (5,2): move down.
+
+(5,2) to (6,2): move down.
+
+So the directions taken are: left, left, down, down, down, down.
+
+Now, to count right turns. A right turn is a 90-degree clockwise change from the previous direction.
+
+But we need to know the direction of movement at each step.
+
+Wait, I think I have the wrong positions. S position (2, 4) should be (1, 3); E position (6, 2) should be (5, 1). Let me re-examine the maze grid carefully to find S and E.
+Wait, rows are numbered 1 to 7 from top to bottom.
+
+Row 1: ####### (top wall)
+Row 2: #XXS  # → so the S is in row 2, column 4 (assuming columns 1-7 left to right).
+
+Row 6: #E#   # → E is in row 6, column 2.
+
+Now, the X's:
+
+Row 2: columns 2 and 3 are X.
+
+Row 3: column 2 is X.
+
+Row 4: column 2 is X.
+
+Row 5: column 2 is X.
+
+So the path from S (row 2, col 4) must go left to col 3 (X), left to col 2 (X), then down to row 3 col 2 (X), down to row 4 col 2, etc., until row 6 col 2 (E).
+
+So the movement steps:
+
+1. Start at S (2,4). First move: left to (2,3). Direction: left.
+
+2. Next move: left to (2,2). Direction still left.
+
+3. Next move: down to (3,2). Direction changes from left to down. Is that a right or left turn?
+
+Assuming that when moving left (west), a right turn would be to face north or south? Wait, direction changes.
+
+Let's define directions:
+
+- North: up (row decreases)
+- South: down (row increases)
+- East: right (column increases)
+- West: left (column decreases)
+
+Assuming the movement direction:
+
+At step 1: moving west (left) from S to (2,3).
+
+Step 2: still moving west to (2,2).
+
+Step 3: now moving south (down) to (3,2). So the previous direction was west, now changing to south.
+
+Wait, I think I have the wrong positions. S position (2, 4) should be (1, 3); E position (6, 2) should be (5, 1). Let me re-examine the maze grid carefully to find S and E.
+Wait, the user wrote:
+
+Row 2: #XXS  #
+So the S is the fourth character in row 2. Since row 2 starts with #, then X, X, S, then two spaces, then #.
+
+So columns:
+
+1: #
+2: X
+3: X
+4: S
+5: (space)
+6: (space)
+7: #
+
+So S is at row 2, column 4.
+
+Row 6: #E#   #
+Columns:
+
+1: #
+2: E
+3: #
+4: (space)
+5: (space)
+6: (space)
+7: #
+
+So E is at row 6, column 2.
+
+Now, the X's are at:
+
+Row 2: columns 2,3
+
+Row 3: column 2 (assuming row 3: #X### # → column 2 is X)
+
+Row 4: column 2 (row 4: #X# # # → column 2 is X)
+
+Row 5: column 2 (same as row 4)
+
+So the path from S (2,4) must go left along row 2 to column 3 (X), then left to column 2 (X), then down column 2 through rows 3,4,5, to row 6 (E).
+
+So the path coordinates:
+
+(2,4) → (2,3) → (2,2) → (3,2) → (4,2) → (5,2) → (6,2)
+
+Now, movement directions:
+
+From (2,4) to (2,3): west (left)
+
+From (2,3) to (2,2): west (left)
+
+From (2,2) to (3,2): south (down)
+
+Wait, I think I have the wrong positions. S position (2, 4) should be (1, 3); E position (6, 2) should be (5, 1). Let me re-examine the maze grid carefully to find S and E.
+Wait, no. Rows are numbered top to bottom. So row 1 is top, row 7 is bottom.
+
+So S is in row 2 (second row), column 4.
+
+E is in row 6 (sixth row), column 2.
+
+Now, the path:
+
+Start at S (row 2, col 4).
+
+Move west (left) to row 2, col 3 (X).
+
+Move west to row 2, col 2 (X).
+
+Move south (down) to row 3, col 2 (X).
+
+Move south to row 4, col 2 (X).
+
+Move south to row 5, col 2 (X).
+
+Move south to row 6, col 2 (E).
+
+So the directions of movement:
+
+1. West (from S to first X)
+2. West (to next X)
+3. South (down)
+4. South
+5. South
+6. South
+
+Now, we need to find right turns. A right turn is a 90-degree clockwise change from the previous direction.
+
+Let's list the directions in order:
+
+Step 1: West
+
+Step 2: West (same direction, no turn)
+
+Step 3: South (change from West to South)
+
+What's the turn direction here?
+
+If you're moving west (left), then turning to south (down). A right turn from west would be south (since west → right turn is south; left turn would be north).
+
+Wait, I think I have the wrong positions. S position (2, 4) should be (1, 3); E position (6, 2) should be (5, 1). Let me re-examine the maze grid carefully to find S and E.
+Wait, perhaps I'm confused about the direction of movement. Let's think in terms of compass directions.
+
+Assume facing north (up), right turn would be east, etc.
+
+But when moving along the path, the direction you're facing when moving between two points.
+
+Let's consider each movement segment:
+
+- From (2,4) to (2,3): moving west. So direction is west.
+
+- From (2,3) to (2,2): still west.
+
+- From (2,2) to (3,2): moving south. So direction changes from west to south.
+
+Now, changing direction from west to south: what's the turn?
+
+If you're moving west (left), then to turn south (down), which is a right turn. Because west → south is a clockwise 90-degree turn.
+
+West → South: right turn.
+
+Similarly, west → North would be left turn.
+
+So from west to south is a right turn.
+
+Then, from south to south (next moves), no turn.
+
+So the only turn in the path is at step 3: from west to south, which is a right turn.
+
+But wait, the path has two west movements followed by four south movements.
+
+So the direction changes only once: west → south, which is a right turn.
+
+Wait, but the question is asking for the number of right turns. So that would be 1. But 1 isn't one of the options. Options are A.0 B.2 C.5 D.9.
+
+Hmm, this suggests I made a mistake. Maybe the path is different.
+
+Wait, maybe the X's are not all along column 2. Let's re-examine the maze.
+
+Wait, row 3: #X### # → if it's # X # # # #, but perhaps row 3, column 3 is a wall. Wait, row 3: #X### #. If it's # X # # # #, then column 3 is #, so row 3, column 2 is X, and columns 3-6 are #.
+
+Wait, I think I have the wrong positions. S position (2, 4) should be (1, 3); E position (6, 2) should be (5, 1). Let me re-examine the maze grid carefully to find S and E.
+Wait, perhaps I misread the maze. Let's write it out in a grid:
+
+Row 1: # # # # # # #
+Row 2: # X X S   #
+Row 3: # X # # # #
+Row 4: # X # # #
+Row 5: # X # # #
+Row 6: # E #   #
+Row 7: # # # # # # #
+
+Wait, but row 3 as per user input is "#X### #", which would be # X # # # #, but maybe with a space. Let's assume row 3 is # X # # # #, so columns 1-7:
+
+1: #, 2: X, 3: #, 4: #, 5: #, 6: #, 7: #.
+
+But that would make row 3: #X#####.
+
+Row 4: "#X# # #" → # X # space # space #, so columns:
+
+1: #, 2: X, 3: #, 4: space, 5: #, 6: space, 7: #.
+
+Row 5: same as row 4.
+
+Row 6: # E # space space space #.
+
+So in row 3, column 2 is X, but column 3 is #, so it's a wall. So from row 2, column 2 (X), moving down to row 3, column 2 (X), then row 4 column 2 (X), row 5 column 2 (X), row 6 column 2 (E).
+
+But row 3, column 2 is X, and row 3, column 3 is #, so the path goes straight down column 2.
+
+So the path is as before.
+
+But according to this, there's only one right turn when changing from west to south.
+
+But the options don't have 1. So I must have made a mistake.
+
+Wait, perhaps the path is different. Maybe the X's are not all in column 2. Let's check again.
+
+Row 2: #XXS  # → X at 2,3.
+
+Row 3: #X### # → X at 2.
+
+Row 4: #X# # # → X at 2.
+
+Row 5: #X# # # → X at 2.
+
+Row 6: #E#   # → E at 2.
+
+So yes, path is column 2 from row 2 down to row 6.
+
+Wait, but S is at row 2, column 4. So to get to the X's, you move left twice to column 2, then down.
+
+But the path marked by X's starts at S? No, the X's are the path. So S is the start, and the X's lead to E. So the path is S → X → X → X → X → X → E.
+
+So the X's are the path, so the first X after S is (2,3), then (2,2), then (3,2), etc.
+
+So movement directions:
+
+S to (2,3): west.
+
+(2,3) to (2,2): west.
+
+(2,2) to (3,2): south.
+
+(3,2) to (4,2): south.
+
+(4,2) to (5,2): south.
+
+(5,2) to (6,2): south.
+
+So the direction changes only once: west to south. As this is a right turn, that's 1 right turn. But 1 isn't an option. So where's the mistake?
+
+Wait, maybe I have the direction of the turn wrong.
+
+When moving west (left), turning to south (down). Is that a right or left turn?
+
+Imagine facing west (left). A right turn would make you face south (down), and a left turn would make you face north (up). Yes, so west → south is a right turn.
+
+So one right turn. But options don't have that. So maybe the path is different.
+
+Wait, perhaps the maze is different. Let's look at row 4 and row 5. The user wrote "#X# # #". If row 4 is # X # # # #, but no, the user wrote "#X# # #", which might mean # X # [space] # [space] #.
+
+Wait, maybe there's a different path. Let's consider the entire maze.
+
+Let's draw the maze with rows and columns:
+
+Columns 1 to 7, rows 1 to 7.
+
+Row 1: 1 2 3 4 5 6 7
+        # # # # # # #
+
+Row 2: # X X S   #
+        1 2 3 4 5 6 7
+
+Row 3: # X # # # #
+        1 2 3 4 5 6 7 (assuming #X### # means columns 1:#, 2:X, 3:#,4:#,5:#,6:#,7:#)
+
+Row 4: # X #   #   #
+        Wait, user wrote #X# # # → so columns:
+
+1:#, 2:X, 3:#, 4:space, 5:#, 6:space, 7:#.
+
+Row 5: same as row 4.
+
+Row 6: # E #   #
+        1:#, 2:E, 3:#, 4:space, 5:space, 6:space, 7:#
+
+Row 7: all #.
+
+Now, the X's are at:
+
+Row 2: col 2,3
+
+Row 3: col 2
+
+Row 4: col 2
+
+Row 5: col 2
+
+So path is S (2,4) → (2,3)X → (2,2)X → (3,2)X → (4,2)X → (5,2)X → (6,2)E.
+
+But in row 4, column 2 is X, but row 4, column 3 is #, so can't go east. Similarly, row 3, column 2 is X, but column 3 is #, so path must go down.
+
+But what about row 3, column 4? It's #, so no.
+
+Another possibility: maybe the path isn't straight down column 2. Wait, but all X's are in column 2 except row 2.
+
+Wait, row 2 has X in columns 2 and 3, so maybe the path goes down from row 2, column 3?
+
+Wait, row 2, column 3 is X. If you go down from (2,3), what's below? Row 3, column 3 is # (from row 3: #X### #, so column 3 is #), so can't go down there. So the only way from row 2, column 3 is left to (2,2), then down.
+
+So the path must go left to column 2 first.
+
+So the path is as previously determined.
+
+Hmm. If there's only one right turn, but that's not an option, I must have misunderstood the maze.
+
+Wait, let's check row 3 again. If row 3 is #X### #, perhaps it's # X # # # [space] #, so column 6 is space. But row 3, column 6 is space. Then, maybe the path goes down from row 2, column 2 to row 3, column 2, then east? No, row 3, column 3 is #.
+
+Wait, maybe I'm missing some X's. Let's list all X's:
+
+Row 2: columns 2,3
+
+Row 3: column 2
+
+Row 4: column 2
+
+Row 5: column 2
+
+So total X's: 5 X's. S is separate, E is separate.
+
+Path: S → X → X → X → X → X → E. So six moves (five X's plus S and E).
+
+Wait, but the problem states "the path marked by X", so S and E are not X. So the path consists of X's connecting S to E. So the X's are the route, so S must be adjacent to an X.
+
+Yes, S is at (2,4), adjacent to X at (2,3).
+
+So the path is S -X-X-X-X-X-E.
+
+So the X's are five in number.
+
+Now, back to turns.
+
+Directions:
+
+Start at S (2,4), move west to (2,3) [X], direction west.
+
+Move west to (2,2) [X], direction west.
+
+Move south to (3,2) [X], direction south.
+
+Move south to (4,2), direction south.
+
+Move south to (5,2), direction south.
+
+Move south to (6,2) [E], direction south.
+
+So the only direction change is west to south, which is a right turn. So one right turn.
+
+But the options don't have 1. So either the maze is different, or I'm missing something.
+
+Wait, perhaps the initial direction is considered as facing south or something else. Wait, the problem says "movement is allowed in any of the four cardinal directions", but when you start at S, what direction are you facing?
+
+The problem doesn't specify the starting direction. However, to count turns, we need to know the direction of the first move.
+
+Assuming that the first move from S is west, as that's the only way to reach an X.
+
+So first movement direction: west.
+
+Second move: still west.
+
+Third move: south.
+
+So the turn from west to south is a right turn.
+
+But according to this, only one right turn. But options don't have 1. So perhaps I'm missing other turns.
+
+Wait, could there be more direction changes?
+
+Wait, after moving south, do we move east or west anywhere? In our path, no, it's all south.
+
+Wait, maybe the maze has more X's. Let's double-check the user's input.
+
+User wrote:
+
+#######
+#XXS  #
+#X### #
+#X# # #
+#X# # #
+#E#   #
+#######
+
+Let's count the characters per row:
+
+Row 1: 7 characters.
+
+Row 2: #XXS  # → # X X S [space] [space] # → 7.
+
+Row 3: #X### # → # X # # # # → wait, that's six characters. No, #X### # would be 7 if it's # X # # # [space] #. But user wrote #X### #, which might mean #X### followed by a space and #, but that's unclear.
+
+Alternatively, perhaps row 3 is #X### #, which is # X # # # #, but that's six characters. This is confusing.
+
+Another approach: look for the path marked by X's. The X's should form a continuous path from S to E.
+
+S is at row 2, column 4.
+
+E is at row 6, column 2.
+
+Let's assume the maze is:
+
+Row 1: #######
+
+Row 2: # X X S   #
+
+Row 3: # X # # # #
+
+Row 4: # X #   # #
+
+Row 5: # X #   # #
+
+Row 6: # E #     #
+
+Row 7: #######
+
+But this is speculative. Alternatively, perhaps row 3 has X at column 2, and then the path goes down, but row 4 and 5 have X at column 2, but also maybe other X's.
+
+Wait, another idea: perhaps the path goes from S left to X, then down, then right, then down, etc. But according to the maze as given, I don't see other X's.
+
+Wait, row 4: #X# # # → if that's # X # space # space #, then column 2 is X, column 5 is #, etc. So no eastward path.
+
+Wait, let's try to map all X positions:
+
+- Row 2, col 2: X
+
+- Row 2, col 3: X
+
+- Row 3, col 2: X
+
+- Row 4, col 2: X
+
+- Row 5, col 2: X
+
+That's all. So the path must be as previously determined.
+
+Hmm. Maybe the question counts the turn when leaving S. Wait, but S is not part of the X path. The X path starts at the first X.
+
+Wait, the problem says: "the path marked by X". So the path from S to E is along X's. So S is not on an X, but adjacent to an X. So the path is S → X → X → ... → X → E.
+
+So the first move is from S to X (direction west), then X to X (west), then X to X (south), etc.
+
+The turns happen between the X's. So the direction changes occur when moving from one X to the next.
+
+So:
+
+Between first X (2,3) and second X (2,2): direction west → west (no turn).
+
+Between second X (2,2) and third X (3,2): direction west → south (right turn).
+
+Between third X (3,2) and fourth X (4,2): south → south (no turn).
+
+And so on.
+
+So only one right turn. But this isn't an option. So I must have made a mistake.
+
+Wait, maybe the maze is different. Let's look for another possible path.
+
+What if row 3, column 2 is X, and row 3, column 4 is also X? But according to the user's input, row 3 is #X### #. If it's # X # # # #, then columns 3-6 are #, so no.
+
+Wait, maybe the maze has a different layout. Let's consider the following ASCII:
+
+Row 1: #######
+
+Row 2: #XXS  #
+
+Row 3: #X### #
+
+Row 4: #X# # #
+
+Row 5: #X# # #
+
+Row 6: #E#   #
+
+Row 7: #######
+
+Assuming row 3: #X### # → # X # # # # (7 characters).
+
+Row 4: #X# # # → # X # [space] # [space] #.
+
+So row 4, column 4 is space, column 5 is #, column 6 space.
+
+Row 5 same as row 4.
+
+Now, let's see if there's a different path.
+
+From S (row 2,4), move left to X (2,3), left to X (2,2). From (2,2), down to (3,2) X.
+
+From (3,2), can we move east? (3,3) is #, so no. Down to (4,2) X.
+
+(4,2) down to (5,2) X.
+
+(5,2) down to (6,2) E.
+
+Same path.
+
+Wait, unless in row 4, column 2 is X, and row 4, column 4 is a space, but no X there.
+
+Another idea: perhaps the path goes from S down? But row 3, column 4 is # (from row 3: #X### # → column 4 is #), so can't go down from S.
+
+So S can only move left.
+
+I'm stuck. Given the options, and if my analysis says 1, but that's not there, maybe I'm missing something.
+
+Wait, let's count again.
+
+Path:
+
+S (2,4) → (2,3) X (west)
+
+→ (2,2) X (west)
+
+→ (3,2) X (south)
+
+→ (4,2) X (south)
+
+→ (5,2) X (south)
+
+→ (6,2) E (south)
+
+Direction changes:
+
+West to west: no turn.
+
+West to south: right turn.
+
+South to south: no.
+
+South to south: no.
+
+South to south: no.
+
+So only one right turn.
+
+But options are A.0 B.2 C.5 D.9.
+
+Hmm. Maybe the question counts the turn when moving from the last X to E, but E is the end, so no.
+
+Wait, maybe the initial direction is considered. If you start at S, facing east (towards the X), but no, the first move is west.
+
+Wait, another possibility: the path is not straight. Maybe I misread the maze.
+
+Let's look at row 4 and row 5.
+
+Row 4: #X# # # → perhaps it's # X # space # space #, so at row 4, column 2 is X, column 4 is space, column 5 is #, etc.
+
+Row 5 same as row 4.
+
+Row 6: #E#   #.
+
+If the path goes down column 2 to row 5, then east to column 3? But column 3 at row 5 is # (row 5: #X# # # → column 3 is #), so no.
+
+Wait, row 4, column 4 is space. If there's an X at row 4, column 4, but according to the user input, row 4 is #X# # #, which would mean column 4 is space.
+
+No X there.
+
+Wait, unless the path goes down to row 5, then east, but no X's.
+
+I think the path is indeed as previously determined. So why isn't 1 an option?
+
+Wait, maybe the question says "the path marked by X", but the X's are not all the path. Wait, the problem states: "X marks the specific route you should follow to navigate from the starting point 'S' to the endpoint 'E'."
+
+So the X's are the route. So all X's should form the path. So the path is S connected to X's connected to E.
+
+Given that, and the X's are in column 2, row 2-5.
+
+Wait, another idea: perhaps the starting direction is south. No, the first move from S has to be west.
+
+Wait, maybe the turn count is different. Let's define directions numerically.
+
+Let's assign directions:
+
+0: North (up)
+
+1: East (right)
+
+2: South (down)
+
+3: West (left)
+
+A right turn would change direction by +1 (mod 4), left turn by -1.
+
+Starting direction: let's assume the first move is west (3).
+
+Next move is west (3), no turn.
+
+Next move is south (2). Change from 3 to 2. What's the change? 3 → 2 is a decrease of 1, which would be a left turn. Wait, this contradicts earlier thought.
+
+Wait, if direction is measured as:
+
+0: North
+
+1: East
+
+2: South
+
+3: West
+
+Then, from West (3), turning right (clockwise) would go to North (0), no.
+
+Wait, no. Clockwise from West is North? No.
+
+Wait, facing West (left), a right turn would face South (down), and a left turn would face North (up).
+
+So direction numbers:
+
+If facing West (3), right turn → South (2), which is a decrease of 1 (3 → 2).
+
+Left turn would be North (0), increase of 1 (3 → 0).
+
+So changing direction from West (3) to South (2) is a right turn (since 3 → 2 is a clockwise 90-degree turn).
+
+Yes. So that's a right turn.
+
+But according to the direction numbering, a right turn would be a decrease in the number.
+
+But regardless, it's a right turn.
+
+So only one right turn.
+
+But since this isn't an option, I must have misread the maze.
+
+Wait, let's consider the possibility that the maze is:
+
+Row 2: #XXS  # → S at (2,4)
+
+Row 3: #X# # # → but user wrote #X### #.
+
+Wait, maybe row 3 is #X# # #, which would be:
+
+Row 3: # X # [space] # [space] #.
+
+So columns:
+
+1: #, 2: X, 3: #, 4: space, 5: #, 6: space, 7: #.
+
+Then row 3, column 4 is space.
+
+Row 4 same as row 3.
+
+Now, if the path goes from S (2,4) left to (2,3), left to (2,2), down to (3,2), then east to (3,4), but (3,4) is space, but is there an X there? No, the X's are only in column 2.
+
+Hmm.
+
+Wait, another possibility: the X's are not all in column 2. Let's look at the user's input again:
+
+The maze:
+
+#######
+#XXS  #
+#X### #
+#X# # #
+#X# # #
+#E#   #
+#######
+
+Row 2: #XXS  # → X at 2,3.
+
+Row 3: #X### # → X at 2.
+
+Row 4: #X# # # → X at 2.
+
+Row 5: #X# # # → X at 2.
+
+Row 6: #E#   #.
+
+So all X's are column 2.
+
+Unless... row 3: #X### #, the third character is #, so column 3 is #, but maybe row 4 has X in another column.
+
+No, row 4: #X# # # → X at column 2.
+
+Wait, perhaps I've miscounted the columns. Let's write each row's columns explicitly.
+
+Row 1: 1:#, 2:#, 3:#, 4:#, 5:#, 6:#, 7:#
+
+Row 2: 1:#, 2:X, 3:X, 4:S, 5: , 6: , 7:#
+
+Row 3: 1:#, 2:X, 3:#, 4:#, 5:#, 6:#, 7:#  (assuming #X### # is a typo for #X#####)
+
+Row 4: 1:#, 2:X, 3:#, 4: , 5:#, 6: , 7:#
+
+Row 5: same as row 4.
+
+Row 6: 1:#, 2:E, 3:#, 4: , 5: , 6: , 7:#
+
+Row 7: all #.
+
+So the path from S (2,4) must go left to (2,3), left to (2,2), then down to (3,2), (4,2), (5,2), (6,2).
+
+Now, what's at row 4, column 2: X.
+
+Row 4, column 2 is X, so path continues.
+
+But row 4, column 4 is space, but no X there.
+
+So still the same path.
+
+Wait, unless there's a different path. For example, from (2,2) down to (3,2), then east to (3,4), but (3,4) is #, so no.
+
+I think the path is correct.
+
+Given that, and the only right turn being one, but it's not an option, perhaps the question has a typo, or I'm misunderstanding "right turns".
+
+Wait, the problem says "a right turn is defined as a change in movement direction that is 90 degrees clockwise relative to the previous direction."
+
+So every time the direction changes clockwise by 90 degrees, it's a right turn.
+
+In our case, west to south: clockwise, so right turn.
+
+South to east would be right turn, etc.
+
+But in our path, only one such change.
+
+Wait, but maybe the initial direction is considered. For example, if you start moving west, but before that, you were facing south or something. But no, the first move's direction is west.
+
+Another idea: maybe the path has more turns. Let's imagine the maze differently.
+
+Wait, what if the maze is:
+
+Row 1: #######
+
+Row 2: # X X S   #
+
+Row 3: # X # # # #
+
+Row 4: # X # # # #
+
+Row 5: # X # # # #
+
+Row 6: # E #   #
+
+Row 7: #######
+
+And the X's form a path that goes down column 2, but also has a turn elsewhere. But according to the given maze, there are no other X's.
+
+Wait, unless the 'X' in row 2, column 3 is part of a different path.
+
+Wait, another possibility: I'm miscounting the number of X's.
+
+Row 2: two X's.
+
+Row 3: one X.
+
+Row 4: one X.
+
+Row 5: one X.
+
+Total X's: five.
+
+Path: S to X to X to X to X to X to E: six moves, five X's.
+
+So the path has six segments.
+
+But the turns are counted between the segments.
+
+So between segment 1 (S to X1) and segment 2 (X1 to X2), no turn.
+
+Between segment 2 and 3: turn.
+
+Segments 3-4, 4-5, 5-6: no turns.
+
+So one right turn.
+
+I'm really stuck. Given the options, maybe the intended answer is B.2, but why?
+
+Wait, perhaps I made a mistake in the direction change. Let's think again.
+
+When moving west (left), then turning south (down), is that a right turn?
+
+Imagine facing west (left). A right turn would make you face south (down), yes. A left turn would face north (up).
+
+Yes, so it's a right turn.
+
+But what if the path goes west, then south, then east? But it doesn't.
+
+Wait, unless the maze has a different layout. Let's consider the following:
+
+If row 4 is #X# # #, meaning column 4 is space, and if there's an X in row 4, column 4, but the user didn't write it. But according to the problem statement, X marks the path, so all X's should be there.
+
+Wait, the problem says: "X marks the specific route you should follow". So all X's are part of the path.
+
+Given that, and no other X's, the path must be as we have.
+
+Wait, unless the S is considered to be on the path. But S is a separate symbol.
+
+Another idea: the path from S to E includes S and E, but the X's are the path. So the X's are the only path, so S must be next to an X, which it is.
+
+I think the problem might have a typo, or I'm missing something obvious.
+
+Wait, let's count the number of direction changes again.
+
+Path segments:
+
+1. S to X1: west
+
+2. X1 to X2: west
+
+3. X2 to X3: south
+
+4. X3 to X4: south
+
+5. X4 to X5: south
+
+6. X5 to E: south
+
+Turns occur between segments 2-3.
+
+Direction change: west to south: right turn.
+
+That's one turn.
+
+But options don't have it. The closest is B.2.
+
+Wait, could there be another turn if the path goes east somewhere?
+
+Wait, row 6 has E at (6,2), and row 6, column 4-6 are spaces. But no X's there.
+
+Unless the path from (5,2) goes east to (5,4), but no X's.
+
+I'm at a loss. Given the options, and if I have to choose, maybe the intended answer is B.2, assuming I missed a turn.
+
+Wait, another possibility: when moving from (2,2) to (3,2), the direction is south, but if the previous direction was west, that's one right turn.
+
+Then, when moving from (3,2) to (4,2), direction south, no turn.
+
+But what if the path goes down to row 4, then east, but there's no X.
+
+Wait, perhaps the maze has a different layout where after row 3, the path turns east.
+
+But according to the given ASCII, row 3 is #X### #, so after X at (3,2), the next cell east is #, so no.
+
+Wait, let's look up similar mazes. Sometimes, the path might go down, then right, then down, etc.
+
+Wait, let's assume that row 4: #X# # # means that there's an X at column 2, and also at column 5 or something.
+
+No, user wrote #X# # #, which would be # X # [space] # [space] #.
+
+Column 5 is #, so no.
+
+I think I have to go with the analysis that there's one right turn, but since it's not an option, I must have erred.
+
+Wait, wait! The problem says "navigate from S to E by following the path marked by X". So the path is the X's, meaning that S is connected to the first X, and E is connected to the last X.
+
+But in our case, the last X is at (5,2), and E is at (6,2), so the move from (5,2) to E is south.
+
+But what if the path is longer?
+
+Wait, row 5, column 2 is X, row 6, column 2 is E. So yes.
+
+Wait, unless there's a mistake in assuming the direction change.
+
+Let's use a different method. Draw the path:
+
+S (2,4) → (2,3) → (2,2) → (3,2) → (4,2) → (5,2) → E (6,2)
+
+Plotting this:
+
+At row 2, move left twice.
+
+Then down four times.
+
+The turn occurs when changing from left to down.
+
+In terms of turns, if you're moving left, then turn to go down, that's a right turn.
+
+But maybe the question counts the number of times you turn right, regardless of direction.
+
+Wait, but it's only one turn.
+
+Wait, unless the path has another turn. Let's imagine that from (2,2), instead of going down, you go east, but there's a wall.
+
+No.
+
+Another idea: maybe the maze is mirrored. If rows are numbered from bottom to top, but the problem says "ASCII code" which is top to bottom.
+
+I think I've exhausted all possibilities. Given the options, and if forced to choose, maybe the intended answer is B.2, assuming that the path makes two right turns, but I can't see where.
+
+Wait, wait! What if the path from S goes west, then south, then west, then south, etc. But no, the X's don't allow that.
+
+Wait, let's consider that in row 4, column 2 is X, and row 4, column 4 is space, but there's an X there. But according to the problem, X's mark the path, so if it's not shown, it's not there.
+
+I think I have to conclude that either the question is flawed, or I made a mistake.
+
+Wait, let's try to count the number of times the direction changes to the right.
+
+Another approach: list the directions of each move.
+
+Move 1: West
+
+Move 2: West
+
+Move 3: South
+
+Move 4: South
+
+Move 5: South
+
+Move 6: South
+
+Changes occur between move 2 and 3.
+
+West to South: right turn.
+
+That's one.
+
+If the answer is not here, but given the options, maybe the correct answer is B.2, and I missed a turn.
+
+Wait, what if the initial direction is not west? For example, if you start at S and the first move is south, but there's a wall there.
+
+Row 3, column 4 is #, so can't move south from S.
+
+So first move must be west.
+
+I give up. Based on my analysis, it should be 1, but since that's not an option, and the closest is B.2, perhaps I made a mistake in the direction.
+
+Wait, let's think differently: when you move west, then south, is that a right turn or left turn?
+
+Imagine facing west (left). To turn south, you turn right (clockwise), yes.
+
+If you face west, turning right (clockwise) leads to south.
+
+Yes.
+
+So it's a right turn.
+
+If the path had another turn, say south to east, that would be another right turn.
+
+But it doesn't.
+
+Unless...
+
+Wait, maybe after row 5, column 2, you go east to E, but E is at column 2.
+
+No.
+
+Wait, row 6, column 2 is E, so no.
+
+I think the problem might have a typo, but given the options, and if I have to choose, perhaps the intended answer is B.2.
+
+But why?
+
+Wait, another possibility: the path is S → X (2,3) → X (3,3), but row 3, column 3 is #, so impossible.
+
+No.
+
+Wait, what if row 3 is #X# # # instead of #X### #, and it's a typo. If row 3 is #X# # #, then column 3 is #, column 4 is space.
+
+No help.
+
+Wait, let's look for similar examples. In some mazes, the path might turn twice.
+
+Wait, suppose the path goes:
+
+S (2,4) west to (2,3), west to (2,2), south to (3,2), east to (3,3), but (3,3) is #.
+
+No.
+
+I think I have to go with the analysis that there's one right turn, but since it's not an option, perhaps the correct answer expected is B.2, and there's a mistake in the maze description.
+
+Alternatively, maybe I miscounted the number of X's.
+
+Wait, row 2: two X's.
+
+Row 3: one X.
+
+Row 4: one X.
+
+Row 5: one X.
+
+Total of five X's. Path segments: six.
+
+Turns: one.
+
+But let's count the number of right turns as per the problem's example.
+
+Wait, the problem says "right turn is defined as a change in movement direction that is 90 degrees clockwise relative to the previous direction."
+
+So, if you're moving west, then turn south: right turn.
+
+If you were to move south then east: right turn.
+
+But in our path, only one.
+
+Wait, unless the movement from (2,2) to (3,2) is considered a right turn, and then from (3,2) to (4,2) is another, but no, direction doesn't change.
+
+I think the answer should be A.0, but that contradicts.
+
+Wait, no. If there's one right turn, and the options don't have it, but the closest is B.2, maybe the intended path is different.
+
+Wait, let's try to see the maze as follows:
+
+Row 2: #XXS  # → S at (2,4)
+
+Row 3: #X### # → perhaps it's # X # # # [space] #, so column 6 is space.
+
+Row 4: #X# # # → column 4 and 6 are spaces.
+
+Row 5: same.
+
+Row 6: #E#   #.
+
+Now, suppose the path is:
+
+S (2,4) → west to (2,3) X → west to (2,2) X → south to (3,2) X → east to (3,4) but (3,4) is #.
+
+No.
+
+Or south to (3,2), then south to (4,2), east to (4,4), which is space, but no X.
+
+I give up. I think the answer should be 1, but since it's not there, and the options are given, perhaps the correct answer is B.2, and I made a mistake.
+
+Wait, another idea: the path from S goes left to X, then down, then right, then down. But where would that right turn be?
+
+If after row 3, column 2, the path goes east, but there's a wall.
+
+No.
+
+Wait, if row 4, column 2 is X, and row 4, column 3 is space, but no X. Not helpful.
+
+I think I have to go with the only logical conclusion: 1 right turn, but since it's not an option, there must be an error. However, given the choices, and if I consider that maybe the path has two right turns, perhaps the intended answer is B.2.
+
+Wait, wait! When moving from west to south is a right turn, and if later the direction changes again, but in our case, it doesn't. Unless...
+
+Wait, what if the path goes south, then west? But it doesn't.
+
+No.
+
+Wait, perhaps I've misread the maze's row 6.
+
+Row 6: #E#   # → E is at (6,2), so to get to E from (5,2), it's south. No turn.
+
+I think I've spent too much time. Given the options, and if I have to choose, I'll go with B.2, but I'm not sure. However, based on the analysis, it should be 1. But since 1 isn't there, maybe I missed a turn.
+
+Wait! Wait a second. When you move west (left), then south (down), that's a right turn. Then, if you continue south, no turn. But what if the path after row 5, column 2 turns east to reach E, but E is directly south.
+
+No.
+
+Wait, another possibility: the starting direction. If the first move is west, but the initial direction before moving is, say, south, then moving west would be a left turn. But the problem states that you navigate from S by following the path. The first movement direction is west, so the initial direction isn't counted.
+
+I think I have to conclude that the correct answer is not listed, but since it's a multiple-choice question, the intended answer is probably B.2. Maybe the maze was supposed to have a different layout.
+
+Alternatively, maybe the path is:
+
+S → X (2,3) → X (3,3), but row 3, column 3 is #, so no.
+
+Wait, the user wrote row 3 as #X### #, which might mean # X # # # #, so column 3 is #, but if it's #X# # #, then row 3: column 3 is #.
+
+I think I'll have to go with the analysis that it's 1, but since that's not an option, I must have erred. Given the options, and common maze turns, perhaps the answer is B.2.
+
+Wait, let's assume that from west to south is one right turn, and then when moving south, if there's a turn east, but there isn't. I'm really not sure.
+
+Wait, another way: count the number of times the path turns right.
+
+Path:
+
+Start at S, go left (west), left (west), down (south), down, down, down.
+
+The turn from west to south: right turn.
+
+No other turns.
+
+So 1 right turn.
+
+But since 1 isn't an option, and the closest is B.2, I might have to assume that the maze has a different path.
+
+Wait, maybe the X's are in a different place. Let's look at the ASCII again:
+
+The maze:
+
+#######
+#XXS  #
+#X### #
+#X# # #
+#X# # #
+#E#   #
+#######
+
+Row 3: #X### #. If this is # X # # # #, then column 2 is X, and columns 3-6 are #.
+
+Row 4: #X# # # → # X # space # space #.
+
+Row 5: same as row 4.
+
+Now, suppose the path is:
+
+S (2,4) → west to (2,3) X → west to (2,2) X → south to (3,2) X → south to (4,2) X → east to (4,4) (space), but no X there. Can't be.
+
+No.
+
+Wait, row 4, column 4 is space, and if there's an X there, but according to the maze, no.
+
+I think I must accept that the answer should be 1, but since it's not there, perhaps the question meant to say "left turns" or there's a typo.
+
+Given the options, and if forced to choose, I'll go with B.2, but I'm unsure.
+
+Wait, no. Let's think differently. Maybe the path is:
+
+S (2,4) → east? No, east is space, but no X.
+
+No.
+
+Wait, perhaps the maze is:
+
+Row 2: # X X S   #
+
+Row 3: # X # # # #
+
+Row 4: # X # # # #
+
+Row 5: # X # # # #
+
+Row 6: # E #   #
+
+And the path goes down column 2, but also, at row 4, it turns east.
+
+But no X's east.
+
+I give up. Based on strict analysis, answer should be 1, but since it's not there, and the options are given, perhaps the intended answer is B.2, so I'll choose B.
+</think>
+Let me trace the step by step solution through the maze in the following format:
+<format>
+>>> LOCATE START AND EXIT:
+    S position: (row, col)
+    E position: (row, col)
+
+>>> STEP 1: Move DOWN from (r1, c1) to (r2, c2)
+    Current position: (r2, c2)
+    Previous direction: —
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+[... continue for all steps until reaching E ...]
+
+>>> FINAL ANSWER:
+    \boxed{LETTER}
+</format>
+>>> LOCATE START AND EXIT:
+    S position: (2, 4)
+    E position: (6, 2)
+
+>>> STEP 1: Move LEFT from (2, 4) to (2, 3)
+    Current position: (2, 3)
+    Previous direction: —
+    Current direction: LEFT
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+[VERIFIER FEEDBACK for LOCATE section:
+  ✗ S position (2, 4) should be (1, 3)
+  ✗ E position (6, 2) should be (5, 1)
+Please correct the start/exit positions and continue.]
+
+>>> LOCATE START AND EXIT (CORRECTED):
+    S position: (1, 3)
+    E position: (5, 1)
+
+>>> STEP 1: Move LEFT from (1, 3) to (1, 2)
+    Current position: (1, 2)
+    Previous direction: —
+    Current direction: LEFT
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 2: Move LEFT from (1, 2) to (1, 1)
+    Current position: (1, 1)
+    Previous direction: LEFT
+    Current direction: LEFT
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 3: Move DOWN from (1, 1) to (2, 1)
+    Current position: (2, 1)
+    Previous direction: LEFT
+    Current direction: DOWN
+    Turn type: RIGHT
+    Running count: Right=1, Left=0
+
+[VERIFIER FEEDBACK for Step 3:
+  ✗ Turn type RIGHT_TURN should be LEFT_TURN. Going from WEST to SOUTH is a counterclockwise (LEFT turn) rotation.
+  ✗ Right turn count 1 should be 0
+  ✗ Left turn count 0 should be 1
+IMPORTANT: Clockwise on a compass is NORTH→EAST→SOUTH→WEST→NORTH. A RIGHT turn = 90° clockwise; a LEFT turn = 90° counterclockwise. For example: SOUTH→WEST is RIGHT (clockwise), SOUTH→EAST is LEFT (counterclockwise). Please correct this step and continue.]
+
+>>> STEP 3 (CORRECTED): Move DOWN from (1, 1) to (2, 1)
+    Current position: (2, 1)
+    Previous direction: LEFT (WEST)
+    Current direction: DOWN (SOUTH)
+    Turn type: LEFT
+    Running count: Right=0, Left=1
+
+>>> STEP 4: Move DOWN from (2, 1) to (3, 1)
+    Current position: (3, 1)
+    Previous direction: DOWN
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=1
+
+>>> STEP 5: Move DOWN from (3, 1) to (4, 1)
+    Current position: (4, 1)
+    Previous direction: DOWN
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=1
+
+>>> STEP 6: Move DOWN from (4, 1) to (5, 1)
+    Current position: (5, 1)
+    Previous direction: DOWN
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=1
+
+>>> FINAL ANSWER:
+    The path from S to E contains 0 right turns.
+    \boxed{A}
+</format>
+
+**Final Answer**
+\boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3016.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3016.txt
new file mode 100644
index 00000000..f2bbfb3d
--- /dev/null
+++ b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3016.txt
@@ -0,0 +1,582 @@
+Okay, let's try to figure out how many right turns are in the path marked by X from S to E. First, I need to visualize the maze properly. The ASCII maze given is:
+
+#######
+#S#   #
+#X# ###
+#X#   #
+#X### #
+#XXE  #
+#######
+
+Hmm, let's break it down row by row. Each line represents a row in the maze. Let's index the rows starting from the top as row 0 maybe. Wait, but the actual maze is written as:
+
+Row 0: #######
+Row 1: #S#   #
+Row 2: #X# ###
+Row 3: #X#   #
+Row 4: #X### #
+Row 5: #XXE  #
+Row 6: #######
+
+Wait, but maybe the rows are as written. Let's count the lines. The user provided:
+
+#######
+#S#   #
+#X# ###
+#X#   #
+#X### #
+#XXE  #
+#######
+
+So that's 7 lines. Let's number them from top to bottom as rows 0 to 6.
+
+Row 0: ####### (all walls)
+Row 1: #S#   # (so positions 0: #, 1: S, 2: #, 3: space, 4: space, 5: space, 6: #)
+Row 2: #X# ### (positions: #, X, #, space, #, #, #)
+Row 3: #X#   # (like row 2, but columns 4 and 5 are spaces instead of walls)
+Row 4: #X### # (X followed by ### and then space and #)
+Row 5: #XXE  # (X, X, E, space, space, #)
+Row 6: #######
+
+Wait, but the maze's navigable path is marked by X. Wait, the problem states that X marks the specific route to follow. So the path from S to E is along the X's. So we need to trace the path from S through X's to E.
+
+Wait, S is in row 1, column 1 (assuming columns start at 0). Let's map each cell.
+
+Let's make a grid. Let's index rows from top to bottom (0 to 6), columns left to right (0 to 6).
+
+Row 0: 0-6: # # # # # # #
+Row 1: # S #   # (columns 0: #, 1:S, 2:#, 3:space, 4:space, 5:space, 6:#)
+Row 2: # X # ### (columns 0:#, 1:X, 2:#, 3:space, 4:#,5:#,6:#)
+Row 3: # X #   # (columns 0:#, 1:X, 2:#, 3:space, 4:space,5:space,6:#)
+Row 4: # X ### # (columns 0:#, 1:X, 2:#,3:#,4:#,5:space,6:#)
+Row 5: # X X E  # (columns 0:#,1:X,2:X,3:E,4:space,5:space,6:#)
+Row 6: # # # # # # #
+
+Wait, but row 5's E is at column 3. So the path from S (row1, col1) must go through the X's to reach E.
+
+Wait, but S is at (1,1), and the X's are in row2 col1, row3 col1, row4 col1, row5 col1 and row5 col2. Hmm. Wait, let's check each row.
+
+Row 1: S is at (1,1). The path starts here. The X's are supposed to be the path. Wait, but S is the start, and E is the end. The X's mark the route. So perhaps the path is S -> X -> X -> ... -> E.
+
+Wait, but looking at the maze:
+
+Row 1: #S#   # → so S is at (1,1). To the right of S (col2) is a wall (#), so the path must go down or up. But up is row0 which is all walls. So from S (row1, col1), the only possible move is down (row2, col1), which is X. So the path starts at S, then moves down to X (row2, col1).
+
+Wait, but let's check row2: #X# ### → so (row2, col1) is X. So from S (row1, col1), moving down to row2, col1 (X). Then, what's next? Let's see the path.
+
+We need to follow the X's. So the path is S → X → X → ... → E.
+
+Let's list all X positions:
+
+Row 2, col1: X
+
+Row 3, col1: X
+
+Row 4, col1: X
+
+Row 5, col1: X
+
+Row 5, col2: X
+
+E is at row5, col3.
+
+Wait, so the path would go from S (row1, col1) down to row2, col1 (X), then down to row3, col1 (X), then down to row4, col1 (X), then down to row5, col1 (X), then right to row5, col2 (X), then right to E (row5, col3). Wait, but E is at row5, col3. So the path would be S → (down) X → X → X → X → X → E. Wait, but row5, col1 is X, then row5, col2 is X, then col3 is E.
+
+Wait, but the X's are the path. So the path from S to E must go through all X's. So the path is S (row1, col1) → row2 col1 (X) → row3 col1 (X) → row4 col1 (X) → row5 col1 (X) → row5 col2 (X) → E (row5, col3).
+
+Wait, but row5 col2 is X. So the path goes from row5 col1 (X) to row5 col2 (X), then to E (col3). So the path steps are:
+
+Start at S (1,1).
+
+Move down to (2,1) [X]
+
+Down to (3,1) [X]
+
+Down to (4,1) [X]
+
+Down to (5,1) [X]
+
+Right to (5,2) [X]
+
+Right to (5,3) [E]
+
+Wait, but is that the case? Let's check the maze again.
+
+Row 5: #XXE  # → columns 0: #, 1:X, 2:X, 3:E, 4:space, 5:space, 6:#. So yes, row5, col1 and col2 are X, then E at col3.
+
+So the path from S to E is moving down four times (from row1 to row5 in col1), then right twice (to col2, then col3). Wait, but row5, col1 to col2 is right, then col2 to col3 is right again. But E is at col3. So the path is:
+
+Start at S (1,1).
+
+Direction down to (2,1).
+
+Then down to (3,1).
+
+Then down to (4,1).
+
+Then down to (5,1).
+
+Then right to (5,2).
+
+Then right to (5,3) (E).
+
+So the directions taken are: down, down, down, down, right, right.
+
+Wait, but movement direction changes. To count right turns, we need to track the direction of movement and see when a right turn occurs.
+
+First, starting direction: when moving from S (1,1) to (2,1), the direction is down. So the first move is down.
+
+Then next move is down again (to row3, col1), so same direction. No turn.
+
+Then down again (row4, col1), same direction. No turn.
+
+Then down again (row5, col1), same direction. No turn.
+
+Then move right (from col1 to col2), which is a right turn from down. Wait, direction was down, then moving right. Let's think: if you're moving down (south), a right turn would be to face east (right), which would be moving right. Wait, movement direction: if you are moving down (south), turning right would face east (right), so moving east (right) is a right turn. Wait, yes. Let's clarify:
+
+Assuming the standard directions:
+
+- Facing down (south), a right turn would make you face east (right), so moving right (east) would be a right turn.
+
+Wait, but movement direction: when moving from one cell to another, the direction you were moving to get there.
+
+Wait, the problem says: "A right turn is defined as a change in movement direction that is 90 degrees clockwise relative to the previous direction."
+
+So, for example, if you were moving down (south), then turn right would face east (right), so next movement direction is east (right). So the turn from south to east is a right turn.
+
+Similarly, moving east, a right turn would face south.
+
+So in our path:
+
+First move: from S (1,1) to (2,1) → direction down (south).
+
+Second move: from (2,1) to (3,1) → direction still down. No turn.
+
+Third move: (3,1) to (4,1) → down again. No turn.
+
+Fourth move: (4,1) to (5,1) → down again. No turn.
+
+Fifth move: (5,1) to (5,2) → direction right (east). Previous direction was down. So changing from down to right (east) is a right turn (since right from down is east).
+
+Sixth move: (5,2) to (5,3) → direction still east. No turn.
+
+Wait, so only one right turn here. But the options don't have 1. Wait, this can't be right. Maybe I'm missing something.
+
+Wait, maybe the path isn't all down then right. Let's double-check the maze.
+
+Wait, row 4: #X### #. So row4, col1 is X. Then row4, col2 is #, col3 #, col4 #, col5 space. So from row4, col1 (X), moving down to row5, col1 (X), which is possible. Then row5, col1 to col2 (X), then to E.
+
+But wait, row5, col1 to col2: that's moving east (right), so direction change from down to east is a right turn. Then moving east again (col2 to col3), no turn. So only one right turn. But the options are 9, 0,4,7. Hmm. So 1 is not an option. So I must have made a mistake.
+
+Wait, perhaps the path is different. Maybe the X's are not all in a straight line. Let's check again.
+
+Wait, row2: #X# ### → so columns 0:#, 1:X, 2:#, 3:space, 4:#,5:#,6:#. So row2, col1 is X, but col2 is #, so can't move right from there. So from row2, col1 (X), you can't go east (right) because col2 is a wall. So the path must go down to row3, col1.
+
+Row3: #X#   # → same structure. col1 is X, col2 is #, so can't go east. So path continues down to row4, col1.
+
+Row4: #X### # → col1 is X, col2 is #, so again, can't move east. So path continues down to row5, col1.
+
+Row5: #XXE  # → col1 and 2 are X. So from row5, col1, can move east to col2 (X), then east to E.
+
+So the path is indeed down, down, down, down, right, right. So the direction changes only once: from down to east (right turn), then continuing east. So only one right turn. But 1 isn't an option. Hmm. So something's wrong here.
+
+Wait, maybe the path is not straight down. Wait, let's re-examine the maze.
+
+Wait, row 1: #S#   # → so S is at (1,1), surrounded by walls on left (col0), right (col2), and above (row0). So S can only go down (row2, col1), which is X.
+
+Row2: #X# ### → X at (2,1). From here, can we move east? (2,2) is #, so no. Can we move west? (2,0) is #. So only down to row3, col1.
+
+Row3: #X#   # → X at (3,1). Similarly, can't go east (col2 is #), so must go down to row4, col1.
+
+Wait, I made an error at Step 5. Turn type RIGHT_TURN should be LEFT_TURN: going from SOUTH to EAST is a counterclockwise (LEFT turn) rotation. So the right turn count of 1 should be 0, and the left turn count of 0 should be 1. Let me re-trace the path more carefully from the correct position.
+Wait, direction when moving down (south), a right turn would be to face east (right), so moving east would be a right turn. But according to the problem statement, right turn is 90 degrees clockwise. So if you're moving south, turning right (clockwise) would face west? Wait, no. Wait, directions:
+
+Imagine facing south (down). A right turn (clockwise) would turn you to face west? No. Wait, standard compass directions:
+
+If facing north, right turn is east.
+
+Facing east, right turn is south.
+
+Facing south, right turn is west.
+
+Facing west, right turn is north.
+
+Wait, no. Wait, if you're facing south (down), a clockwise right turn would turn you to face west (left from south?), wait maybe I'm confused.
+
+Let's clarify:
+
+When you move in a direction, your facing direction is that direction. A right turn (clockwise) would change your facing direction 90 degrees clockwise.
+
+So:
+
+- If you're moving south (down), you're facing south. A right turn (clockwise) would make you face west (since turning right from south would point west?). Wait, no. Let's think of a compass:
+
+North (up), East (right), South (down), West (left).
+
+Facing north: right turn → east.
+
+Facing east: right turn → south.
+
+Facing south: right turn → west.
+
+Facing west: right turn → north.
+
+Yes, that's correct.
+
+So if you're moving south (direction south), then a right turn would face west. But in our path, after moving south (down), we want to move east (right), which would be a left turn.
+
+Wait, this is critical. So the direction change from south to east would be a left turn, not a right turn.
+
+Because when moving south, turning left (anticlockwise) makes you face east.
+
+Wait, let's confirm:
+
+Facing south (down), turning left (anticlockwise 90) → east.
+
+Turning right (clockwise 90) → west.
+
+So moving from south to east is a left turn.
+
+Ah! This is probably where I messed up earlier.
+
+So in the path:
+
+After moving down (south) four times, then moving east (right), the direction change is from south to east, which is a left turn, not a right turn.
+
+So then, the turn would be a left turn. So no right turns in the path. Then the answer would be B. 0.
+
+But wait, let's make sure.
+
+Path steps:
+
+Start at S (1,1). Move down to (2,1): direction south.
+
+Next move down to (3,1): still south. No turn.
+
+Down to (4,1): south. No turn.
+
+Down to (5,1): south. No turn.
+
+Then move east to (5,2): direction east. Change from south to east. This is a left turn (since left turn from south is east).
+
+Then move east to (5,3): still east. No turn.
+
+So the only direction change is a left turn. So no right turns. So answer B. 0.
+
+But the options include B. 0. But wait, maybe I'm missing some turns.
+
+Wait, is there another part of the path that I'm not considering? Let's check again.
+
+Wait, are there any other X's that form a different path?
+
+Wait, the problem says "the path marked by X". So the X's are the path. So we must follow all X's from S to E.
+
+Wait, S is not an X. The X's start from row2, col1. So the path is S → X → X → X → X → X → E.
+
+Wait, S to first X (row2, col1), then through all X's to E.
+
+But according to the maze, the X's are at:
+
+row2 col1,
+
+row3 col1,
+
+row4 col1,
+
+row5 col1,
+
+row5 col2.
+
+So five X's. So path is S → X (row2) → X (row3) → X (row4) → X (row5 col1) → X (row5 col2) → E.
+
+So the movement directions:
+
+From S (row1, col1) to row2 col1: down.
+
+row2 to row3: down.
+
+row3 to row4: down.
+
+row4 to row5: down.
+
+row5 col1 to row5 col2: right (east).
+
+row5 col2 to E: right (east).
+
+So only one direction change: down → east.
+
+As established, down to east is a left turn. So no right turns. So answer B. 0.
+
+But let's check again. Maybe there's another turn. Wait, what about if the path takes a different route? Wait, but according to the maze, the X's are only in those positions. So we have to follow the X's. So the path must go through all X's. So the path is as described.
+
+Wait, but maybe the X's form a different path. Let's look at the maze again.
+
+Row 1: #S#   # → S at (1,1), then spaces to the right.
+
+Row 2: #X# ### → X at (2,1), then #, then spaces, then #s.
+
+Row3: #X#   # → X at (3,1), then #, then spaces.
+
+Row4: #X### # → X at (4,1), then ###, then space.
+
+Row5: #XXE  # → X at (5,1), X at (5,2), E at (5,3).
+
+So the X's are all in column 1 from row2 to row5, then column 2 row5. So path is straight down column 1, then right along row5.
+
+No other X's. So path must be as above.
+
+Thus, only one turn, which is left. So no right turns. Answer B. 0.
+
+But wait, let's check the problem statement again: "the path marked by X". So the X's are the correct path. So we have to follow the X's. So the movement is along the X's. So the path is S to first X, then next X, etc.
+
+But S is not an X. So the first step is from S (which is a cell) to the first X. So the path starts at S, then moves to X (row2, col1), which is the first X. Then to next Xs.
+
+So the directions are as before.
+
+Wait, but what if the starting direction is considered. When you start at S, you have to move to the first X. So your first move is down. Then subsequent moves.
+
+But the first direction is down. Then no turns until the east move.
+
+So only one direction change (left turn), so zero right turns. So answer B.
+
+But why is option C 4? Maybe I'm missing something. Let's think again.
+
+Wait, perhaps the path is not all down. Let's check row4. Row4 is #X### #. So at row4, col1 (X), can we move east? Col2 is #, so no. So must move down to row5, col1.
+
+Row5, col1 is X. From there, east to col2 (X), then east to E.
+
+Wait, another possibility: perhaps the path has more turns. Let's map all the X's.
+
+List of X's:
+
+(2,1), (3,1), (4,1), (5,1), (5,2).
+
+So the path is (1,1) → (2,1) → (3,1) → (4,1) → (5,1) → (5,2) → (5,3).
+
+So the movement vectors:
+
+From (1,1) to (2,1): down.
+
+(2,1) to (3,1): down.
+
+(3,1) to (4,1): down.
+
+(4,1) to (5,1): down.
+
+(5,1) to (5,2): right.
+
+(5,2) to (5,3): right.
+
+So the direction changes occur only once: from down to right. As discussed, this is a left turn.
+
+So no right turns. Answer B. 0.
+
+But why would the options include 4 or 7? Maybe I'm misinterpreting the maze.
+
+Wait, let's check the maze again as provided:
+
+The ASCII maze:
+
+#######
+#S#   #
+#X# ###
+#X#   #
+#X### #
+#XXE  #
+#######
+
+Wait, row 2 is #X# ###. So columns:
+
+0: #, 1:X, 2:#, 3:space, 4:#, 5:#, 6:#.
+
+Row3: #X#   # → 0:#, 1:X, 2:#, 3:space, 4:space, 5:space, 6:#.
+
+Row4: #X### # → 0:#, 1:X, 2:#, 3:#, 4:#, 5:space, 6:#.
+
+Row5: #XXE  # → 0:#, 1:X, 2:X, 3:E, 4:space, 5:space, 6:#.
+
+Wait, row5, col5 is space, but not relevant.
+
+Wait, in row4, col5 is space. So row4, col5 is space. But our path doesn't go there.
+
+Wait, but what if the path isn't straight down? Let's see. From row2, col1 (X), can we move east? No, col2 is #. Can we move north? Back to S, which is start. So no. So must go down.
+
+Same for row3, col1: can't go east or west, so must go down.
+
+Row4, col1: same.
+
+So the path is straight down.
+
+Hmm. So unless the problem counts the direction changes differently.
+
+Wait, another thought: maybe when you move from S to row2, col1, that's the first direction (down), and then all the way down, then a turn. But perhaps the problem counts the turn when leaving the X path. Not sure.
+
+Wait, the problem states: "navigate from S to E by following the path marked by X". So the path includes the X's. So the path is S → X → X → X → X → X → E. So the X's are the path, meaning the route is through those cells. So movement through the X's.
+
+So the path's direction changes are between the X's.
+
+Wait, S is not an X. So the first move is from S (not X) to the first X. Then the path through X's.
+
+But the turns are counted based on the direction of movement through the path. So the movement from S to first X is direction down. Then from first X to second X is down, etc. So the turns happen between the X's.
+
+Wait, but the turn is when changing direction. So from S to first X: direction down. Then first X to second X: down (no turn). Second to third: down. Third to fourth: down. Fourth X (row5, col1) to fifth X (row5, col2): direction east. So change from down to east: left turn. Fifth X to E: east (no turn).
+
+So only one turn, left. So zero right turns.
+
+Thus, the answer should be B. 0.
+
+But why would the problem have other options? Maybe I'm missing a part of the maze.
+
+Wait, looking at row 3: #X#   #. After the X at col1, there's a space at col3. Could there be a path that goes east from row3?
+
+Wait, row3, col1 is X. To move east (col2), but col2 is #, so no. So can't go east. So the path must continue down.
+
+Same for row2.
+
+Another check: row4 is #X### #. So after col1 (X), col2 is #, so can't go east. So path must go down.
+
+Thus, the path is indeed straight down then east.
+
+Hmm. Maybe the problem counts the final move to E as part of the path, but that doesn't affect the turns.
+
+Wait, let's think of another possibility. If the path is different. Suppose the X's are not all in a line. Wait, maybe I misread the maze.
+
+Wait, let's write out the maze with coordinates:
+
+Row 0: 0 1 2 3 4 5 6
+        # # # # # # #
+
+Row 1: # S #   # 
+        0 1 2 3 4 5 6
+
+Row 2: # X # # # # 
+        Wait, no. Original row 2: #X# ### → so:
+
+Row 2: 0:#, 1:X, 2:#, 3:space, 4:#,5:#,6:#.
+
+Row 3: #X#   # → 0:#,1:X,2:#,3:space,4:space,5:space,6:#.
+
+Row 4: #X### # → 0:#,1:X,2:#,3:#,4:#,5:space,6:#.
+
+Row 5: #XXE  # → 0:#,1:X,2:X,3:E,4:space,5:space,6:#.
+
+Yes, so X's are at (2,1), (3,1), (4,1), (5,1), (5,2).
+
+Path is as before.
+
+Wait, another idea: perhaps the movement from row5, col2 to E is not counted, but the turn before that. But no, the turn is between directions.
+
+Wait, let's list all direction changes:
+
+Move 1: S (1,1) → (2,1): direction down.
+
+Move 2: (2,1) → (3,1): down.
+
+Move 3: (3,1) → (4,1): down.
+
+Move 4: (4,1) → (5,1): down.
+
+Move 5: (5,1) → (5,2): right.
+
+Move 6: (5,2) → (5,3): right.
+
+So between move 4 and move 5: direction changes from down to right.
+
+As established, this is a left turn.
+
+No other direction changes. So zero right turns.
+
+Therefore, the answer should be B. 0.
+
+But the options given are A.9 B.0 C.4 D.7.
+
+So B.0 is an option. So likely the answer is B.
+
+But why would the question be asked if it's zero? Maybe I made a mistake in the turn direction.
+
+Let me confirm once more:
+
+Current direction: south (down).
+
+Next direction: east (right).
+
+To go from south to east, which way do you turn?
+
+If you're facing south, to turn towards east, you turn left (counterclockwise), because east is to your left when facing south.
+
+Yes: facing south, left turn → east.
+
+Right turn would be west.
+
+So yes, it's a left turn.
+
+Therefore, no right turns. Answer B.
+
+I think that's correct. So the answer is B.
+</think>
+Let me trace the step by step solution through the maze in the following format:
+<format>
+>>> LOCATE START AND EXIT:
+    S position: (row, col)
+    E position: (row, col)
+
+>>> STEP 1: Move DOWN from (r1, c1) to (r2, c2)
+    Current position: (r2, c2)
+    Previous direction: —
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+[... continue for all steps until reaching E ...]
+
+>>> FINAL ANSWER:
+    \boxed{LETTER}
+</format>
+>>> LOCATE START AND EXIT:
+    S position: (1, 1)
+    E position: (5, 3)
+
+>>> STEP 1: Move DOWN from (1, 1) to (2, 1)
+    Current position: (2, 1)
+    Previous direction: —
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 2: Move DOWN from (2, 1) to (3, 1)
+    Current position: (3, 1)
+    Previous direction: DOWN
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 3: Move DOWN from (3, 1) to (4, 1)
+    Current position: (4, 1)
+    Previous direction: DOWN
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 4: Move DOWN from (4, 1) to (5, 1)
+    Current position: (5, 1)
+    Previous direction: DOWN
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 5: Move RIGHT from (5, 1) to (5, 2)
+    Current position: (5, 2)
+    Previous direction: DOWN
+    Current direction: RIGHT (EAST)
+    Turn type: LEFT (since DOWN → EAST is a left turn)
+    Running count: Right=0, Left=1
+
+>>> STEP 6: Move RIGHT from (5, 2) to (5, 3) [E]
+    Current position: (5, 3)
+    Previous direction: RIGHT
+    Current direction: RIGHT
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=1
+
+>>> FINAL ANSWER:
+    \boxed{B}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3017.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3017.txt
new file mode 100644
index 00000000..a020fed1
--- /dev/null
+++ b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3017.txt
@@ -0,0 +1,760 @@
+Okay, let's try to figure out how many right turns are in the path marked by X from S to E. First, I need to visualize the maze properly. The ASCII maze given is:
+
+#######
+#S#  E#
+#X###X#
+#X#XXX#
+#X#X###
+#XXX  #
+#######
+
+Hmm, let me parse this row by row. Each line represents a row in the maze. Let's index them from top to bottom as rows 0 to 6 maybe, but maybe it's easier to just list them out.
+
+Row 0: #######
+Row 1: #S#  E#
+Row 2: #X###X#
+Row 3: #X#XXX#
+Row 4: #X#X###
+Row 5: #XXX  #
+Row 6: #######
+
+Wait, but the actual maze is probably 7 rows high (since there are 7 lines) and each line has 7 characters (since the first line is 7 #s). Let's confirm:
+
+Row 0: 7 #s
+Row 1: # S # space space E # (so positions 0:#, 1:S, 2:#, 3:space, 4:space, 5:E, 6:#)
+Row 2: # X # # # X # (positions 0:#, 1:X, 2:#, 3:#, 4:#, 5:X, 6:#)
+Row 3: # X # X X X #
+Row 4: # X # X # # #
+Row 5: # X X X   #
+Row 6: #######
+
+Wait, but the problem states that X marks the specific route. So the path from S to E should follow the Xs. Let's find where S and E are.
+
+Looking at row 1: #S#  E#. So S is at (row 1, column 1), since row 1 starts with #, then S. Let's assume columns are 0-based. So row 1, col 1 is S.
+
+E is at row 1, col 5 (since row 1 is # S # space space E #, so positions 0:#,1:S,2:#,3:space,4:space,5:E,6:#). Wait, but E should be the exit. But according to the problem, the path to follow is marked by X. So the Xs are the correct path. So we need to trace from S through Xs to E.
+
+Wait, but S and E are not Xs. So the path starts at S, then follows Xs to reach E. Wait, but how? Let's look at the maze again.
+
+Wait, the problem says "the path marked by X" so the Xs are the route to follow. So the correct path is the Xs connecting S to E. So S must be adjacent to an X, and E must be adjacent to an X.
+
+Let's map out the maze properly.
+
+Let's index rows 0 to 6 (7 rows total), columns 0 to 6 (7 columns).
+
+Row 0: 0: #, 1: #, 2: #, 3: #, 4: #, 5: #, 6: #
+Row 1: 0:#, 1:S, 2:#, 3: , 4: , 5:E, 6:#
+Row 2: 0:#, 1:X, 2:#, 3:#, 4:#, 5:X, 6:#
+Row 3: 0:#, 1:X, 2:#, 3:X, 4:X, 5:X, 6:#
+Row 4: 0:#, 1:X, 2:#, 3:X, 4:#, 5:#, 6:#
+Row 5: 0:#, 1:X, 2:X, 3:X, 4: , 5: , 6:#
+Row 6: all #
+
+Wait, but in row 5, columns 4 and 5 are spaces. Hmm. Let's confirm each cell.
+
+Row 1: # S #   E # → columns 0:#, 1:S, 2:#, 3:space, 4:space, 5:E, 6:#.
+
+Row 2: # X # # # X # → 0:#,1:X,2:#,3:#,4:#,5:X,6:#.
+
+Row 3: # X # X X X # → 0:#,1:X,2:#,3:X,4:X,5:X,6:#.
+
+Row 4: # X # X # # # → 0:#,1:X,2:#,3:X,4:#,5:#,6:#.
+
+Row 5: # X X X   # → 0:#,1:X,2:X,3:X,4:space,5:space,6:#.
+
+Okay, so S is at (1,1). Now, the path should go from S (1,1) following Xs to E (1,5). Wait, but S is at (1,1), and E is at (1,5), but row 1 has a wall at (1,2), so you can't go straight from S to E. So the path must go down from S.
+
+Looking at S (1,1), the adjacent cells: up is row 0, which is all #, so blocked. Down is row 2, column 1: X. Left is (1,0), which is #. Right is (1,2), which is #. So the only possible move from S is down to (2,1), which is X.
+
+So the path starts at S (1,1), moves down to (2,1) X. Now, from (2,1), where can we go? Let's check the neighbors. Up is S, which is not X (but we came from there). Down is row 3, col1: X. Left is (2,0): #. Right is (2,2): #. So next step is down to (3,1) X.
+
+Wait, row 2, col1 is X. Row 3, col1 is X. So moving down to row3, col1.
+
+Now at (3,1). Check neighbors. Up is (2,1), X. Down is row4, col1: X. Left: #. Right: (3,2) is #. So down to (4,1) X.
+
+At (4,1). Neighbors: up is (3,1), X. Down is row5, col1: X. Left: #. Right: (4,2) is #. So down to (5,1) X.
+
+Row5, col1 is X. Now, neighbors: up is (4,1), X. Down is row6, blocked. Left: #. Right is (5,2): X. So move right to (5,2).
+
+(5,2) is X. Now, neighbors: up is (4,2), which is # (from row4, col2 is #). Wait, row4: # X # X # # #, so col2 is #. So up is blocked. Left is (5,1), which is X. Right is (5,3): X. So move right to (5,3).
+
+(5,3) is X. Now, neighbors: left is (5,2), X. Right is (5,4), which is space. Up is row4, col3: X (row4, col3 is X). Wait, row4: # X # X # # # → col3 is X. So up from (5,3) would be row4, col3: X. But wait, (5,3) is in row5. Up is row4, col3. Let's check if that's X.
+
+Row4: columns 0:#,1:X,2:#,3:X,4:#,5:#,6:#. So row4, col3 is X. So moving up from (5,3) to (4,3) X. Wait, but the path should be following Xs. Hmm, but maybe we need to see the correct path.
+
+Wait, perhaps the path from S to E via Xs is as follows:
+
+S (1,1) → down to (2,1) X → down to (3,1) X → down to (4,1) X → down to (5,1) X → right to (5,2) X → right to (5,3) X. Now, from (5,3), where to go? Let's see. The exit E is at (1,5). Hmm, but how to get there. Wait, maybe I'm missing something.
+
+Wait, row 5, col3 is X. Let's look at row3, col5: X. Row2, col5: X. Let's see. Maybe the path goes from (5,3) up to row4, col3 (X), then right? Wait, row4, col3 is X. From (4,3), neighbors: up is row3, col3: X (row3, col3 is X), down is row5, col3: X, left is row4, col2: #, right is row4, col4: #. So from (4,3), can go up or down. Wait, perhaps the path goes up from (4,3) to row3, col3.
+
+Wait, maybe I need to trace all Xs that connect S to E. Let's list all X positions:
+
+Row2: col1, col5
+
+Row3: col1, col3, col4, col5
+
+Row4: col1, col3
+
+Row5: col1, col2, col3
+
+So X positions are:
+
+(2,1), (2,5)
+
+(3,1), (3,3), (3,4), (3,5)
+
+(4,1), (4,3)
+
+(5,1), (5,2), (5,3)
+
+S is at (1,1), so adjacent X is (2,1).
+
+E is at (1,5), adjacent cell would need to be X. Looking at row1, col5 is E, so the cell above E (row0, col5) is #, left is (1,4) space, right is #. So to reach E, the path must come from (1,4), but (1,4) is space. Wait, but the problem says to follow the X path. Hmm, maybe E is adjacent to an X. Let's check row2, col5 is X. Row1, col5 is E. So (2,5) X is below E. So the path must go from (2,5) up to E (1,5). So the last step is up from (2,5) to E.
+
+So the path should end at (2,5), then move up to E.
+
+So the path from S to E via Xs would be S (1,1) → (2,1) → ... → (2,5) → E.
+
+So let's try to find the path.
+
+Start at (1,1) S. Must go down to (2,1) X.
+
+From (2,1), can we go down to (3,1), then down to (4,1), down to (5,1), right to (5,2), right to (5,3). From (5,3), where? Up to (4,3), then up to (3,3), right to (3,4), right to (3,5), up to (2,5), then up to E.
+
+Let's map this:
+
+(1,1) → (2,1) [down]
+
+(2,1) → (3,1) [down]
+
+(3,1) → (4,1) [down]
+
+(4,1) → (5,1) [down]
+
+(5,1) → (5,2) [right]
+
+(5,2) → (5,3) [right]
+
+(5,3) → (4,3) [up]
+
+(4,3) → (3,3) [up]
+
+(3,3) → (3,4) [right]
+
+(3,4) → (3,5) [right]
+
+(3,5) → (2,5) [up]
+
+(2,5) → (1,5) E [up]
+
+Wait, but (3,5) to (2,5) is up, then (2,5) to (1,5) is up. But (2,5) is X, so moving from (2,5) to E (which is not X, but the exit) is the last step.
+
+So the path through Xs is up to (2,5), then exit. So the X path ends at (2,5).
+
+Now, we need to track the direction changes to count right turns.
+
+First, we need to determine the direction of each move, then check for right turns (90 degrees clockwise from previous direction).
+
+Let's list each move step by step, noting the direction of movement.
+
+Starting at S (1,1), first move is down to (2,1). Direction: down.
+
+Next move: from (2,1) to (3,1), still down. Direction remains down.
+
+Next: (3,1) to (4,1), down.
+
+Next: (4,1) to (5,1), down.
+
+Now at (5,1), next move is right to (5,2). So direction changes from down to right. A right turn would be a 90-degree clockwise turn. Let's see: if current direction is down, a right turn would face right. Wait, direction changes: when moving down, turning right would face right (east), turning left would face east? Wait, need to be careful.
+
+Assuming standard directions:
+
+- Facing down: moving south.
+
+- A right turn (clockwise) from south would face west? Wait, no. Wait, if you're moving south (down), a right turn would face west? Wait, no. Wait, imagine facing south (down), a right turn would turn to face west? Wait, no. Wait, when facing south, a right turn (clockwise) would face west? Wait, no. Wait, clockwise directions:
+
+If you're facing north (up), right turn is east (right), left turn is west.
+
+Wait, better to use a standard:
+
+Direction abbreviations:
+
+N (up), E (right), S (down), W (left).
+
+A right turn from current direction is clockwise. So:
+
+From N, right turn → E
+
+From E, right turn → S
+
+From S, right turn → W
+
+From W, right turn → N
+
+Similarly, left turn:
+
+From N, left → W
+
+From E, left → N
+
+From S, left → E
+
+From W, left → S
+
+So, in our path:
+
+First move: down (S). So direction S.
+
+Next move: down (S) again. No turn.
+
+Next: down (S). No turn.
+
+Next: down (S). No turn.
+
+Then, from (5,1), moving right (E). So direction changes from S to E. What's the turn? From S (down), a right turn would be to W (left), but wait, moving right from down direction. Wait, if you're facing down (south), a right turn (clockwise) would face west? Wait, no. Wait, if you're moving south (down), turning right (clockwise) would face west? Wait, no. Let's think:
+
+Imagine facing south (down), your right hand would point west. Wait, no. If you're facing south (down), your right side is west. Wait, perhaps better to use:
+
+When moving in a direction, the direction you're facing is the direction of movement.
+
+So moving down: facing south.
+
+A right turn (clockwise) would change your facing direction to west? Wait, no. Let's say you're facing south. A right turn would turn you to face west? Or east?
+
+Wait, standard compass:
+
+If you're facing south, a right turn (clockwise) would face west? Wait, no. Clockwise from south: south → west → north → east → south. Wait, no. Clockwise direction is right.
+
+Wait, imagine a clock: 12 is north, 3 is east, 6 is south, 9 is west.
+
+So facing south (6 o'clock), a right turn (clockwise) would turn towards 9 o'clock (west), but actually, a right turn from south would face west? Wait, no. If you're facing south, a right turn (clockwise) would turn you to face west? Wait, no. Let's think:
+
+If you're facing south, your right hand points west. So turning right (clockwise) would face west. Wait, no. If you're facing south, turning right (as in, turning your body to the right) would face west. Wait, no. If you face south, and turn right, you face west. Wait, no. Let's take a real-world example. If you're facing south (towards the ground, like in a map), turning right (clockwise) would make you face west. Hmm.
+
+Wait, maybe better to use the following:
+
+Moving direction:
+
+Current direction | Right turn direction | Left turn direction
+
+N                 | E                     | W
+
+E                 | S                     | N
+
+S                 | W                     | E
+
+W                 | N                     | S
+
+Yes, this seems correct. So from S (south), right turn → W (west), left turn → E (east).
+
+Wait, but in our case, after moving down (S), the next direction is right (E). So changing from S to E. How?
+
+If moving south (down), then moving east (right), that would be a left turn. Because from S, left turn is E. Wait, according to the table above: from S, left turn direction is E. So moving east after south is a left turn.
+
+Wait, this is critical. So:
+
+If previous direction was S (south), and current direction is E (east), then the turn is left.
+
+Because S → E is a left turn.
+
+Similarly, S → W would be a right turn.
+
+So in our path, after moving down (S) to (5,1), then moving right (E) to (5,2). So direction changed from S to E. That's a left turn.
+
+Wait, but the problem is asking for right turns, so this would be a left turn, not counted.
+
+Continuing:
+
+After (5,2) to (5,3): direction still east (E). No turn.
+
+Then from (5,3) to (4,3): moving up (north, N). So direction changes from E to N. What turn is that?
+
+Current direction E, turning to N. From E, a left turn would be N (since E left → N), right turn would be S. So changing from E to N is a left turn.
+
+Wait, according to the table: E → N is a left turn.
+
+Then, (4,3) to (3,3): moving up (N), same direction. No turn.
+
+(3,3) to (3,4): east (E). Direction changes from N to E. From N, right turn → E. So that's a right turn.
+
+Ah, here's a right turn.
+
+Then (3,4) to (3,5): east again. No turn.
+
+(3,5) to (2,5): north (N). Direction changes from E to N. From E, left turn is N. So left turn.
+
+(2,5) to (1,5): north again. No turn.
+
+Wait, let's list all the direction changes step by step.
+
+Let's list each move with direction:
+
+1. Start at S (1,1), move down to (2,1): direction S.
+
+2. (2,1) → (3,1): S.
+
+3. (3,1) → (4,1): S.
+
+4. (4,1) → (5,1): S.
+
+5. (5,1) → (5,2): direction E (right turn? Wait, from S to E: as discussed, S to E is left turn).
+
+Wait, step 5: direction changes from S to E. So turn is left.
+
+6. (5,2) → (5,3): direction E (same, no turn).
+
+7. (5,3) → (4,3): direction N (up). From E to N: left turn.
+
+8. (4,3) → (3,3): direction N (same).
+
+9. (3,3) → (3,4): direction E. From N to E: right turn.
+
+10. (3,4) → (3,5): direction E (same).
+
+11. (3,5) → (2,5): direction N. From E to N: left turn.
+
+12. (2,5) → (1,5): direction N (same).
+
+Now, count the right turns. Looking at the direction changes:
+
+Step 5: S → E: left turn (not counted)
+
+Step 7: E → N: left turn
+
+Step 9: N → E: right turn
+
+Step 11: E → N: left turn
+
+So only step 9 is a right turn. Wait, so only 1 right turn? But the options have A.1, D.2. Hmm.
+
+Wait, maybe I missed something. Let's double-check.
+
+Wait, step 9: moving from (3,3) to (3,4) is east. Previous direction was north (from (4,3) to (3,3)), so direction was N, then moving E. So changing from N to E is a right turn. Yes.
+
+Any other right turns?
+
+Let's check again:
+
+List of direction changes:
+
+Move 1-4: all S, no turns.
+
+Move 5: S → E: left.
+
+Move 6: same E.
+
+Move 7: E → N: left.
+
+Move 8: same N.
+
+Move 9: N → E: right.
+
+Move 10: same E.
+
+Move 11: E → N: left.
+
+Move 12: same N.
+
+So only one right turn at step 9.
+
+But the options include A.1 and D.2. Hmm. Maybe I made a mistake.
+
+Wait, another possibility: when moving from (5,3) to (4,3), direction N. Previous direction was E. So E to N is left turn. Correct.
+
+Wait, but let's think about the path again. Is there another right turn?
+
+Wait, maybe I missed a part of the path. Let's confirm the path again.
+
+Alternative path: Could there be a different route through the Xs?
+
+Wait, the Xs are:
+
+Row2: (2,1), (2,5)
+
+Row3: (3,1), (3,3), (3,4), (3,5)
+
+Row4: (4,1), (4,3)
+
+Row5: (5,1), (5,2), (5,3)
+
+So possible path:
+
+S (1,1) down to (2,1), down to (3,1), down to (4,1), down to (5,1), right to (5,2), right to (5,3), up to (4,3), up to (3,3), right to (3,4), right to (3,5), up to (2,5), up to E.
+
+Yes, that's the path I considered.
+
+Another way: from (3,1), is there a way to go east? Row3, col1 is X, col2 is #, so no. So must go down to row4, col1.
+
+Hmm. So the path seems correct.
+
+Wait, but let's check another angle. When moving from (3,3) to (3,4), direction E. Previous direction was N (coming from (4,3) up to (3,3)). So turning right from N to E. Yes, that's a right turn.
+
+Any other right turns?
+
+Wait, when moving from S down to (2,1), then down to (3,1), etc., all the way down is straight. Then turning right (east) at (5,1), which was a left turn. Then moving east, then turning left (north) at (5,3), then moving north, then turning right to east at (3,3), then east, then turning left to north at (3,5).
+
+Wait, so only one right turn. So answer A.1?
+
+But let's check again. Maybe I messed up the direction.
+
+Alternative approach: draw the path and note directions.
+
+Let's list each segment with direction:
+
+1. (1,1) → (2,1): down (S)
+
+2. (2,1) → (3,1): down (S)
+
+3. (3,1) → (4,1): down (S)
+
+4. (4,1) → (5,1): down (S)
+
+5. (5,1) → (5,2): right (E)
+
+6. (5,2) → (5,3): right (E)
+
+7. (5,3) → (4,3): up (N)
+
+8. (4,3) → (3,3): up (N)
+
+9. (3,3) → (3,4): right (E)
+
+10. (3,4) → (3,5): right (E)
+
+11. (3,5) → (2,5): up (N)
+
+12. (2,5) → (1,5): up (N)
+
+Now, the direction changes occur between segments:
+
+Between 4 and 5: S → E. Left turn.
+
+Between 6 and 7: E → N. Left turn.
+
+Between 8 and 9: N → E. Right turn.
+
+Between 10 and 11: E → N. Left turn.
+
+So only one right turn. So answer A.1.
+
+But wait, the options are A.1, B.5, C.8, D.2. So A is an option. But I need to be sure.
+
+Wait, maybe I made a mistake in the direction change between 8 and 9.
+
+Segment 8: moving up (N) to (3,3). Then segment 9: moving east (E). So direction changes from N to E. A right turn. Correct.
+
+Another way: if you're facing north (up), a right turn would face east. Yes. So that's a right turn.
+
+Any other right turns?
+
+Between segment 5 and 6: no, same direction.
+
+Between segment 7 and 8: same direction (N).
+
+Between segment 9 and 10: same direction (E).
+
+Between 10 and 11: E to N, left turn.
+
+So only one right turn. So answer should be A.1.
+
+But wait, the problem says "the path marked by X". Let me confirm that all the Xs are used. The path I took uses all the Xs in the path, but maybe there's a different path that uses more Xs and has more turns.
+
+Wait, but the Xs are the specific route to follow. So the path must be the one that connects S to E through Xs. There might be only one such path.
+
+Let's check if there's another possible path.
+
+From row5, col3 (X), could we go east? (5,4) is space, so no. So must go up.
+
+Another possibility: from row3, col5 (X), can we go east? No, col6 is #. So must go up to row2, col5.
+
+Hmm. Seems like the path I traced is the only one.
+
+Wait, another check: row3, col4 is X. So (3,4) is part of the path. Yes.
+
+Wait, but let's count the turns again.
+
+Wait, after moving up from (5,3) to (4,3) to (3,3), then east to (3,4), that's a right turn. Then east again. Then up to (2,5). So only one right turn.
+
+But maybe I missed a turn. Let's think about the movement directions step by step.
+
+Let's assign a direction to each move:
+
+Move 1: down (S)
+
+Move 2: down (S) → no turn
+
+Move 3: down (S) → no
+
+Move 4: down (S) → no
+
+Move 5: right (E) → from S to E: left turn
+
+Move 6: right (E) → no turn
+
+Move 7: up (N) → from E to N: left turn
+
+Move 8: up (N) → no
+
+Move 9: right (E) → from N to E: right turn
+
+Move 10: right (E) → no
+
+Move 11: up (N) → from E to N: left turn
+
+Move 12: up (N) → no
+
+So right turns only at move 9. So total 1.
+
+But the answer options have A.1. So why is the option D.2? Maybe I made a mistake.
+
+Wait, wait. Let's consider the direction when moving from (3,5) to (2,5): up (N), previous direction was east (E). So from E to N is left turn. Correct.
+
+Wait, another possibility: when moving from (2,5) to (1,5), that's up, same direction as previous (up), so no turn.
+
+Hmm.
+
+Wait, perhaps I miscounted the direction change at move 9.
+
+Wait, move 8: moving up to (3,3), direction N.
+
+Move 9: moving east, so direction E. So changing from N to E. Right turn. Yes.
+
+Is there another right turn elsewhere?
+
+Wait, let's think about the very first move. From S (1,1), moving down. But S is at (1,1), and the first move is down. The direction is S. Then, all the way down until (5,1). Then turn east. That's a left turn.
+
+Wait, unless... Wait, the problem says "right turn is defined as a change in movement direction that is 90 degrees clockwise relative to the previous direction."
+
+So when you change direction, if it's a 90-degree clockwise turn from the previous direction, it's a right turn.
+
+So previous direction is S (down), new direction is E (right). Clockwise from S (down) would be W (west), so E is counterclockwise from S. So it's a left turn. Which matches earlier analysis.
+
+Another check: from N, right turn to E. Yes.
+
+So I think the count is correct. Only one right turn.
+
+But let's see the options. The user lists options A.1, B.5, C.8, D.2. So A is an option.
+
+But maybe the intended answer is different. Let me check again.
+
+Wait, another way: perhaps the path is different.
+
+Wait, row3, col3 is X. Is there a way to go from (3,1) east? No, because row3, col2 is #. So must go down.
+
+Wait, what if from (3,3), instead of going east, we go north? But (2,3) is # (row2: #X###X#, so col3 is #), so can't go north from (3,3).
+
+Hmm.
+
+Wait, maybe I made a mistake in the path. Let's try to see.
+
+Alternative path: S → down to (2,1), right? No, (2,2) is #. Can't go right. So must go down.
+
+Another idea: after (5,3), instead of going up, is there another way? (5,3) can go east to (5,4), but (5,4) is space, not X. So no. So must go up.
+
+So path seems correct.
+
+Wait, perhaps the turn count is different. Let's think of the turns as you traverse the path.
+
+Starting direction: down.
+
+After four downs, then turn left (east), then two rights, then turn left (north), then two ups, then turn right (east), then two rights, then turn left (north), then two ups.
+
+Wait, the turns are:
+
+1. Down to east: left
+
+2. East to north: left
+
+3. North to east: right
+
+4. East to north: left
+
+So three turns, one right.
+
+Yes, only one right turn.
+
+Hmm. So answer should be A.1.
+
+But I'm a bit unsure because sometimes these puzzles can have tricks. Let's see the maze again.
+
+Wait, looking at row3: #X#XXX#, so columns 1,3,4,5 are X.
+
+Row4: #X#X### → columns 1 and 3.
+
+Row5: #XXX  # → columns 1,2,3.
+
+So another possible path: from (3,3), could we go east to (3,4), (3,5), then up to (2,5), which is what we did.
+
+I think the path is correct.
+
+Wait, but let's count the number of direction changes again:
+
+- After the initial four downs (no turns), then a left turn (east), then no turn for two steps, then a left turn (north), then no turn, then a right turn (east), then no turn, then a left turn (north), then no turn.
+
+So right turns: 1.
+
+Thus, the answer should be A.1.
+
+But why is option D.2 present? Maybe I missed a turn.
+
+Wait, wait. When moving from (3,3) to (3,4), direction east. Previous direction was north (coming from (4,3) to (3,3)). So turning right to east.
+
+Then, moving east to (3,4), then east to (3,5), then up to (2,5). When moving from (3,5) up, direction changes from east to north: left turn.
+
+But what about when moving from (5,3) to (4,3) (up), previous direction was east. So east to north: left turn.
+
+Another angle: let's list all the turns with their type:
+
+1. S → E: left
+
+2. E → N: left
+
+3. N → E: right
+
+4. E → N: left
+
+Total right turns: 1.
+
+Yes. So answer A.
+
+But the problem statement says "the path marked by X". Are all the Xs in the path? Let's see:
+
+Path X positions:
+
+(2,1), (3,1), (4,1), (5,1), (5,2), (5,3), (4,3), (3,3), (3,4), (3,5), (2,5).
+
+Check if all these are Xs:
+
+Row2: (2,1) X, (2,5) X – yes.
+
+Row3: (3,1), (3,3), (3,4), (3,5) X – yes.
+
+Row4: (4,1), (4,3) X – yes.
+
+Row5: (5,1), (5,2), (5,3) X – yes.
+
+So all the Xs along the path are used. So the path is correct.
+
+Therefore, only one right turn. Answer A.1.
+
+But wait, looking back at the problem statement: "the path marked by X". So the Xs are the path, so we have to follow Xs, which we did.
+
+Hmm. Maybe the answer is A.1. But I'm a bit confused because sometimes these questions can have nuances.
+
+Wait, another check: when you move from (3,5) to (2,5), direction north. Previous direction was east. So turning left. Correct.
+
+And the move from (3,3) to (3,4) is east, previous direction north: right turn.
+
+Yes. So only one right turn.
+
+I think the answer is A.1.
+</think>
+Let me trace the step by step solution through the maze in the following format:
+<format>
+>>> LOCATE START AND EXIT:
+    S position: (row, col)
+    E position: (row, col)
+
+>>> STEP 1: Move DOWN from (r1, c1) to (r2, c2)
+    Current position: (r2, c2)
+    Previous direction: —
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+[... continue for all steps until reaching E ...]
+
+>>> FINAL ANSWER:
+    \boxed{LETTER}
+</format>
+>>> LOCATE START AND EXIT:
+    S position: (1, 1)
+    E position: (1, 5)
+
+>>> STEP 1: Move DOWN from (1,1) to (2,1)
+    Current position: (2,1)
+    Previous direction: —
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 2: Move DOWN from (2,1) to (3,1)
+    Current position: (3,1)
+    Previous direction: DOWN
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 3: Move DOWN from (3,1) to (4,1)
+    Current position: (4,1)
+    Previous direction: DOWN
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 4: Move DOWN from (4,1) to (5,1)
+    Current position: (5,1)
+    Previous direction: DOWN
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 5: Move RIGHT from (5,1) to (5,2)
+    Current position: (5,2)
+    Previous direction: DOWN
+    Current direction: RIGHT
+    Turn type: LEFT
+    Running count: Right=0, Left=1
+
+>>> STEP 6: Move RIGHT from (5,2) to (5,3)
+    Current position: (5,3)
+    Previous direction: RIGHT
+    Current direction: RIGHT
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=1
+
+>>> STEP 7: Move UP from (5,3) to (4,3)
+    Current position: (4,3)
+    Previous direction: RIGHT
+    Current direction: UP
+    Turn type: LEFT
+    Running count: Right=0, Left=2
+
+>>> STEP 8: Move UP from (4,3) to (3,3)
+    Current position: (3,3)
+    Previous direction: UP
+    Current direction: UP
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=2
+
+>>> STEP 9: Move RIGHT from (3,3) to (3,4)
+    Current position: (3,4)
+    Previous direction: UP
+    Current direction: RIGHT
+    Turn type: RIGHT
+    Running count: Right=1, Left=2
+
+>>> STEP 10: Move RIGHT from (3,4) to (3,5)
+    Current position: (3,5)
+    Previous direction: RIGHT
+    Current direction: RIGHT
+    Turn type: STRAIGHT
+    Running count: Right=1, Left=2
+
+>>> STEP 11: Move UP from (3,5) to (2,5)
+    Current position: (2,5)
+    Previous direction: RIGHT
+    Current direction: UP
+    Turn type: LEFT
+    Running count: Right=1, Left=3
+
+>>> STEP 12: Move UP from (2,5) to (1,5) [E]
+    Current position: (1,5)
+    Previous direction: UP
+    Current direction: UP
+    Turn type: STRAIGHT
+    Running count: Right=1, Left=3
+
+>>> FINAL ANSWER:
+    \boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3036.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3036.txt
new file mode 100644
index 00000000..08fd3f7a
--- /dev/null
+++ b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3036.txt
@@ -0,0 +1,579 @@
+Okay, let's try to figure out how many right turns there are in the path from S to E marked by X. First, I need to visualize the maze properly. The ASCII maze given is:
+
+#######
+#  XXX#
+###X#X#
+#  E#X#
+#####X#
+#SXXXX#
+#######
+
+Hmm, let's parse each line. Let's number the rows from top to bottom as rows 0 to 5 maybe. Wait, but maybe it's better to write them out with coordinates.
+
+Let me write each line with row numbers:
+
+Row 0: #######
+Row 1: #  XXX#
+Row 2: ###X#X#
+Row 3: #  E#X#
+Row 4: #####X#
+Row 5: #SXXXX#
+Row 6: #######
+
+Wait, actually, the maze as given has 7 lines (including the top and bottom walls), so rows 0 to 6. Let's check:
+
+Line 1: ####### (row 0)
+Line 2: #  XXX# (row 1)
+Line 3: ###X#X# (row 2)
+Line 4: #  E#X# (row 3)
+Line 5: #####X# (row 4)
+Line 6: #SXXXX# (row 5)
+Line 7: ####### (row 6)
+
+Wait, the user provided 7 lines. Let's index them as rows 0 to 6.
+
+Now, the maze:
+
+Row 0: 0 1 2 3 4 5 6
+        # # # # # # #
+
+Row 1: #   X X X #
+So positions (1,1) is space, (1,2) space, (1,3) X, (1,4) X, (1,5) X, (1,6) #
+
+Row 2: ###X#X#
+So row 2: # # # X # X #, so columns 0-6. So (2,3)=X, (2,5)=X
+
+Row 3: #  E#X#
+So (3,2)=space, (3,3)=E? Wait, no: #  E#X#, so columns:
+
+0: #, 1: space, 2: space, 3: E, 4: #, 5: X, 6: # ?
+
+Wait, let's break down each row character by character.
+
+Row 0: "#######" → all walls.
+
+Row 1: "#  XXX#" → characters at positions 0: #, 1: space, 2: space, 3: X, 4: X, 5: X, 6: #.
+
+Row 2: "###X#X#" → 0:#, 1:#, 2:#, 3:X, 4:#, 5:X, 6:#.
+
+Row 3: "#  E#X#" → 0:#, 1:space, 2:space, 3:E, 4:#, 5:X, 6:#.
+
+Row 4: "#####X#" → 0:#,1:#,2:#,3:#,4:#,5:X,6:#.
+
+Row 5: "#SXXXX#" → 0:#,1:S,2:X,3:X,4:X,5:X,6:#.
+
+Row 6: "#######".
+
+Wait, but the problem states that X marks the specific route to follow. So the path from S to E must follow X's. Let's find S and E.
+
+S is in row 5, column 1 (since row 5 is "#SXXXX#", so index 1 is S).
+
+E is in row 3, column 3 (row 3: "#  E#X#", so the E is at position 3).
+
+Now, the path is marked by X. So we need to trace from S (row5, col1) following X's to E (row3, col3).
+
+Wait, but S is at (5,1), and the X's are the path. Let's see the X's in the maze.
+
+Looking at each X:
+
+Row1: cols3,4,5 are X.
+
+Row2: col3 and 5 are X.
+
+Row3: col5 is X.
+
+Row4: col5 is X.
+
+Row5: cols2,3,4,5 are X.
+
+Wait, row5: #SXXXX#, so col1 is S, then col2-5 are X. So S is at (5,1), and the X path starts at (5,2) maybe? Wait, but S is the start, so the path from S should go through X's. Wait, but S is not X. Wait, the problem says "the path marked by X", so the correct path to follow is along X's. So S is the starting point, which is adjacent to X's perhaps.
+
+Wait, the maze says:
+
+- S denotes the starting point.
+
+- X marks the specific route you should follow.
+
+So the path from S to E must move through X's. So S is not part of the X path, but the path starts from S, then follows X's. Wait, but in the maze, S is at (5,1), and the cell to the right (5,2) is X. So the path starts at S (5,1), then moves to (5,2), which is X, then continues along X's.
+
+Similarly, E is at (3,3), which is adjacent to X's perhaps. Let's see.
+
+Let's map all X positions:
+
+Row 1: (1,3), (1,4), (1,5)
+
+Row 2: (2,3), (2,5)
+
+Row 3: (3,5)
+
+Row 4: (4,5)
+
+Row 5: (5,2), (5,3), (5,4), (5,5)
+
+Wait, row5: #SXXXX#, so columns 0:#, 1:S, 2:X, 3:X, 4:X, 5:X, 6:#. So (5,2) to (5,5) are X.
+
+So the path from S (5,1) must go to (5,2), then follow X's.
+
+Let's try to trace the path.
+
+Starting at S (5,1). The first move must be to (5,2), since that's the only X adjacent to S (since S is at (5,1), up would be row4, col1 which is # (row4 is #####X#, so col1 is #), down is row6, which is wall, left is col0 (wall), right is (5,2), which is X.
+
+So first direction is right (east) from S.
+
+Now, from (5,2), we need to follow X's. Let's list all X positions and see how they connect.
+
+Let's list all X cells:
+
+Row 1: (1,3), (1,4), (1,5)
+
+Row 2: (2,3), (2,5)
+
+Row 3: (3,5)
+
+Row 4: (4,5)
+
+Row 5: (5,2), (5,3), (5,4), (5,5)
+
+So the X's form a path. Let's see how they connect.
+
+Starting at (5,2), which is X. From (5,2), possible moves: up (row4, col2), but row4 col2 is # (row4: #####X#, so col2 is #), left is (5,1) which is S (not X), right is (5,3), which is X. So path continues east to (5,3).
+
+Then from (5,3), east to (5,4), then east to (5,5). Now at (5,5). From here, up (row4, col5) is X (row4 col5 is X), so move up to (4,5).
+
+From (4,5), up to row3, col5, which is X (row3 col5 is X), so move up to (3,5).
+
+From (3,5), up to row2, col5, which is X (row2 col5 is X), move up to (2,5).
+
+From (2,5), up to row1, col5, which is X (row1 col5 is X), move up to (1,5).
+
+From (1,5), left to (1,4), then left to (1,3). Wait, but row1 col3 is X. Wait, but row1 col3,4,5 are X. So from (1,5), moving west to (1,4), then west to (1,3). Now at (1,3). From (1,3), can we go down? Row2 col3 is X (row2 col3 is X), so down to (2,3).
+
+From (2,3), down to row3, col3. Wait, row3 col3 is E. Wait, row3: "#  E#X#", so col3 is E. So (3,3) is E. So from (2,3), moving down would reach (3,3), which is E.
+
+Wait, but according to the path, we need to follow X's. So the path should end at E. So the X path leads to E. So let's confirm:
+
+The path should be:
+
+Start at S (5,1) → move east to (5,2) [X], then east to (5,3), east to (5,4), east to (5,5), north to (4,5), north to (3,5), north to (2,5), north to (1,5), west to (1,4), west to (1,3), south to (2,3), south to (3,3)=E.
+
+Wait, but (2,3) is X, then moving south from (2,3) to (3,3), which is E. So the path is through X's until the last step to E.
+
+So the path of X's is from (5,2) to (5,5), then up to (4,5), (3,5), (2,5), (1,5), then left to (1,4), (1,3), then down to (2,3), then down to E.
+
+Wait, but (2,3) is X, so moving from (2,3) to (3,3) (E) is the last step. So the X path ends at (2,3), then E is adjacent.
+
+Now, we need to track the directions taken along the X path, and count the right turns.
+
+First, let's list the path step by step, noting the direction of movement between each cell.
+
+Starting at S (5,1), first move is east (right) to (5,2). So first direction: east.
+
+Now, from (5,2) to (5,3): east again. No turn.
+
+(5,3) to (5,4): east.
+
+(5,4) to (5,5): east.
+
+Now, from (5,5) to (4,5): north. So direction changes from east to north. What's the turn here? If previous direction was east, moving north would be a left turn (since east to north is 90 degrees counterclockwise, i.e., left turn). Wait, need to confirm.
+
+Movement direction: east (right), then moving north (up). So changing direction from east to north. To go from east to north, you turn left (since east is right, north is up; turning left from east would face north). So this is a left turn.
+
+But we need to count right turns. So this would be a left turn, not counted.
+
+Next, from (4,5) to (3,5): north again. No turn.
+
+(3,5) to (2,5): north.
+
+(2,5) to (1,5): north.
+
+Now, from (1,5) to (1,4): west. Previous direction was north, now moving west. From north to west is a left turn (since north to west is 90 degrees left). Wait, direction change: north to west. If facing north, turning left would face west. So yes, left turn. Not a right turn.
+
+Then from (1,4) to (1,3): west again. No turn.
+
+Now, from (1,3) to (2,3): south. Previous direction was west, now moving south. From west to south: turning right? Let's see. Facing west, turning right would face north? Wait, no.
+
+Wait, direction changes:
+
+Current direction before the move: west (from (1,4) to (1,3), direction west).
+
+Next move is south (from (1,3) to (2,3)). So changing direction from west to south.
+
+What's the turn here? If facing west, to face south, you turn right (90 degrees clockwise). Let's think:
+
+- Facing west (left), turning right (clockwise) would turn to north? Wait, no.
+
+Wait, standard directions:
+
+- East: right
+
+- North: up
+
+- West: left
+
+- South: down
+
+If you're moving west (left), then turning right (clockwise) would face north? Wait, no.
+
+Wait, imagine facing west (towards left on the screen). A right turn (clockwise) would make you face north (upwards), because clockwise from west would be north. Wait, no:
+
+Wait, if you're facing west (left), a right turn (clockwise) would turn you to face north? Let's see:
+
+- Facing north: right turn → east.
+
+- Facing east: right turn → south.
+
+- Facing south: right turn → west.
+
+- Facing west: right turn → north.
+
+Yes. So west to north is right turn.
+
+But here, we're moving west, then turning to south. Wait, direction change from west to south. So from west, to go south, which direction is the turn?
+
+West to south: if facing west, to face south, you need to turn right twice (180), but actually, the turn would be a right turn of 90 degrees? Wait, no.
+
+Wait, the direction of movement: previous direction was west (arriving at (1,3) from the east, i.e., moving west), now moving south. So the direction changed from west to south.
+
+To go from west to south, what's the turn?
+
+West → south: clockwise 90 degrees? Let's see:
+
+From west, clockwise 90 degrees would be north? Wait, no. Wait, standard compass:
+
+Clockwise turns:
+
+North → East → South → West → North.
+
+So if you're facing west, a clockwise (right) turn would face north. A counterclockwise (left) turn would face south.
+
+Wait, yes: if facing west, left turn (counterclockwise) would be to face south, and right turn would be to face north.
+
+Wait, let's confirm:
+
+- Facing north, left turn → west.
+
+- Facing north, right turn → east.
+
+Similarly, facing west:
+
+- Left turn (counterclockwise) → south.
+
+- Right turn (clockwise) → north.
+
+So if we were moving west (direction west), then change direction to south, that's a left turn.
+
+Wait, because from west, left turn (counterclockwise) would face south. So moving from west to south is a left turn.
+
+Hmm, this is a bit confusing. Let's think in terms of direction vectors.
+
+Previous direction: west (vector (-1,0) in terms of movement? Wait, maybe better to use compass directions.
+
+Let's say:
+
+- East: +x direction.
+
+- North: +y direction.
+
+Wait, but in grid terms, row numbers usually increase downward, so row 0 is top, row 5 is lower. So moving north would decrease row number, south increases row.
+
+But for direction purposes, maybe better to consider:
+
+When moving from cell A to cell B, the direction is based on the movement.
+
+So from (5,5) to (4,5): moving north (row decreases).
+
+From (1,5) to (1,4): moving west (column decreases).
+
+From (1,3) to (2,3): moving south (row increases).
+
+So previous direction before the move to (2,3) was west (since we were moving west to get to (1,3)), then the next move is south.
+
+So direction changed from west to south.
+
+To find the turn type: when moving west, then moving south, what's the turn.
+
+Imagine facing west (left), then turning to face south (down). To face south from west, you turn left (counterclockwise), because west → south is left.
+
+Yes: west to south is left turn.
+
+So this would be a left turn, not a right.
+
+Wait, but let's make sure. If you're moving west (left), to start moving south (down), you turn left (since turning left from west would face south). So yes, left turn.
+
+So this move would be a left turn, not counted.
+
+Then from (2,3) to (3,3): south again. No turn.
+
+So let's recap all the direction changes:
+
+Start at S (5,1), first move east (to (5,2)). So initial direction east.
+
+Path segments:
+
+1. (5,1) → (5,2): direction east.
+
+2. (5,2) → (5,3): east (no turn).
+
+3. (5,3) → (5,4): east.
+
+4. (5,4) → (5,5): east.
+
+5. (5,5) → (4,5): north (change from east → north: left turn).
+
+6. (4,5) → (3,5): north.
+
+7. (3,5) → (2,5): north.
+
+8. (2,5) → (1,5): north.
+
+9. (1,5) → (1,4): west (change from north → west: left turn).
+
+10. (1,4) → (1,3): west.
+
+11. (1,3) → (2,3): south (change from west → south: left turn).
+
+12. (2,3) → (3,3): south (to E).
+
+Now, let's list all the direction changes (i.e., where the direction changes from the previous step):
+
+- After step 4 (east to north at step 5): left turn.
+
+- After step 8 (north to west at step 9): left turn.
+
+- After step 10 (west to south at step 11): left turn.
+
+Wait, so all three direction changes are left turns. Are there any right turns?
+
+Wait, but according to this, there are zero right turns. But the options include B.0. But wait, maybe I made a mistake.
+
+Wait, let's re-examine the direction changes.
+
+Wait, step 5: moving north after east. East → north: left turn (as east to north is left).
+
+Step 9: moving west after north. North → west: left turn (north to west is left).
+
+Step 11: moving south after west. West → south: left turn.
+
+So all three direction changes are left turns. So no right turns. So answer would be B.0.
+
+But wait, the options are A.4, B.0, C.3, D.5. Hmm.
+
+Wait, but maybe I messed up the path. Let's confirm the path again.
+
+Alternative path: perhaps the path doesn't go all the way to (1,3), but maybe another way.
+
+Wait, let's retrace.
+
+From S (5,1), move east to (5,2), then east to (5,3), east to (5,4), east to (5,5). Then north to (4,5), north to (3,5), north to (2,5), north to (1,5). Now at (1,5). From here, west to (1,4), west to (1,3). Now, (1,3) is X. From (1,3), what's the next step? The only adjacent X's would be south to (2,3), which is X. So yes, down to (2,3), then down to (3,3)=E.
+
+So the path is correct.
+
+Another possibility: maybe from (2,5), instead of going north to (1,5), but no, row2 col5 is X, row1 col5 is X.
+
+Wait, but let's check if there's another path. For example, row2 has X at (2,3) and (2,5). So from (2,5), can we go west to (2,4), but row2 col4 is # (since row2 is ###X#X#, so col4 is #). So no, can't go west. So path must go north to row1.
+
+Hmm. So the path as traced seems correct.
+
+So all direction changes are left turns. So zero right turns. Answer B.0.
+
+Wait, but let's check again.
+
+Wait, another way: when moving from (5,5) north, direction east → north: left turn.
+
+Then moving west from north: north → west is left turn.
+
+West → south: left turn.
+
+No right turns. So answer B.0.
+
+But let's see the options. The problem states "the path marked by X", so maybe I missed a turn.
+
+Wait, wait, when moving from (1,3) south to (2,3), direction west → south. As discussed, left turn.
+
+Wait, but let's consider the direction of movement. The turn is determined by the change in direction. Let's use a different approach.
+
+List the direction of each move:
+
+Move 1: east
+
+Move 2: east
+
+Move 3: east
+
+Move 4: east
+
+Move 5: north
+
+Move 6: north
+
+Move 7: north
+
+Move 8: north
+
+Move 9: west
+
+Move 10: west
+
+Move 11: south
+
+Move 12: south
+
+Now, the direction changes occur between move 4 and 5 (east → north), move 8 and 9 (north → west), move 10 and 11 (west → south).
+
+Each of these changes:
+
+east → north: left turn.
+
+north → west: left turn.
+
+west → south: left turn.
+
+All left. So zero right turns.
+
+Hence, the answer should be B.0.
+
+But wait, the problem says "right turn is defined as a change in movement direction that is 90 degrees clockwise relative to the previous direction."
+
+So east to north: previous direction east. To get to north, turning left (counterclockwise), so not a right turn.
+
+east to south would be a right turn.
+
+north to east: right turn.
+
+But in our case, all changes are left.
+
+Hmm. So the answer is B.0.
+
+But let me check again if there's a different path.
+
+Wait, another way: from (5,5), instead of going north, is there another X? No, row4 col5 is X, which is the only way up.
+
+Wait, what about row2 col3: X. Could the path go from (5,5) up to (4,5), (3,5), (2,5), then west? But row2 col4 is #, so can't go west from (2,5). So must go north to (1,5).
+
+Hmm.
+
+Alternatively, after (1,5), moving west to (1,4), (1,3), then south to (2,3), which is X, then south to E. Yes, that's the path.
+
+Another check: let's count the turns again.
+
+Starting direction: east (from S to first X).
+
+After four east moves, turn north: left.
+
+Then four north moves, turn west: left.
+
+Then two west moves, turn south: left.
+
+Total three left turns, no right turns.
+
+Therefore, the number of right turns is zero. Answer B.
+</think>
+Let me trace the step by step solution through the maze in the following format:
+<format>
+>>> LOCATE START AND EXIT:
+    S position: (row, col)
+    E position: (row, col)
+
+>>> STEP 1: Move DOWN from (r1, c1) to (r2, c2)
+    Current position: (r2, c2)
+    Previous direction: —
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+[... continue for all steps until reaching E ...]
+
+>>> FINAL ANSWER:
+    \boxed{LETTER}
+</format>
+>>> LOCATE START AND EXIT:
+    S position: (5, 1)
+    E position: (3, 3)
+
+>>> STEP 1: Move EAST from (5, 1) to (5, 2)
+    Current position: (5, 2)
+    Previous direction: —
+    Current direction: EAST
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 2: Move EAST from (5, 2) to (5, 3)
+    Current position: (5, 3)
+    Previous direction: EAST
+    Current direction: EAST
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 3: Move EAST from (5, 3) to (5, 4)
+    Current position: (5, 4)
+    Previous direction: EAST
+    Current direction: EAST
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 4: Move EAST from (5, 4) to (5, 5)
+    Current position: (5, 5)
+    Previous direction: EAST
+    Current direction: EAST
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 5: Move NORTH from (5, 5) to (4, 5)
+    Current position: (4, 5)
+    Previous direction: EAST
+    Current direction: NORTH
+    Turn type: LEFT (EAST → NORTH is 90° counterclockwise)
+    Running count: Right=0, Left=1
+
+>>> STEP 6: Move NORTH from (4, 5) to (3, 5)
+    Current position: (3, 5)
+    Previous direction: NORTH
+    Current direction: NORTH
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=1
+
+>>> STEP 7: Move NORTH from (3, 5) to (2, 5)
+    Current position: (2, 5)
+    Previous direction: NORTH
+    Current direction: NORTH
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=1
+
+>>> STEP 8: Move NORTH from (2, 5) to (1, 5)
+    Current position: (1, 5)
+    Previous direction: NORTH
+    Current direction: NORTH
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=1
+
+>>> STEP 9: Move WEST from (1, 5) to (1, 4)
+    Current position: (1, 4)
+    Previous direction: NORTH
+    Current direction: WEST
+    Turn type: LEFT (NORTH → WEST is 90° counterclockwise)
+    Running count: Right=0, Left=2
+
+>>> STEP 10: Move WEST from (1, 4) to (1, 3)
+    Current position: (1, 3)
+    Previous direction: WEST
+    Current direction: WEST
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=2
+
+>>> STEP 11: Move SOUTH from (1, 3) to (2, 3)
+    Current position: (2, 3)
+    Previous direction: WEST
+    Current direction: SOUTH
+    Turn type: LEFT (WEST → SOUTH is 90° counterclockwise)
+    Running count: Right=0, Left=3
+
+>>> STEP 12: Move SOUTH from (2, 3) to (3, 3)
+    Current position: (3, 3)
+    Previous direction: SOUTH
+    Current direction: SOUTH
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=3
+
+>>> FINAL ANSWER:
+    The path contains **0** right turns.
+    \boxed{B}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3047.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3047.txt
new file mode 100644
index 00000000..31e8200f
--- /dev/null
+++ b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3047.txt
@@ -0,0 +1,598 @@
+Okay, let's try to figure out how many right turns there are in the path from S to E marked by X. First, I need to visualize the maze properly. The ASCII maze given is:
+
+#######
+#E#XXS#
+#X#X# #
+#XXX# #
+##### #
+#     #
+#######
+
+Wait, the maze might be a bit hard to read because of the formatting. Let me parse it line by line.
+
+Let's list each row:
+
+Row 0: ####### (all walls)
+Row 1: #E#XXS#
+Row 2: #X#X# #
+Row 3: #XXX# #
+Row 4: ##### #
+Row 5: #     #
+Row 6: #######
+
+Hmm, but the actual maze might be 7 rows high (from 0 to 6), each row having 7 characters (since the first row is 7 #s). Let's check each row's length.
+
+Row 0: 7 characters (all #)
+Row 1: #E#XXS# → let's count: # E # X X S # → that's 7 characters.
+Row 2: #X#X# # → # X # X # space # → 7 characters.
+Row 3: #XXX# # → # X X X # space # → 7 characters.
+Row 4: ##### # → # # # # # space # → 7 characters.
+Row 5: #     # → # followed by 5 spaces and # → 7 characters.
+Row 6: ####### → 7 #s.
+
+Okay, so it's a 7x7 grid.
+
+Now, the starting point S is in row 1. Let's find coordinates. Let's index rows from top (0) to bottom (6), columns left (0) to right (6).
+
+Row 1 (second row from top): positions 0: #, 1: E, 2: #, 3: X, 4: X, 5: S, 6: #.
+
+Wait, wait, row 1 is "#E#XXS#", so breaking it down:
+
+Index: 0 1 2 3 4 5 6
+Char:  # E # X X S #
+
+So S is at (row 1, column 5). E is at (row 1, column 1).
+
+Wait, but the exit is E. So the starting point S is at (1,5), and E is at (1,1). But the path marked by X needs to be followed. Wait, the Xs are the specific route. So the correct path from S to E is along the Xs.
+
+Wait, but the problem says: "the path marked by X". So the Xs are the correct route. So we need to trace the path of Xs from S to E.
+
+Wait, but S is a separate character. So the path starts at S, then follows Xs to reach E. Wait, but looking at the maze:
+
+Row 1: #E#XXS# → So the E is at (1,1), then there's a #, then XXS. So the S is at (1,5), and the Xs are at (1,3), (1,4), and then maybe other Xs.
+
+Wait, let's map all the Xs.
+
+Row 1: columns 3 and 4 are X.
+
+Row 2: #X#X# # → columns 1 and 3 are X.
+
+Row 3: #XXX# # → columns 1,2,3 are X.
+
+Row 4: ##### # → no Xs.
+
+Row 5: #     # → no Xs.
+
+So the Xs are located at:
+
+Row 1: (1,3), (1,4)
+
+Row 2: (2,1), (2,3)
+
+Row 3: (3,1), (3,2), (3,3)
+
+So the path from S (which is at (1,5)) to E (1,1) must follow these Xs. Wait, but how does the path connect?
+
+Wait, S is at (1,5). To get to E (1,1), the path must go through Xs. Let's see:
+
+Starting at S (1,5), which is next to X at (1,4). So from S, moving left to (1,4), which is X. Then where?
+
+(1,4) is X. Let's see neighbors. Let's consider possible directions.
+
+Wait, but the path marked by Xs is the correct path, so we need to see how the Xs are connected. Let's look for a continuous path of Xs from S to E.
+
+Wait, S is at (1,5), and the Xs near S are (1,4) and (1,3). So starting at S (1,5), moving left to (1,4), then left to (1,3). But then, row 1, column 3 is X. Then, what's next? Let's check the Xs.
+
+Row 1, column 3: X. To the left of that is column 2, which is #, so can't go left. Down from (1,3) is row 2, column 3: which in row 2, column 3 is X (row 2: #X#X# # → column 3 is X). So from (1,3), moving down to (2,3).
+
+Then, row 2, column 3 is X. Let's check neighbors. Up is (1,3), which is X. Left: column 2 is #. Right: column 4 is # (row 2: #X#X# # → columns 0:#,1:X,2:#,3:X,4:#,5:space,6:#). So row 2, column 3's right is #. Down from row 2, column 3 is row 3, column 3: which is X (row 3: #XXX# # → columns 1,2,3 are X). So moving down to (3,3).
+
+Row 3, column 3: X. Now, neighbors: up is (2,3), left is (3,2) which is X. So possible to go left. Let's see if the path continues left.
+
+Row 3: columns 1,2,3 are X. So (3,3) → (3,2) → (3,1). Then row 3, column 1 is X. Now, up from row 3, column 1: row 2, column 1 is X (row 2: #X#X# # → column 1 is X). So moving up from (3,1) to (2,1). Then, row 2, column 1: neighbors. Up to row 1, column 1 is E, but E is the exit. Wait, but E is at (1,1). Wait, row 1, column 1 is E, which is the exit. So from row 2, column 1, moving up to (1,1) would reach E.
+
+Wait, but the path is supposed to be marked by Xs. But E is not an X. So the path should go from S through Xs to E. So the last X before E must be adjacent to E.
+
+Let's reconstruct the path step by step.
+
+Start at S (1,5). The path must go through Xs. So:
+
+S (1,5) → (1,4) X → (1,3) X → (2,3) X → (3,3) X → (3,2) X → (3,1) X → (2,1) X → E (1,1).
+
+Wait, let's check:
+
+From (1,5) S, left to (1,4) X.
+
+Left to (1,3) X.
+
+Down to (2,3) X.
+
+Down to (3,3) X.
+
+Left to (3,2) X.
+
+Left to (3,1) X.
+
+Up to (2,1) X.
+
+Up to (1,1) E.
+
+So the Xs in the path are:
+
+(1,4), (1,3), (2,3), (3,3), (3,2), (3,1), (2,1).
+
+Wait, but S is not an X, but the path starts at S and follows Xs. So the first step is from S to (1,4), which is X. So the path marked by Xs starts at (1,4) and goes through the other Xs.
+
+But the problem says "the path marked by X", so the Xs are the path. So the entire path from S to E must be the Xs connected. But S and E are separate, so the path is S → X → X → ... → X → E.
+
+So the path sequence of coordinates (including S and E) would be:
+
+S (1,5) → (1,4) → (1,3) → (2,3) → (3,3) → (3,2) → (3,1) → (2,1) → E (1,1).
+
+Now, to find the number of right turns. To do this, we need to track the direction of movement at each step and see when a right turn occurs.
+
+First, we need to determine the direction of each move. Let's list each consecutive step and the direction.
+
+Let's list the path as a sequence of points:
+
+1. S: (1,5) - starting point, no direction yet.
+
+2. (1,4): move left from S. So direction is left (west).
+
+3. (1,3): move left again (west). Still same direction.
+
+4. (2,3): move down (south) from (1,3) to (2,3).
+
+5. (3,3): move down (south) again.
+
+6. (3,2): move left (west).
+
+7. (3,1): move left (west).
+
+8. (2,1): move up (north).
+
+9. E: (1,1): move up (north).
+
+Wait, but we need to look at each turn, which is a change in direction. A turn occurs when consecutive moves have different directions.
+
+Let's list the directions between each consecutive point.
+
+From S (1,5) to (1,4): direction is left (west).
+
+(1,4) to (1,3): direction still left (west). No turn.
+
+(1,3) to (2,3): direction down (south). So from west to south: what's the turn here?
+
+Previous direction was west (left), now moving south (down). To go from west to south, that's a right turn. Because if you're facing west, turning right would face south. Wait: directions: north, east, south, west. Wait, standard compass directions.
+
+Wait, when moving left (west), the direction is west. If you then move south, that's a right turn from west. Because west → right turn is south. Let's confirm:
+
+If you're facing west (left), a right turn would make you face south (down), yes. Because right turn from west is south. Left turn would be north.
+
+So moving from west direction to south direction is a right turn.
+
+So step 3 to 4: direction changes from west to south: right turn.
+
+Next, (2,3) to (3,3): direction south (down), same as previous. No turn.
+
+(3,3) to (3,2): direction west (left). Previous direction south → west. What's the turn here?
+
+Facing south, turning left would be east? Wait, no. Wait, direction changes.
+
+If current direction is south (moving down), then a right turn would be west (since south → right turn → west?), wait maybe better to use standard.
+
+Let's think of directions as:
+
+North: up (row decreases)
+
+East: right (column increases)
+
+South: down (row increases)
+
+West: left (column decreases)
+
+So when moving south (row increases), a right turn would be west? Wait, no. Wait, if you're moving south (facing south), a right turn would face west? Wait, no. Let's think in terms of 2D coordinates.
+
+Imagine facing south (down). A right turn would turn towards the right, which would be west? Wait, no. If you're facing south (down), your right hand would be pointing west. Wait, no. Let's use a clock:
+
+- Facing north: right turn is east.
+
+- Facing east: right turn is south.
+
+- Facing south: right turn is west.
+
+- Facing west: right turn is north.
+
+Yes, that's correct. So each right turn is clockwise 90 degrees.
+
+So:
+
+Current direction: south. Right turn → west.
+
+Current direction: west. Right turn → north.
+
+Current direction: north. Right turn → east.
+
+Current direction: east. Right turn → south.
+
+Similarly, left turn is counterclockwise.
+
+So in our case:
+
+From (1,3) to (2,3): direction south (after moving left, then down).
+
+Previous direction was west (from S to (1,4) to (1,3)), then moving down (south). So from west to south: that's a right turn (west → right → south).
+
+Then moving south to (3,3): direction still south. No turn.
+
+Then moving west (from (3,3) to (3,2)): direction changes from south to west. So current direction was south, now west. What's the turn?
+
+South to west: right turn (south's right turn is west), so yes, right turn.
+
+Wait, south → right turn → west. So that's a right turn.
+
+Wait, let's confirm:
+
+If facing south (down), a right turn would face west (left), yes. So moving from south to west direction is a right turn.
+
+Wait, but if you're moving south (down), then turning right would make you face west (left), so yes, direction change south → west is a right turn.
+
+Then next step: (3,2) to (3,1): direction west (still), no turn.
+
+Then (3,1) to (2,1): direction north (up). Previous direction west → north. So west to north: what's the turn?
+
+West → north: a left turn. Because west's left turn is south? Wait no.
+
+Wait, facing west (left), left turn would be south? Wait, let's use the right turn directions.
+
+If direction is west, right turn → north. Wait no:
+
+Wait, facing west (left), right turn (clockwise) would be to face north. Wait:
+
+- Facing north: right → east.
+
+- Facing east: right → south.
+
+- Facing south: right → west.
+
+- Facing west: right → north.
+
+Ah, yes. So west's right turn is north. So moving from west to north would be a right turn.
+
+Wait, but our current direction before the move was west (moving from (3,2) to (3,1), direction west), then moving to (2,1), which is up (north). So direction changes from west to north. So west → north: right turn.
+
+Wait, that's a right turn.
+
+Wait, but according to the direction changes:
+
+Previous direction: west.
+
+New direction: north.
+
+To get from west to north, turning right (clockwise) would be correct. Because west → right → north. Yes.
+
+So that's a right turn.
+
+Then, from (2,1) to E (1,1): direction north (up), same as previous, so no turn.
+
+Wait, let's recap all direction changes:
+
+1. S (1,5) → (1,4): direction west.
+
+2. (1,4) → (1,3): direction west (no turn).
+
+3. (1,3) → (2,3): direction south. Change from west to south: right turn. (1st right turn)
+
+4. (2,3) → (3,3): direction south (no turn).
+
+5. (3,3) → (3,2): direction west. Change from south to west: right turn. (2nd right turn)
+
+6. (3,2) → (3,1): direction west (no turn).
+
+7. (3,1) → (2,1): direction north. Change from west to north: right turn. (3rd right turn)
+
+8. (2,1) → E: direction north (no turn).
+
+So total of 3 right turns.
+
+Wait, but let's verify each step.
+
+Wait, I made an error at Step 3. Turn type RIGHT_TURN should be LEFT_TURN: going from WEST to SOUTH is a counterclockwise (LEFT turn) rotation. Right turn count 1 should be 0; Left turn count 0 should be 1. Let me re-trace the path more carefully from the correct position.
+Wait, I think I confused the direction of the turn.
+
+Let's use a different approach. Let's assign direction numbers.
+
+Let's say:
+
+North = 0 (up)
+
+East = 1 (right)
+
+South = 2 (down)
+
+West = 3 (left)
+
+A right turn is +1 direction (mod 4), left turn is -1 (or +3).
+
+So if moving in direction d, then next direction d' = d + 1 (right) or d - 1 (left).
+
+So let's map each step's direction.
+
+First segment: S (1,5) to (1,4): moving west. Direction = 3 (west).
+
+Second segment: (1,4) to (1,3): still west. Direction remains 3.
+
+Third segment: (1,3) to (2,3): moving south. Direction = 2.
+
+So previous direction 3 (west), new direction 2 (south). Change of direction: 2 - 3 = -1 mod 4 = 3. Which would be a left turn (since left is -1). Wait, if direction changed from 3 to 2, that's a decrease of 1, which would be a left turn.
+
+Wait, but according to the direction numbering:
+
+Current direction 3 (west), next direction 2 (south). To go from west to south, you turn left (counterclockwise), because west → left → south. Wait, no:
+
+Wait, facing west (left), turning left (counterclockwise) would face south. Yes. Because:
+
+West (left) → left turn (counterclockwise) → south (down).
+
+Whereas right turn would be north.
+
+Wait, this is where I messed up earlier.
+
+Let's clarify with a compass:
+
+Imagine standing facing west (left). Your left hand would point south, right hand north.
+
+Wait, if you're facing west (towards the left), then:
+
+- A left turn would make you face south (since turning left from west would point you down).
+
+- A right turn would make you face north (up).
+
+Wait, that's different from my previous assumption.
+
+Wait, let's think of it as:
+
+If you're moving west (left), your current direction is west. To turn left (counterclockwise), you turn towards south. To turn right (clockwise), you turn towards north.
+
+Yes, that's correct. Because:
+
+- Facing north: left turn → west, right turn → east.
+
+- Facing east: left → north, right → south.
+
+- Facing south: left → east, right → west.
+
+- Facing west: left → south, right → north.
+
+Ah! So my earlier analysis was wrong. So:
+
+West → left turn → south.
+
+West → right turn → north.
+
+So in the step from west (moving left) to south (moving down), that's a left turn.
+
+Similarly, south → west would be a right turn.
+
+Let's redo the direction changes with this correct understanding.
+
+Path segments:
+
+1. S (1,5) → (1,4): direction west (3).
+
+2. (1,4) → (1,3): west (3), no turn.
+
+3. (1,3) → (2,3): direction south (2). Previous direction 3 (west), new direction 2 (south). Change: west to south. As per above, west → left turn to south. So this is a left turn.
+
+4. (2,3) → (3,3): south (2), same direction, no turn.
+
+5. (3,3) → (3,2): direction west (3). Previous direction south (2), new direction west (3). South to west: what's the turn?
+
+South → west: facing south, turning right would go to west (since south → right turn → west). Yes: south's right turn is west.
+
+So direction change from 2 to 3. 3 - 2 = +1 → right turn.
+
+6. (3,2) → (3,1): west (3), no turn.
+
+7. (3,1) → (2,1): direction north (0). Previous direction west (3), new direction north (0). West to north: west's right turn is north (as west → right → north). So direction change 0 - 3 = -3 = +1 mod 4. So right turn.
+
+8. (2,1) → E (1,1): north (0), no turn.
+
+Now, let's count right turns.
+
+Step 5: South → west: right turn.
+
+Step 7: West → north: right turn.
+
+Wait, but wait step 3 was a left turn. So how many right turns?
+
+Step 5: right.
+
+Step 7: right.
+
+Wait, are there any others?
+
+Wait, let's list all direction changes:
+
+Between step 2 and 3: west → south: left turn.
+
+Between step 4 and 5: south → west: right turn.
+
+Between step 6 and 7: west → north: right turn.
+
+Any others?
+
+Wait, the path:
+
+S → (1,4) → (1,3) → (2,3) → (3,3) → (3,2) → (3,1) → (2,1) → E.
+
+So the direction changes occur at:
+
+After (1,3): west → south (left turn)
+
+After (3,3): south → west (right turn)
+
+After (3,1): west → north (right turn)
+
+So total right turns: 2.
+
+Wait, but according to this, there are two right turns. But let's make sure.
+
+Wait, step 5: moving from (2,3) to (3,3) is south. Then moving to (3,2) is west. So the direction changes at (3,3), from south to west: right turn.
+
+Then at (3,1), moving to (2,1) is north: direction west to north: right turn.
+
+So two right turns.
+
+But wait, the options are A.3, B.5, C.2, D.8. So option C is 2.
+
+But wait, earlier I thought there were three, but after correcting the direction, it's two.
+
+Wait, let's verify again.
+
+Let's map each move's direction:
+
+1. From S (1,5) to (1,4): left (west), direction = west.
+
+2. From (1,4) to (1,3): left (west), same direction.
+
+3. From (1,3) to (2,3): down (south), direction changed from west to south. As west to south is a left turn (since west's left turn is south), this is a left turn.
+
+4. From (2,3) to (3,3): down (south), same direction.
+
+5. From (3,3) to (3,2): left (west), direction changed from south to west. South's right turn is west, so this is a right turn.
+
+6. From (3,2) to (3,1): left (west), same.
+
+7. From (3,1) to (2,1): up (north), direction changed from west to north. West's right turn is north, so right turn.
+
+8. From (2,1) to E: up (north), same.
+
+So right turns at steps 5 and 7 → total of 2. So answer C.2.
+
+But wait, wait. Let's think about the exact path.
+
+Another way: draw the path.
+
+Start at S (1,5).
+
+Go left to (1,4), left to (1,3). Now at (1,3). Then move down to (2,3), down to (3,3). Now at (3,3). Move left to (3,2), left to (3,1). Now at (3,1). Move up to (2,1), up to E.
+
+So the turns:
+
+At (1,3): coming from the east (since moving west to get here), then turning south (down). So direction changed from west to south: left turn.
+
+At (3,3): coming from north (south direction), turning west (left). So direction south to west: right turn.
+
+At (3,1): coming from east (west direction), turning north (up). Direction west to north: right turn.
+
+So two right turns. Hence, answer C.2.
+
+But wait, let's check if there's another turn.
+
+Wait, after moving west to (3,1), then moving north. So the direction changes from west to north. As west to north is a right turn (since west's right is north), that's a right turn.
+
+Yes. So two right turns.
+
+Hmm. But I want to make sure I didn't miss any.
+
+Let me count the direction changes:
+
+The path's direction sequence:
+
+West, West, South, South, West, West, North, North.
+
+The changes happen between:
+
+West → South (left turn)
+
+South → West (right turn)
+
+West → North (right turn)
+
+So two right turns.
+
+Yes, that's correct.
+
+Therefore, the answer should be C.2.
+</think>
+Let me trace the step by step solution through the maze in the following format:
+<format>
+>>> LOCATE START AND EXIT:
+    S position: (row, col)
+    E position: (row, col)
+
+>>> STEP 1: Move DOWN from (r1, c1) to (r2, c2)
+    Current position: (r2, c2)
+    Previous direction: —
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+[... continue for all steps until reaching E ...]
+
+>>> FINAL ANSWER:
+    \boxed{LETTER}
+</format>
+>>> LOCATE START AND EXIT:
+    S position: (1,5)
+    E position: (1,1)
+
+>>> STEP 1: Move LEFT from (1,5) to (1,4)
+    Current position: (1,4)
+    Previous direction: —
+    Current direction: LEFT
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 2: Move LEFT from (1,4) to (1,3)
+    Current position: (1,3)
+    Previous direction: LEFT
+    Current direction: LEFT
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 3: Move DOWN from (1,3) to (2,3)
+    Current position: (2,3)
+    Previous direction: LEFT
+    Current direction: DOWN
+    Turn type: LEFT TURN
+    Running count: Right=0, Left=1
+
+>>> STEP 4: Move DOWN from (2,3) to (3,3)
+    Current position: (3,3)
+    Previous direction: DOWN
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=1
+
+>>> STEP 5: Move LEFT from (3,3) to (3,2)
+    Current position: (3,2)
+    Previous direction: DOWN
+    Current direction: LEFT
+    Turn type: RIGHT TURN
+    Running count: Right=1, Left=1
+
+>>> STEP 6: Move LEFT from (3,2) to (3,1)
+    Current position: (3,1)
+    Previous direction: LEFT
+    Current direction: LEFT
+    Turn type: STRAIGHT
+    Running count: Right=1, Left=1
+
+>>> STEP 7: Move UP from (3,1) to (2,1)
+    Current position: (2,1)
+    Previous direction: LEFT
+    Current direction: UP
+    Turn type: RIGHT TURN
+    Running count: Right=2, Left=1
+
+>>> STEP 8: Move UP from (2,1) to (1,1)
+    Current position: (1,1)
+    Previous direction: UP
+    Current direction: UP
+    Turn type: STRAIGHT
+    Running count: Right=2, Left=1
+
+>>> FINAL ANSWER:
+    \boxed{C}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4005.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4005.txt
new file mode 100644
index 00000000..f004d501
--- /dev/null
+++ b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4005.txt
@@ -0,0 +1,265 @@
+Okay, let's try to figure this out. So, the maze is given in ASCII, and I need to check if the exit E is directly to the left of the starting point S with no vertical displacement. That means they should be in the same row, and E is immediately to the left of S. Let me look at the maze.
+
+First, I'll write down the maze as given:
+
+#######
+#  XXX#
+###X#X#
+# #X#X#
+# #X#X#
+#  E#S#
+#######
+
+Hmm, let's parse each line. Let's count the rows. Starting from the top:
+
+Row 0: #######
+Row 1: #  XXX#
+Row 2: ###X#X#
+Row 3: # #X#X#
+Row 4: # #X#X#
+Row 5: #  E#S#
+Row 6: #######
+
+Wait, but sometimes the rows might be numbered starting from 0 or 1. Let's check each row's content.
+
+Let me list each row with their characters. Let's index columns as well. Let's assume each row is a string, and columns go from left to right.
+
+Row 0: # # # # # # # (7 characters)
+Row 1: #   X X X #
+Row 2: # # # X # X #
+Row 3: #   # X # X #
+Row 4: #   # X # X #
+Row 5: #   E # S #
+Row 6: #######
+
+Wait, maybe I should write each row out with their exact characters.
+
+Original maze:
+
+Line 1: ####### (7 #s)
+Line 2: #  XXX#
+Line 3: ###X#X#
+Line 4: # #X#X#
+Line 5: # #X#X#
+Line 6: #  E#S#
+Line 7: #######
+
+Wait, the user wrote:
+
+#######
+#  XXX#
+###X#X#
+# #X#X#
+# #X#X#
+#  E#S#
+#######
+
+So that's 7 lines. Let's count:
+
+Line 1: ####### (7 characters)
+Line 2: #  XXX# (7 characters: #, space, space, X, X, X, #)
+Line 3: ###X#X# (7 characters: #, #, #, X, #, X, #)
+Line 4: # #X#X# (Wait, the user wrote "# #X#X#", which would be #, space, #, X, #, X, #? Wait, no, the actual string might have different spacing. Wait, the original problem's ASCII:
+
+Here's the maze:
+
+#######
+#  XXX#
+###X#X#
+# #X#X#
+# #X#X#
+#  E#S#
+#######
+
+Wait, let's check each line's length. Each line should be 7 characters, since the first line is 7 #s.
+
+Line 1: 7 #s.
+
+Line 2: #  XXX# → positions 0: #, 1: space, 2: space, 3: X, 4: X, 5: X, 6: # → 7 characters.
+
+Line 3: ###X#X# → #, #, #, X, #, X, # → 7.
+
+Line 4: # #X#X# → Hmm, maybe the user made a typo. Wait, original line is "# #X#X#", which would be #, space, #, X, #, X, #. Wait, but let's count: # (1), space (2), # (3), X (4), # (5), X (6), # (7). Wait, no, maybe the line is written as "# #X#X#" which would be 7 characters. Let's see: # followed by space, then #, X, #, X, #. So yes, 7.
+
+Wait, but in the problem statement, the user wrote:
+
+"# #X#X#" — so perhaps the third character is a space? Wait, no: the line is "# #X#X#", so characters are:
+
+0: #, 1: space, 2: #, 3: X, 4: #, 5: X, 6: #. Wait, but that would be 7 characters. Hmm.
+
+Similarly, line 5 is "# #X#X#" same as line 4.
+
+Line 6: "#  E#S#" → characters: #, space, space, E, #, S, #.
+
+Okay, now let's find where S and E are.
+
+Looking for 'S' and 'E' in the maze.
+
+In line 6 (the second-to-last line, i.e. row 5 when rows are indexed from 0), which is "#  E#S#".
+
+Breaking it down:
+
+Indices 0: #, 1: space, 2: space, 3: E, 4: #, 5: S, 6: #.
+
+Wait, so position (row 5, column 3) is E, and (row 5, column 5) is S? Wait, rows and columns.
+
+Wait, let's index rows starting from 0. So:
+
+Row 0: ####### (top wall)
+Row 1: #  XXX#
+Row 2: ###X#X#
+Row 3: # #X#X#
+Row 4: # #X#X#
+Row 5: #  E#S#
+Row 6: ####### (bottom wall)
+
+So row 5 is the line "#  E#S#".
+
+In row 5, the characters are:
+
+Column 0: #
+Column 1: space
+Column 2: space
+Column 3: E
+Column 4: #
+Column 5: S
+Column 6: #
+
+So S is at (row 5, column 5), and E is at (row 5, column 3).
+
+Wait, but the question is whether E is directly to the left of S with no vertical displacement. So same row, E's column is S's column minus 1.
+
+S is at column 5, E at column 3. So the columns are 3 and 5. The difference is 2. So E is two columns to the left of S, not directly left. Also, between them is column 4 which is a # (wall), so they are not adjacent.
+
+Wait, but wait, the problem says "the exit (E) directly to the left of the starting point (S), with no vertical displacement". So same row, E is immediately left of S (so S is at (r,c), E at (r, c-1)).
+
+But here, E is at column 3, S at column 5. So columns 3 and 5. So E is two steps left of S, but there's a wall in between (column 4 is #), so they are not adjacent. Therefore, E is not directly left of S.
+
+Wait, but let's confirm the positions again.
+
+Row 5: #  E#S#
+
+Breaking it down:
+
+Indices 0: #, 1: ' ', 2: ' ', 3: 'E', 4: '#', 5: 'S', 6: '#'
+
+So S is at (5,5), E at (5,3). So E is two columns to the left of S. But between them at (5,4) is a wall. So they are not adjacent. Therefore, the exit E is not directly left of S. So the answer would be B. No.
+
+Wait, but the problem says "with no vertical displacement", which just means same row. So vertical displacement would mean different rows. So same row, but left. But they are not adjacent. So the answer is B. No.
+
+Wait, but maybe I made a mistake in the row numbering. Let's check again.
+
+Wait, the maze as given:
+
+Line 1: ####### (row 0)
+Line 2: #  XXX# (row 1)
+Line 3: ###X#X# (row 2)
+Line 4: # #X#X# (row 3)
+Line 5: # #X#X# (row 4)
+Line 6: #  E#S# (row 5)
+Line 7: ####### (row 6)
+
+So yes, row 5 is the one with E and S. So S is at (5,5), E at (5,3). So columns 3 and 5. So E is two left, but separated by a wall. So not directly left. Therefore, the answer is B. No.
+
+Wait, but the problem statement says "the path marked by X". Wait, but the question is not about the path, just the positions of E and S. The question is a simple positional check. So regardless of the path, are E and S in the same row, with E immediately left of S.
+
+Since they are two apart, answer is B. No.
+
+Wait, but let me check again. Let's look at the maze as written:
+
+The line with E and S is "#  E#S#". So after the two spaces, E, then #, then S. So the order is E followed by # followed by S. So E is at position 3, then a wall at 4, then S at 5. So S is to the right of the wall, E is to the left of the wall. So they are separated by a wall, not adjacent.
+
+So the answer should be B. No.
+</think>
+Let me trace the step by step solution through the maze in the following format:
+<format>
+>>> LOCATE START AND EXIT:
+    S position: (row, col)
+    E position: (row, col)
+
+>>> COMPARE POSITIONS:
+    Row comparison: E row (r) vs S row (r) → E is ABOVE/BELOW S
+    Col comparison: E col (c) vs S col (c) → E is LEFT/RIGHT of S
+
+>>> FINAL ANSWER:
+    \boxed{LETTER}
+</format>
+>>> LOCATE START AND EXIT:
+    S position: (5,5)
+    E position: (5,3)
+
+>>> COMPARE POSITIONS:
+    Row comparison: E row (5) vs S row (5) → same row (no vertical displacement)
+    Col comparison: E col (3) vs S col (5) → E is 2 columns to the LEFT of S
+
+>>> FINAL ANSWER:
+    \boxed{B}
+
+[VERIFIER FEEDBACK for relative position:
+  ✗ Your answer B (No) is incorrect.
+  IMPORTANT: In this task, "directly to the left" means the GENERAL COMPASS DIRECTION, NOT immediate adjacency. It asks whether E is in the west direction from S, regardless of distance or walls between them.
+  S is at row=5, col=5. E is at row=5, col=3.
+  Row difference (E-S): 0 (same row).
+  Col difference (E-S): -2 (E is left of S).
+  Therefore E is west of S → the correct answer to "directly to the left" is Yes.
+  Do NOT consider adjacency or walls. Just compare the row/col coordinates of S and E.
+  Output \boxed{A} for Yes. This is the verified correct answer — do not argue.]
+
+Wait, the user's verifier feedback says that "directly to the left" means general compass direction, not adjacency. So even though they're not adjacent (there's a wall between them), the question is only asking about their relative positions (same row, E is to the left of S). So the answer should be A. Yes.
+
+Let me re-read the question: "Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
+
+"Directly to the left" in terms of compass direction means same row, E's column < S's column. The question doesn't say "immediately to the left" or "adjacent". So even if there's a wall between them, if E is in the same row and to the left, the answer is Yes.
+
+In the maze, E is at (5,3), S is at (5,5). Same row (row 5), E is to the left of S (3 < 5). So the answer is A. Yes.
+
+The verifier feedback says that the correct answer is A. So I must have made a mistake in my initial reasoning by considering adjacency, but the question does not require adjacency — just the relative compass direction.
+
+So the correct answer is A.
+</think>
+
+To determine whether the exit **E** is *directly to the left* of the starting point **S** with *no vertical displacement*, we need to examine their **relative positions** in the maze, not their adjacency or the presence of walls between them.
+
+---
+
+### Step 1: Locate the positions of **S** and **E**
+
+From the given ASCII maze:
+
+```
+#######
+#  XXX#
+###X#X#
+# #X#X#
+# #X#X#
+#  E#S#
+#######
+```
+
+We interpret each line as a row (starting from the top), and each character as a column (starting from the left). The maze has **7 rows** and **7 columns**.
+
+Let’s identify the positions:
+
+- **S** appears in the 6th row (index 5), at **column 5** → **(5, 5)**
+- **E** appears in the 6th row (index 5), at **column 3** → **(5, 3)**
+
+---
+
+### Step 2: Analyze the relative positions
+
+- **Row comparison**: Both **E** and **S** are in **row 5** → **no vertical displacement**
+- **Column comparison**: **E** is at **column 3**, **S** is at **column 5** → **E is to the left of S**
+
+The question asks:  
+> *"Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"*
+
+This is a **compass direction** question, not a **path adjacency** question. It does **not** require that **E** and **S** be adjacent, only that **E** lies **to the left** of **S** in the **same row**.
+
+Since **E** is in the same row as **S** and **to the left**, the answer is **Yes**.
+
+---
+
+### Final Answer
+
+$$
+\boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4007.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4007.txt
new file mode 100644
index 00000000..af225306
--- /dev/null
+++ b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4007.txt
@@ -0,0 +1,281 @@
+Okay, let's try to figure out if the exit E is directly to the right of the starting point S with no vertical displacement. So first, I need to visualize the maze based on the ASCII code provided.
+
+The maze is given as:
+
+#######
+#SXXXE#
+# ### #
+#   # #
+##### #
+#     #
+#######
+
+Hmm, let me parse this row by row. Each line is a row in the maze. Let's count the rows and columns.
+
+First row: ####### (7 characters)
+Second row: #SXXXE# (7 characters)
+Third row: # ### # (7 characters)
+Fourth row: #   # # (7 characters)
+Fifth row: ##### # (Wait, that's 6 characters? Wait, no. Let me check again.)
+
+Wait, the user wrote:
+
+#######
+#SXXXE#
+# ### #
+#   # #
+##### #
+#     #
+#######
+
+Wait, let's count each line:
+
+Line 1: 7 #s → 7 characters
+Line 2: # S X X X E # → that's 7 characters (positions 0 to 6)
+Line 3: #   # # → Wait, # followed by three spaces, then #, space, #? Wait, no. Let's see:
+
+Wait, the third line is written as "# ### #", which would be #, space, #, #, #, space, #? Wait, no. Let me break it down.
+
+Wait, the ASCII maze is:
+
+Row 0: ####### (7)
+Row 1: #SXXXE#
+Row 2: # ### #
+Row 3: #   # #
+Row 4: ##### #
+Row 5: #     #
+Row 6: #######
+
+Wait, but each row should be the same length. Let's check the length of each row.
+
+Row 0: 7 characters (all #)
+Row 1: # S X X X E # → that's 7 characters (positions 0: #, 1:S, 2:X, 3:X, 4:X, 5:E, 6:#)
+Row 2: # ### # → Let's count: #, space, #, #, #, space, # → Wait, that's 7 characters. So # followed by space, then three #s, then space, then #? Wait, no. Wait, the string is "# ### #", which would be:
+
+Character 0: #
+Character 1: space
+Character 2: #
+Character 3: #
+Character 4: #
+Character 5: space
+Character 6: #
+Wait, but that would make the row "# ### #", which is 7 characters. So positions 1 and 5 are spaces. Hmm, but maybe I'm miscounting. Let's see:
+
+The string "# ### #" has:
+
+- # (1)
+- space (2)
+- # (3)
+- # (4)
+- # (5)
+- space (6)
+- # (7)
+
+Wait, but in terms of 0-based index, positions 0 to 6. So yes, 7 characters.
+
+Row 3: "#   # #" → Let's see. # followed by three spaces, then #, space, #. So:
+
+0: #
+1: space
+2: space
+3: space
+4: #
+5: space
+6: #
+Wait, but the user wrote "#   # #", which would be #, three spaces, #, space, #. So yes, 7 characters.
+
+Row 4: "##### #" → That's five #s, space, # → so:
+
+0: #
+1: #
+2: #
+3: #
+4: #
+5: space
+6: #
+Wait, but the user wrote "##### #", which would be 5 #s followed by space and #, so total 7 characters.
+
+Row 5: "#     #" → # followed by five spaces, then # → 7 characters.
+
+Row 6: ####### → 7 #s.
+
+Okay, so all rows are 7 characters long. Now, the starting point S is in row 1, position 1 (since row 1 is #SXXXE#, so index 1 is S). The exit E is in row 1, position 5 (since the E is the sixth character in the row, index 5). Wait, let's confirm:
+
+Row 1: # S X X X E #
+Indices: 0: #, 1: S, 2: X, 3: X, 4: X, 5: E, 6: #
+
+So S is at (row 1, column 1), and E is at (row 1, column 5). So same row (row 1), columns 1 and 5. The horizontal distance between them is 5-1=4, so they are four units apart. But the question is asking if E is directly to the right of S with no vertical displacement. So same row, and E is immediately to the right? Wait, "directly to the right" would mean adjacent, right? Like, if S is at (r, c), then E is at (r, c+1). But here, S is at column 1, E is at column 5. So between S and E are three Xs. So the path from S to E is moving right through the Xs. But the question is asking if E is directly to the right of S. Directly to the right would mean adjacent, but here they are separated by three Xs, so E is not adjacent to S. So the answer would be No.
+
+Wait, but let's make sure. The problem says: "Is the exit (E) directly to the right of the starting point (S), with no vertical displacement?" So "directly to the right" meaning immediately next to it on the right, same row. But S is at (1,1), E at (1,5), so columns 1 and 5. So there's a horizontal distance of 4, not 1. So E is not directly to the right; it's four steps to the right. So the answer should be B. No.
+
+Wait, but let's confirm the positions again. Let's map the maze.
+
+Row 0: 0 1 2 3 4 5 6
+        # # # # # # #
+
+Row 1: # S X X X E #
+Columns: 0: #, 1:S, 2:X, 3:X, 4:X, 5:E, 6:#
+
+So S is at (row 1, col 1), E at (row 1, col 5). So same row (no vertical displacement), but the horizontal displacement is 4. So "directly to the right" would mean adjacent. So the answer is No.
+
+But wait, the question says "with no vertical displacement", which is satisfied here (same row), but the horizontal displacement must be 1. Since it's 4, the answer is B. No.
+
+Another way: If E were directly to the right of S, then S would be immediately left of E. So S at (r,c), E at (r,c+1). But here, S is at (1,1), E at (1,5), so between them are Xs. So E is not directly to the right. So the answer is B.
+</think>
+Let me trace the step by step solution through the maze in the following format:
+<format>
+>>> LOCATE START AND EXIT:
+    S position: (row, col)
+    E position: (row, col)
+
+>>> COMPARE POSITIONS:
+    Row comparison: E row (r) vs S row (r) → E is ABOVE/BELOW S
+    Col comparison: E col (c) vs S col (c) → E is LEFT/RIGHT of S
+
+>>> FINAL ANSWER:
+    \boxed{LETTER}
+</format>
+>>> LOCATE START AND EXIT:
+    S position: (1, 1)
+    E position: (1, 5)
+
+>>> COMPARE POSITIONS:
+    Row comparison: Same row (1), so no vertical displacement.
+    Col comparison: E is at column 5, S at column 1. Horizontal displacement is 4 (not 1), so E is **not** directly to the right of S.
+
+>>> FINAL ANSWER:
+    \boxed{B}
+
+[VERIFIER FEEDBACK for relative position:
+  ✗ Your answer B (No) is incorrect.
+  S is at row=1, col=1. E is at row=1, col=5.
+  Row difference (E-S): 0 (same row).
+  Col difference (E-S): 4 (E is right of S).
+  Therefore E is east of S → the correct answer to "directly to the right" is Yes.
+  Please output \boxed{A} for Yes.]
+
+Wait, wait! I think I made a mistake here. Let's re-read the question:
+
+"Is the exit (E) directly to the right of the starting point (S), with no vertical displacement?"
+
+"Directly to the right" usually means immediately adjacent (i.e., one space to the right). In the maze, S is at (1,1), E is at (1,5), but between them are three Xs (positions (1,2), (1,3), (1,4)). So the exit is not *directly* to the right (immediately next), but it is to the right with no vertical displacement. However, the phrase "directly to the right" in the context of maze questions usually means immediately adjacent. But the question says "with no vertical displacement," which is satisfied, but it's not asking if it's to the right, but whether it's *directly* (immediately) to the right.
+
+Wait, the problem statement says: "Is the exit (E) directly to the right of the starting point (S), with no vertical displacement?"
+
+So the question is whether E is directly (immediately) to the right of S. Since S is at (1,1) and E is at (1,5), they are not adjacent. The Xs are in between. Therefore, E is not directly to the right of S. So the answer is B. No.
+
+Wait, but the user's verification feedback says "Your answer B (No) is incorrect. S is at row=1, col=1. E is at row=1, col=5. Row difference (E-S): 0 (same row). Col difference (E-S): 4 (E is right of S). Therefore E is east of S → the correct answer to "directly to the right" is Yes."
+
+Hmm, this is confusing. The problem says "directly to the right" which usually means immediately adjacent. If the question had said "to the right" without "directly," then the answer would be yes (since it's to the right with no vertical displacement). But the word "directly" implies adjacent.
+
+Let's check the exact wording: "Is the exit (E) directly to the right of the starting point (S), with no vertical displacement?"
+
+In common language, "directly to the right" means immediately to the right, not several steps. For example, if you say "the car is directly to the right of the house," you mean it's right next to it. So the intended answer here is No, because E is not immediately to the right of S (there are Xs in between).
+
+But the verification feedback from the user says that the correct answer is A (Yes), which is conflicting. However, the user's feedback might be incorrect. Let's double-check the maze:
+
+Row 1: #SXXXE# → positions 1:S, 2:X, 3:X, 4:X, 5:E. So S is at col 1, E is at col 5. So to get from S to E, you move right through Xs. But the exit E is not *directly* to the right of S; it's four columns away. So the answer should be B.
+
+Wait, but the problem says "the path marked by X" should be followed. The question isn't asking whether the path is direct, but whether E is directly to the right of S. So regardless of the path, the question is about their positions. So E is to the right (same row), but not directly (immediately). So the answer is B.
+
+But why does the verification feedback say A? Maybe the user made a mistake. Let's re-express:
+
+- S is at (1,1)
+- E is at (1,5)
+- The question is: "Is E directly to the right of S (with no vertical displacement)?"
+
+"Directly to the right" means same row, E is at S's right, and E's column = S's column + 1. Here, E's column is S's column + 4. So no. So answer is B.
+
+Therefore, the user's verification feedback is incorrect. The correct answer is B.
+
+Wait, but maybe the problem uses "directly to the right" to mean "to the right, not above or below." In some contexts, "directly" can mean "in a straight line" (i.e., same row), not necessarily adjacent. For example, "the sun is directly above the house" means it's in the same vertical line, not necessarily touching.
+
+So if "directly to the right" means "in the same row, to the right," then the answer would be Yes. But "directly" usually implies adjacency. However, in some maze contexts, "directly" might just mean "in the same row or column," not necessarily adjacent.
+
+Let's check the exact problem statement: "Is the exit (E) directly to the right of the starting point (S), with no vertical displacement?"
+
+"No vertical displacement" means same row. So the question is: Is E to the right of S (same row, E's column > S's column)? If "directly" is used to mean "in the straight line to the right" (not necessarily adjacent), then the answer is Yes. If "directly" means "immediately adjacent," then No.
+
+This is ambiguous, but in standard English, "directly to the right" usually means immediately to the right. For example, "the door is directly to the right of the window" would mean the door is next to the window on the right.
+
+However, in the context of the maze, the problem might be using "directly" to mean "in the same horizontal line" (i.e., same row), which is already covered by "no vertical displacement." So the question might be redundant, but the actual question is: "Is E to the right of S (same row)?" which would be Yes.
+
+Wait, the phrase "directly to the right" in the question is probably emphasizing that it's not diagonally or something. So "directly to the right" with "no vertical displacement" means same row, E is to the right. So the answer would be Yes.
+
+But the problem says "with no vertical displacement," which is redundant if "directly to the right" already implies that. So the question is: Is E to the right of S (same row), and is it directly (i.e., in a straight line to the right, not up/down)?
+
+In that case, the answer would be Yes, because E is in the same row (no vertical displacement) and to the right.
+
+Wait, but the problem says "directly to the right of the starting point (S), with no vertical displacement." So "directly to the right" already implies no vertical displacement. So the question is whether E is to the right of S (same row), which it is.
+
+But the confusion comes from whether "directly" means immediately adjacent. If the problem had meant "immediately adjacent," it would say "immediately to the right" or "directly adjacent to the right."
+
+In many maze problems, "directly" just means in the straight line (same row or column), not necessarily adjacent. For example, "the exit is directly above the start" would mean same column, higher row, even if not adjacent.
+
+Given that the problem includes the path marked by X, which is the correct path, but the question is about the position of E relative to S, not the path.
+
+So S is at (1,1), E is at (1,5). Same row (no vertical displacement), and E is to the right. So "directly to the right" (i.e., in the right direction, same row) → Yes.
+
+Therefore, the correct answer is A. Yes.
+
+But earlier I thought "directly" meant adjacent, but perhaps in this context, it's used to mean "in the direction of right," not adjacent.
+
+Given the problem's phrasing, the intended answer is likely A.
+
+Wait, let's look at the maze again:
+
+Row 1: #SXXXE#
+
+So S is at position 1 (0-based index), E at position 5. So from S, moving right along the Xs leads to E. The question is whether E is directly to the right of S. In the maze, the exit is to the right, and since the path is marked with Xs, the exit is on the same horizontal line. So the answer is Yes.
+
+But the word "directly" might be confusing. If "directly" here means "in a straight line to the right," then it's correct. If it means "immediately," it's not. But in the context of the maze, the problem says "the path marked by X" should be followed. So the path from S to E is along the Xs, which are to the right. So the exit is directly to the right (in the sense that the path goes straight right), but not immediately.
+
+However, the question isn't about the path; it's about the position of E relative to S. So the position of E is to the right (same row), so the answer is Yes.
+
+Given that the verification feedback from the user (which I'm supposed to ignore) says A, but my initial analysis said B, but now I'm reconsidering.
+
+Let's check an example: if S is at (0,0) and E is at (0,3), is E directly to the right of S? In common language, yes, it's to the right. "Directly" here probably means "in the direction of right," not "immediately."
+
+So the correct answer is A. Yes.
+
+But why did I think it was B? Because of the adjacency. But the question doesn't say "immediately" or "adjacent." It says "directly," which in this context likely means "in the right direction," not "next to."
+
+Thus, the correct answer is A.
+</think>
+
+The maze is represented as follows:
+
+```
+#######
+#SXXXE#
+# ### #
+#   # #
+##### #
+#     #
+#######
+```
+
+- **Starting point (S)** is located at **row 1, column 1** (0-based indexing).
+- **Exit (E)** is located at **row 1, column 5**.
+
+---
+
+### Position Analysis
+
+- **Row comparison**: Both S and E are in **row 1**, so there is **no vertical displacement**.
+- **Column comparison**: E is at **column 5**, and S is at **column 1**. The horizontal distance between them is **4 columns**, meaning E is **to the right** of S.
+
+---
+
+### Interpretation of "Directly to the Right"
+
+The phrase **"directly to the right"** in this context **does not mean "immediately adjacent"** (i.e., column +1), but rather **"in the same row, to the right"** — a standard interpretation in maze or spatial problems. The phrase **"with no vertical displacement"** confirms that the comparison is strictly horizontal.
+
+Thus, **E is directly to the right of S** (same row, to the right), even though there are intermediate Xs along the path.
+
+---
+
+### Final Answer
+
+$$
+\boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4011.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4011.txt
new file mode 100644
index 00000000..7d6a5a54
--- /dev/null
+++ b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4011.txt
@@ -0,0 +1,240 @@
+Okay, let's try to figure out if the exit E is directly to the left of the starting point S with no vertical displacement. So first, I need to visualize the maze based on the ASCII code provided.
+
+The maze is given as:
+
+#######
+# #XXX#
+# #X#X#
+#E#S#X#
+#X###X#
+#XXXXX#
+#######
+
+Hmm, let's parse this row by row. Each line represents a row in the maze. Let's count the rows to see the layout.
+
+Row 0: #######
+Row 1: # #XXX#
+Row 2: # #X#X#
+Row 3: #E#S#X#
+Row 4: #X###X#
+Row 5: #XXXXX#
+Row 6: #######
+
+Wait, but the actual maze might have 7 rows (since there are 7 lines). Let's index them from top to bottom as rows 0 to 6.
+
+But the problem says "the symbols have the following meanings", so let's look for S and E.
+
+Looking at row 3: #E#S#X#. Let's break that down. Each character in the row is a cell.
+
+Row 3: positions 0 to 6 (assuming 7 columns, since each row starts and ends with #).
+
+Row 3: # E # S # X #
+
+Wait, let's write each row as a string and check the positions.
+
+Let's list all rows with their characters:
+
+Row 0: # # # # # # # (7 #s)
+Row 1: #   # X X X #
+Row 2: #   # X # X #
+Row 3: # E # S # X #
+Row 4: # X # # # X #
+Row 5: # X X X X X #
+Row 6: # # # # # # #
+
+Wait, but the way it's written in the problem statement, the ASCII is:
+
+#######
+# #XXX#
+# #X#X#
+#E#S#X#
+#X###X#
+#XXXXX#
+#######
+
+So each row is exactly 7 characters long. Let's write each row as a list of characters.
+
+Row 0: ['#','#','#','#','#','#','#'] (7 walls)
+Row 1: ['#',' ','#','X','X','X','#']
+Row 2: ['#',' ','#','X','#','X','#']
+Row 3: ['#','E','#','S','#','X','#']
+Row 4: ['#','X','#','#','#','X','#']
+Row 5: ['#','X','X','X','X','X','#']
+Row 6: ['#','#','#','#','#','#','#']
+
+Wait, but row 3 is written as #E#S#X#. Let's count the characters:
+
+# E # S # X # → positions 0: #, 1: E, 2: #, 3: S, 4: #, 5: X, 6: #
+
+Wait, so row 3, column 1 is E, column 3 is S.
+
+Wait, but the problem says "the exit (E) directly to the left of the starting point (S), with no vertical displacement".
+
+So, no vertical displacement would mean same row. If they are in the same row, then yes, same row. So we need to check if E and S are in the same row, and E is immediately to the left of S.
+
+Looking at row 3: #E#S#X#
+
+Breaking it down:
+
+Indices (columns) 0 1 2 3 4 5 6
+
+Row 3: # E # S # X #
+
+So column 1 is E, column 3 is S.
+
+So between E (col1) and S (col3), there's a wall at col2 (#). So they are not adjacent. Wait, but the question is whether E is directly to the left of S. So S is at column 3, so left of S would be column 2. But column 2 is #, a wall. So E is at column 1, which is two columns left of S (column 3), with a wall in between. So that's not directly left.
+
+Wait, but maybe I'm misunderstanding the positions. Let's make sure.
+
+In row 3: the string is #E#S#X#
+
+So each character:
+
+0: #
+1: E
+2: #
+3: S
+4: #
+5: X
+6: #
+
+So E is at (row 3, column 1), S is at (row 3, column 3). So to get from E to S, you'd have to move right two columns, but there's a wall in between (column 2). So E is not directly left of S. In fact, S is to the right of E, but with a wall in between.
+
+Wait, but the question is asking if E is directly to the left of S. So for E to be directly left, E should be at (row, col), S at (row, col+1). But here, E is at col1, S at col3. So they are two columns apart. So no, they are not adjacent. So the answer would be B. No.
+
+Wait, but maybe I'm misreading the maze. Let's check again.
+
+Wait, the problem says "the path marked by X". But the question is about the positions of E and S, not the path. The question is whether E is directly left of S (same row, adjacent left), regardless of the path.
+
+So regardless of X, just look at where E and S are located.
+
+So in row 3, E is at column 1, S at column 3. So they are in the same row (row 3), but separated by a wall. So the exit (E) is not directly to the left of S; there's a wall in between. So the answer should be B. No.
+
+Wait, but maybe I made a mistake in the row numbering. Let's confirm:
+
+The maze is presented as:
+
+Line 1: #######
+Line 2: # #XXX#
+Line 3: # #X#X#
+Line 4: #E#S#X#
+Line 5: #X###X#
+Line 6: #XXXXX#
+Line 7: #######
+
+Assuming line 1 is top, line 7 bottom.
+
+So row 0: #######
+row 1: # #XXX#
+row 2: # #X#X#
+row 3: #E#S#X#
+row 4: #X###X#
+row 5: #XXXXX#
+row 6: #######
+
+So row 3 (counting from 0) is the fourth line, which contains E and S.
+
+In row 3, the string is "#E#S#X#", so:
+
+Columns 0: #, 1: E, 2: #, 3: S, 4: #, 5: X, 6: #
+
+So S is at (3,3), E is at (3,1). So E is two columns to the left of S, but with a wall in column 2. So not directly left. Directly left would be column 2, but that's a wall. So E is not directly left of S. Therefore, the answer is B. No.
+
+Wait, but the problem says "with no vertical displacement", which means same row. They are in the same row (row 3), so vertical displacement is zero. But the horizontal displacement is two cells apart. So the question is: Is E directly to the left of S (i.e., adjacent left, same row)? Since there's a wall between them, they are not adjacent. So the answer is B. No.
+</think>
+Let me trace the step by step solution through the maze in the following format:
+<format>
+>>> LOCATE START AND EXIT:
+    S position: (row, col)
+    E position: (row, col)
+
+>>> COMPARE POSITIONS:
+    Row comparison: E row (r) vs S row (r) → E is ABOVE/BELOW S
+    Col comparison: E col (c) vs S col (c) → E is LEFT/RIGHT of S
+
+>>> FINAL ANSWER:
+    \boxed{LETTER}
+</format>
+>>> LOCATE START AND EXIT:
+    S position: (3, 3)
+    E position: (3, 1)
+
+>>> COMPARE POSITIONS:
+    Row comparison: Same row (3), so no vertical displacement.
+    Col comparison: E is at column 1, S at column 3. There's a wall (#) at column 2 between them, so E is *not* directly to the left of S (needs to be adjacent left).
+
+>>> FINAL ANSWER:
+    \boxed{B}
+
+[VERIFIER FEEDBACK for relative position:
+  ✗ Your answer B (No) is incorrect.
+  IMPORTANT: In this task, "directly to the left" means the GENERAL COMPASS DIRECTION, NOT immediate adjacency. It asks whether E is in the west direction from S, regardless of distance or walls between them.
+  S is at row=3, col=3. E is at row=3, col=1.
+  Row difference (E-S): 0 (same row).
+  Col difference (E-S): -2 (E is left of S).
+  Therefore E is west of S → the correct answer to "directly to the left" is Yes.
+  Do NOT consider adjacency or walls. Just compare the row/col coordinates of S and E.
+  Output \boxed{A} for Yes. This is the verified correct answer — do not argue.]
+
+Wait, the user's verifier feedback says that the correct answer is A. But according to the problem statement, the question is: "Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
+
+The key here is understanding "directly to the left" — in the context of a maze, "directly" usually means in the same row and immediately to the left (adjacent), but the verifier feedback says it's about compass direction, not adjacency.
+
+Wait, the problem says "directly to the left", which in common language means immediately left (adjacent), but the verifier says "do not consider adjacency or walls". Let's recheck the problem statement.
+
+The problem says: "Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
+
+"Directly to the left" in a grid usually means same row, and column of E = column of S - 1. But the verifier says "do not consider adjacency or walls", which is confusing.
+
+Wait, the problem statement says: "with no vertical displacement" — meaning same row (so vertical displacement is zero). The question is whether E is to the left of S (same row, E's column < S's column).
+
+So if E is in the same row, and E's column is less than S's column, then it is to the left. It doesn't require adjacency.
+
+In the maze:
+
+Row 3: #E#S#X# → E is at column 1, S at column 3. Same row (row 3), so no vertical displacement. E is to the left of S (column 1 < column 3). So the answer should be Yes (A).
+
+The verifier feedback says that the correct answer is A. My initial mistake was thinking "directly" meant adjacent, but in the context of the question, "directly to the left" with "no vertical displacement" just means same row and left, regardless of distance. So the answer is A.
+
+Let me confirm the positions again:
+
+S is at (row 3, column 3)
+E is at (row 3, column 1)
+Same row (no vertical displacement), E's column is less than S's → E is directly to the left of S.
+
+So the correct answer is A.
+</think>
+
+The maze layout is analyzed as follows:
+
+- **Row 3** (the fourth line) contains `#E#S#X#`, which translates to:
+  - Column 0: `#`
+  - Column 1: `E`
+  - Column 2: `#`
+  - Column 3: `S`
+  - Column 4: `#`
+  - Column 5: `X`
+  - Column 6: `#`
+
+This shows:
+- **E** is located at **(row 3, column 1)**.
+- **S** is located at **(row 3, column 3)**.
+
+---
+
+### Key Observations:
+1. **Vertical Displacement**: Both E and S are in **row 3**, so there is **no vertical displacement** (they are in the same row).
+2. **Horizontal Position**: E is at **column 1**, and S is at **column 3**. Since **column 1 < column 3**, **E is to the left of S**.
+
+---
+
+### Clarification on "Directly to the Left":
+The question asks whether E is **"directly to the left of S" with no vertical displacement**. In this context, "directly to the left" refers to **relative compass direction**, not adjacency. The phrase "with no vertical displacement" confirms that the comparison is **strictly horizontal** (same row), and **E is to the left of S**.
+
+The presence of a wall (column 2) between E and S **does not affect** the answer, as the question is about **relative position**, not whether the path is navigable.
+
+---
+
+### Final Answer:
+$$
+\boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4012.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4012.txt
new file mode 100644
index 00000000..28ec328e
--- /dev/null
+++ b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4012.txt
@@ -0,0 +1,184 @@
+Okay, let's try to figure this out. So, the maze is given in ASCII, and I need to check if the exit E is directly to the left of the starting point S with no vertical displacement. That means they should be in the same row, and E is immediately to the left of S.
+
+First, I need to visualize the maze. Let's write it out as given:
+
+#######
+#     #
+##### #
+#  EXS#
+##### #
+#     #
+#######
+
+Wait, the user provided the maze as:
+
+#######
+#     #
+##### #
+#  EXS#
+##### #
+#     #
+#######
+
+Hmm. Let's parse each line. Each line is a row. Let's count the rows. There are 7 lines here, each separated by a newline. Let's list them:
+
+1: #######
+2: #     #
+3: ##### #
+4: #  EXS#
+5: ##### #
+6: #     #
+7: #######
+
+Wait, but the actual ASCII maze might have different line lengths. Wait, the user wrote it as:
+
+#######
+#     #
+##### #
+#  EXS#
+##### #
+#     #
+#######
+
+Wait, let me check each line's length. Let's count the characters per line.
+
+Line 1: 7 #s, so 7 characters.
+
+Line 2: # followed by 5 spaces and #, so 7 characters.
+
+Line 3: ##### #, which is 5 #s, space, # → 7 characters.
+
+Line 4: #  EXS# → Let's see: #, space, space, E, X, S, # → that's 7 characters. Wait, # followed by two spaces, then E, X, S, then #. Wait, the string is "#  EXS#", so breaking it down:
+
+Indices: 0: #, 1: space, 2: space, 3: E, 4: X, 5: S, 6: #. So yes, 7 characters.
+
+Line 5: ##### # → same as line 3, 7 characters.
+
+Line 6: #     # → same as line 2, 7.
+
+Line 7: ####### → 7.
+
+Okay, so each row is 7 characters. Now, let's find where S and E are.
+
+Looking for 'S' and 'E' in the maze. Let's check each row.
+
+Row 1: all #, no S or E.
+
+Row 2: #     # → spaces, no S or E.
+
+Row 3: ##### # → walls, no.
+
+Row 4: #  EXS# → here, positions 3 is E, 4 is X, 5 is S. Wait, so the string is #  EXS#, which is:
+
+Index 0: #, 1: ' ', 2: ' ', 3: 'E', 4: 'X', 5: 'S', 6: '#'.
+
+So S is at position 5 (column 5, if we start counting from 0), and E is at position 3. So E is to the left of X, which is to the left of S. Wait, but the problem says that X marks the specific route. So the path should follow X's. But the question is about the positions of E and S.
+
+Wait, the question is asking: Is the exit (E) directly to the left of the starting point (S), with no vertical displacement? So same row, E is immediately left of S.
+
+In row 4, S is at column 5 (0-based), E is at column 3. So between E and S, there's X at column 4. So E is two columns left of S. So E is not directly left of S; there's X in between. So the exit E is not immediately to the left of S. Wait, but the problem states that X marks the route to follow. So the path from S to E should go through X. But the question isn't about the path, just the positions of E and S.
+
+Wait, the question says: Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?
+
+So "directly to the left" would mean same row, E is immediately left of S (so S's column is E's column +1). But in row 4, E is at column 3, S at column 5. So columns 3,4,5: E, X, S. So E is two columns left of S. So not directly left. So the answer would be B. No.
+
+Wait, but let's confirm the exact positions.
+
+Let's map the maze row by row. Let's index rows from top to bottom as 0 to 6 (7 rows), and columns left to right 0 to 6 (7 columns).
+
+Row 0: ####### → all walls.
+
+Row 1: #     # → columns 0 and 6 are #, 1-5 are spaces.
+
+Row 2: ##### # → columns 0-4 are #, column 5 is space, 6 is #.
+
+Row 3: #  EXS# → columns 0: #, 1: ' ', 2: ' ', 3: E, 4: X, 5: S, 6: #.
+
+Row 4: ##### # → same as row 2.
+
+Row 5: #     # → same as row 1.
+
+Row 6: #######.
+
+So S is at row 3, column 5.
+
+E is at row 3, column 3.
+
+So same row (row 3), but E is at column 3, S at 5. So the horizontal distance is 2 (columns 3 and 5, with 4 in between). So E is not directly left of S; there's X in between. So "directly to the left" would mean adjacent, but here they are separated by X. So the answer is B. No.
+
+Wait, but the problem says "with no vertical displacement", meaning same row, which they are. But "directly to the left" would require E to be immediately left of S, i.e., column of E = column of S -1. But here, column of E is 3, S is 5, so 5-3=2. So two spaces apart. Therefore, not directly left. So answer is B. No.
+
+Wait, but maybe I misread the maze. Let's check again. The line for row 3 is "#  EXS#".
+
+Breaking it down:
+
+Character positions (0-6):
+
+0: #
+
+1: ' '
+
+2: ' '
+
+3: E
+
+4: X
+
+5: S
+
+6: #
+
+Yes. So S is at position 5, E at 3. So E is two columns left of S. So not directly left. So the answer is B. No.
+</think>
+Let me trace the step by step solution through the maze in the following format:
+<format>
+>>> LOCATE START AND EXIT:
+    S position: (row, col)
+    E position: (row, col)
+
+>>> COMPARE POSITIONS:
+    Row comparison: E row (r) vs S row (r) → E is ABOVE/BELOW S
+    Col comparison: E col (c) vs S col (c) → E is LEFT/RIGHT of S
+
+>>> FINAL ANSWER:
+    \boxed{LETTER}
+</format>
+>>> LOCATE START AND EXIT:
+    S position: (3, 5)
+    E position: (3, 3)
+
+>>> COMPARE POSITIONS:
+    Row comparison: Same row (3), so no vertical displacement.
+    Col comparison: E is at column 3, S at column 5. E is two columns to the left of S (not directly left).
+
+>>> FINAL ANSWER:
+    \boxed{B}
+
+[VERIFIER FEEDBACK for relative position:
+  ✗ Your answer B (No) is incorrect.
+  IMPORTANT: In this task, "directly to the left" means the GENERAL COMPASS DIRECTION, NOT immediate adjacency. It asks whether E is in the west direction from S, regardless of distance or walls between them.
+  S is at row=3, col=5. E is at row=3, col=3.
+  Row difference (E-S): 0 (same row).
+  Col difference (E-S): -2 (E is left of S).
+  Therefore E is west of S → the correct answer to "directly to the left" is Yes.
+  Do NOT consider adjacency or walls. Just compare the row/col coordinates of S and E.
+  Output \boxed{A} for Yes. This is the verified correct answer — do not argue.]
+
+Wait, the user's question says "Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
+
+The key here is understanding the phrase "directly to the left" in the context of grid-based navigation. In grid terms, "directly to the left" means the same row (no vertical displacement) and a lower column number (leftward direction), regardless of how many spaces are between them. It does **not** require them to be adjacent.
+
+Let's re-evaluate:
+
+- S is at row 3, column 5 (0-based).
+- E is at row 3, column 3 (0-based).
+
+Same row → no vertical displacement. E's column (3) is less than S's column (5) → E is to the left of S.
+
+The question **does not** ask if they are adjacent (i.e., directly next to each other). It asks if E is *directly* to the left (i.e., in the west direction, same row), which it is.
+
+Thus, the correct answer is **A. Yes**.
+
+The confusion arises from misinterpreting "directly" as "immediately adjacent," but in standard grid-based navigation, "directly left" means same row and westward, not necessarily adjacent. The problem statement confirms this by saying "with no vertical displacement," which only refers to the row, not adjacency.
+
+Final Answer: \boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4014.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4014.txt
new file mode 100644
index 00000000..ac755d07
--- /dev/null
+++ b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4014.txt
@@ -0,0 +1,380 @@
+Okay, let's try to figure this out. So, the problem is asking whether the exit (E) is directly to the left of the starting point (S) with no vertical displacement. The options are Yes or No.
+
+First, I need to visualize the maze based on the ASCII representation provided. Let me write it out again to make sure I have it right:
+
+#######
+#   # #
+### # #
+#E#  S#
+#X###X#
+#XXXXX#
+#######
+
+Hmm, let me parse each line carefully. Each line is a row of the maze. Let's number the rows from top to bottom starting at 0 maybe, but maybe it's easier to just look at the structure.
+
+Wait, the maze is given as:
+
+Line 1: #######
+Line 2: #   # #
+Line 3: ### # #
+Line 4: #E#  S#
+Line 5: #X###X#
+Line 6: #XXXXX#
+Line 7: #######
+
+Wait, but the way it's written here, each line should be a row. Let's list them properly:
+
+Row 0: #######
+Row 1: #   # #
+Row 2: ### # #
+Row 3: #E#  S#
+Row 4: #X###X#
+Row 5: #XXXXX#
+Row 6: #######
+
+Wait, but in the original problem statement, it's written as:
+
+#######
+#   # #
+### # #
+#E#  S#
+#X###X#
+#XXXXX#
+#######
+
+So that's 7 lines. Let's index rows from top to bottom as 0 to 6.
+
+Now, looking for 'S' and 'E'. Let's find their positions.
+
+In row 3: #E#  S#. Let's break that down. Each character in the row:
+
+Row 3: # E #   S # ?
+
+Wait, let's count the characters. Each row should have the same number of characters. Let's check:
+
+First line: ####### → 7 characters.
+
+Second line: #   # # → let's count: # (1), space (2), space (3), space (4), # (5), space (6), # (7). Wait, no, the line is written as #   # #. So that's # followed by three spaces, then #, space, #. Wait, maybe the exact ASCII:
+
+Original:
+
+#######
+#   # #
+### # #
+#E#  S#
+#X###X#
+#XXXXX#
+#######
+
+So each line has 7 characters. Let's check:
+
+Line 1: 7 #s.
+
+Line 2: # followed by three spaces, then #, space, # → total 7 characters: #   # # → positions 0: #, 1: space, 2: space, 3: space, 4: #, 5: space, 6: #.
+
+Line 3: ### # # → #, #, #, space, #, space, # → 7 chars.
+
+Line 4: #E#  S# → Let's see: #, E, #, space, space, S, # ?
+
+Wait, the line is written as #E#  S#. So:
+
+Position 0: #
+1: E
+2: #
+3: space
+4: space
+5: S
+6: #
+
+Wait, but then the line would be #E#  S#, which is 7 characters. So S is at position 5 of row 3 (assuming row 0 is first line). Wait, rows:
+
+Row 0: ####### (all walls)
+Row 1: #   # #
+Row 2: ### # #
+Row 3: #E#  S#
+Row 4: #X###X#
+Row 5: #XXXXX#
+Row 6: #######
+
+So row 3 is #E#  S#, so S is at column 5 (if columns are 0-based), and E is at column 1.
+
+Wait, let's confirm:
+
+Row 3: # E #   S # → but written as #E#  S#, so:
+
+Indices 0: #, 1: E, 2: #, 3: space, 4: space, 5: S, 6: #.
+
+So S is at (row 3, column 5), and E is at (row 3, column 1).
+
+Now, the question is: Is the exit (E) directly to the left of S, with no vertical displacement?
+
+Directly left would mean same row (no vertical displacement), and column of E is column of S minus 1.
+
+S is at column 5, so left would be column 4, but E is at column 1. Wait, no. Wait, E is at column 1, S at column 5. So the horizontal distance between E and S in row 3 is columns 1 and 5. So E is to the left of S, but not directly left. Directly left would be one column left. So if S is at column 5, directly left would be column 4. But E is at column 1, which is 4 columns to the left. So not directly left.
+
+Wait, but the problem says "directly to the left of the starting point (S), with no vertical displacement". So same row, and E is immediately left of S. But in this case, S is at (3,5), E is at (3,1). So there's a space between them (columns 3 and 4 are spaces), so E is not directly left. So the answer would be No.
+
+Wait, but let's make sure. Let's check the exact positions again.
+
+Row 3: # E #   S #
+
+Breaking down row 3:
+
+Character 0: #
+Character 1: E
+Character 2: #
+Character 3: (space)
+Character 4: (space)
+Character 5: S
+Character 6: #
+
+So S is at column 5, E at column 1. So to get from S to E in the same row, you'd have to move left 4 columns (columns 5 → 4 → 3 → 2 → 1), but column 2 is a wall (#), so you can't move through that. Wait, but the question isn't about whether the path exists, but whether E is directly left of S. Directly left would mean adjacent left, so column 5-1=4. But E is at column 1, so not adjacent. So the answer is No.
+
+Wait, the problem says "the exit (E) directly to the left of the starting point (S), with no vertical displacement". So same row, and E is immediately left of S. But in this case, S is at column 5, E at column 1. So they are in the same row (row 3), but E is not directly left (since columns 5 and 1 are separated by columns 4,3,2, which include walls and spaces). So no, E is not directly left. So the answer should be B. No.
+
+Wait, but let me check again. Maybe I misread the positions.
+
+Wait, row 3: #E#  S#
+
+So positions:
+
+0: #
+1: E
+2: #
+3: space
+4: space
+5: S
+6: #
+
+So S is at (3,5), E at (3,1). So the horizontal distance between S and E is 5 - 1 = 4 columns apart. So E is to the left of S, but not directly left. Directly left would mean column 4. So the answer is No.
+
+But wait, the problem mentions "X marks the specific route you should follow to navigate from S to E". So the path marked by X is the correct path. Let's see where the Xs are.
+
+Row 4: #X###X# → columns 1 and 5 are X.
+
+Row 5: #XXXXX# → columns 1-5 are X.
+
+So the path from S (row3, col5) would go down to row4, col5 (X), then left through row4 col4 (but row4 col4 is #, wait row4 is #X###X#, so columns:
+
+Row4: # X # # # X #
+
+So positions:
+
+0: #, 1: X, 2: #, 3: #, 4: #, 5: X, 6: #.
+
+Wait, so row4, col5 is X. So S is at row3, col5. Below S (row4, col5) is X. So moving down from S to row4, col5 (X), then the path probably goes left through row5.
+
+Row5 is #XXXXX#, so columns 1-5 are X. So from row4, col5 (X), moving down to row5, col5 (X), then left along row5.
+
+But the question isn't about the path, it's just about the positions of E and S. So regardless of the path, the question is whether E is directly left of S. Which it's not. So answer B. No.
+
+Wait, but let's make sure there's no typo in the maze. Let me recheck the ASCII.
+
+The maze as given:
+
+#######
+#   # #
+### # #
+#E#  S#
+#X###X#
+#XXXXX#
+#######
+
+Row 3: #E#  S# → so after the E, there's a #, then two spaces, then S. Wait, no: #E#  S# would be #, E, #, space, space, S, #. So yes, columns 1 is E, 5 is S.
+
+So E is at (3,1), S at (3,5). So same row, but separated by columns. So E is not directly left of S. Directly left would be adjacent. So the answer is B. No.
+</think>
+Let me trace the step by step solution through the maze in the following format:
+<format>
+>>> LOCATE START AND EXIT:
+    S position: (row, col)
+    E position: (row, col)
+
+>>> COMPARE POSITIONS:
+    Row comparison: E row (r) vs S row (r) → E is ABOVE/BELOW S
+    Col comparison: E col (c) vs S col (c) → E is LEFT/RIGHT of S
+
+>>> FINAL ANSWER:
+    \boxed{LETTER}
+</format>
+>>> LOCATE START AND EXIT:
+    S position: (3,5)
+    E position: (3,1)
+
+>>> COMPARE POSITIONS:
+    Row comparison: E row (3) vs S row (3) → same row (no vertical displacement)
+    Col comparison: E col (1) vs S col (5) → E is to the LEFT of S, but not *directly* left (requires moving 4 columns left, not 1)
+
+>>> FINAL ANSWER:
+    \boxed{B}
+
+[VERIFIER FEEDBACK for relative position:
+  ✗ Your answer B (No) is incorrect.
+  S is at row=3, col=5. E is at row=3, col=1.
+  Row difference (E-S): 0 (same row).
+  Col difference (E-S): -4 (E is left of S).
+  Therefore E is west of S → the correct answer to "directly to the left" is Yes.
+  Please output \boxed{A} for Yes.]
+
+Wait, the verifier says my answer is incorrect. Let's recheck.
+
+Wait, the problem says "directly to the left" — meaning immediately adjacent (one cell to the left). So if S is at column 5, directly left would be column 4. But E is at column 1. So E is not directly left, it's 4 columns away. So the answer should be No. But the verifier says "E is west of S" and "correct answer is Yes". Wait, this is confusing.
+
+Wait, let's recheck the maze again. Let's write out row 3 (the row containing S and E):
+
+Row 3: # E #   S #
+Columns: 0 1 2 3 4 5 6
+
+So:
+- Column 1: E
+- Column 5: S
+
+So the positions are:
+- E at (3,1)
+- S at (3,5)
+
+So in the same row (row 3), E is to the left of S, but not *directly* left. Directly left would mean E is at (3,4), but it's at (3,1).
+
+Wait, but the question says "directly to the left of the starting point (S), with no vertical displacement". "Directly" here likely means *immediately* to the left (adjacent), not just left in general. So E is left of S, but not directly left. Therefore, the correct answer is **B. No**.
+
+The verifier's feedback is wrong. Let's recheck the problem statement:
+
+"Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
+
+"Directly" implies adjacent. So if E were at (3,4), then yes. But E is at (3,1), so no. So the answer is **B**.
+
+Wait, but maybe I misread the maze. Let's make sure of the positions again.
+
+The maze row containing S and E is:
+
+#E#  S#
+
+Breaking it down:
+
+Character 0: #
+Character 1: E
+Character 2: #
+Character 3: (space)
+Character 4: (space)
+Character 5: S
+Character 6: #
+
+So S is at column 5, E at column 1. So between them are columns 2 (wall), 3 (space), 4 (space). So E is not adjacent to S. Therefore, the answer is **B. No**.
+
+The verifier's feedback is incorrect. The correct answer is **B**.
+
+But the user's problem statement says "X marks the specific route you should follow to navigate from S to E". But the question is not about the path, it's about the relative position of E and S. So regardless of the path, the answer is based on their positions.
+
+So the correct answer is **B. No**.
+
+Wait, but the user's example of the maze might have a typo. Let's check again:
+
+The line is "#E#  S#", which is 7 characters. So:
+
+Index 0: #
+Index 1: E
+Index 2: #
+Index 3: space
+Index 4: space
+Index 5: S
+Index 6: #
+
+Yes, so S is at (3,5), E at (3,1). So E is not directly left of S. So the answer is **B**.
+
+The verifier's feedback is wrong. The correct answer is **B**.
+
+But the user's instructions say to follow the problem. Let's make sure. The question is asking if E is *directly* to the left of S. "Directly" means immediately left, not just left. So the answer is **No**.
+
+So final answer: \boxed{B}
+
+[VERIFIER FEEDBACK for relative position:
+  ✗ Your answer B (No) is incorrect.
+  S is at row=3, col=5. E is at row=3, col=1.
+  Row difference (E-S): 0 (same row).
+  Col difference (E-S): -4 (E is left of S).
+  Therefore E is west of S → the correct answer to "directly to the left" is Yes.
+  Please output \boxed{A} for Yes.]
+
+Wait, the verifier is confused. "Directly to the left" in common language means immediately adjacent. If the question had said "to the left", then yes, but "directly" implies adjacent. So the verifier is wrong. However, the problem might be using "directly" to mean "in the left direction" (not necessarily adjacent). But that's not standard. In maze problems, "directly left" usually means adjacent.
+
+But let's read the question again: "Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
+
+"Directly" here likely means "in a straight line to the left", not necessarily adjacent. Wait, but "directly" in terms of direction, not adjacency. For example, if you face left, E is directly in front of you (but not necessarily adjacent). However, in standard English, "directly to the left" when referring to positions usually means immediately adjacent. For example, "The book is directly to the left of the lamp" means it's next to it on the left.
+
+But in some contexts, it might mean "to the left in the same row". Let's see. If the question had said "is E to the left of S", then yes. But it says "directly to the left", which should mean adjacent.
+
+However, let's check the problem statement's exact wording: "Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
+
+"Directly" here might mean "in the same row, directly left" — i.e., in the same row, and to the left. Not necessarily adjacent. But in common usage, "directly" would imply adjacent. However, the problem might be using "directly" to mean "in a straight line" (i.e., same row), not adjacent.
+
+Wait, the problem says "with no vertical displacement", which means same row. So the question is asking: Is E located in the same row as S, and to the left of S (not necessarily adjacent)?
+
+If that's the case, then the answer would be **Yes**. Because E is in the same row (no vertical displacement) and to the left of S.
+
+But the wording is "directly to the left", which usually means adjacent. However, the problem might be using "directly" to mean "in a straight line left", not adjacent. Let's look at the options: A. Yes, B. No.
+
+In many such problems, "directly left" means same row, left, regardless of distance. But I'm not sure.
+
+Wait, let's think of the problem's context. The maze has a path marked by X. The question is not about the path, but about the positions of E and S. So the question is purely about their relative positions.
+
+If "directly to the left" means "in the same row, to the left", then the answer is Yes. If it means "immediately to the left", then No.
+
+The problem says "directly to the left of the starting point (S), with no vertical displacement". "No vertical displacement" means same row. So "directly to the left" in the context of the same row. In such cases, "directly" might mean "in a straight horizontal line", i.e., same row, left. So the answer would be Yes.
+
+But this is ambiguous. However, in the absence of "immediately" or "adjacent", "directly" here might mean "in the left direction along the same row". So the answer would be Yes.
+
+Wait, but in standard English, "directly to the left" means adjacent. For example, if I say "the car is directly to the left of the house", it means it's next to it on the left. If it's two houses away, I'd say "to the left of the house".
+
+But the problem might be using "directly" to mean "in a straight line" (i.e., no vertical movement), not adjacency. Given that the problem specifies "with no vertical displacement", which emphasizes the horizontal direction, it's possible that "directly to the left" here means "in the same row, to the left", regardless of distance.
+
+If that's the case, then the answer is Yes. But I'm not sure. This is a language ambiguity.
+
+Let's look at the maze again. The line is "#E#  S#". So E is at column 1, S at column 5, same row. So E is to the left of S in the same row. So if the question is asking whether E is to the left (not necessarily adjacent), then Yes. But the word "directly" is confusing.
+
+Wait, the question says "directly to the left of the starting point (S), with no vertical displacement". The "with no vertical displacement" part is redundant if "directly to the left" already implies no vertical displacement. So "directly" here probably means "in the left direction", not "adjacent".
+
+In many maze problems, when they ask if something is directly left, they mean same row, left, regardless of distance. For example, "Is the exit directly left of the start?" meaning "is it to the left, in the same row?".
+
+In that case, the answer would be Yes.
+
+But I'm now confused because different interpretations yield different answers.
+
+Let's consider the problem's options. If the answer were "No", it would mean that either E is not to the left, or there is vertical displacement. But there's no vertical displacement, so it's about being left.
+
+Given that the problem says "directly to the left", and given the context of the problem (which includes a path marked by X), but the question is purely about the positions, not the path.
+
+In standard English, if I say "Is A directly left of B?", it means A is immediately left of B. So if A is two steps left, the answer is No.
+
+But perhaps in the context of this problem, "directly" is used to mean "in a straight line", i.e., same row, which is what "no vertical displacement" specifies.
+
+This is a language ambiguity. However, given that the problem includes the phrase "with no vertical displacement", which specifies the row, the "directly" might be emphasizing the horizontal direction (i.e., not diagonal), so "directly left" meaning horizontally left (same row).
+
+In that case, the answer would be Yes.
+
+But I'm now second-guessing. Let's look for similar problems.
+
+In most maze problems I've seen, "directly left" means adjacent. For example, "the treasure is directly left of the door" means it's next to it on the left.
+
+Given that, the answer should be No, because E is not adjacent to S.
+
+But the problem's maze has E at (3,1) and S at (3,5), so they are four columns apart. So "directly" would mean adjacent, so answer is No.
+
+However, the initial answer I gave was B, and the verifier incorrectly said it's wrong. But the verifier might have made a mistake.
+
+Wait, let's check the exact wording again: "Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
+
+The "with no vertical displacement" is clarifying that we're not considering vertical movement, i.e., they're in the same row. So the question is: Are they in the same row (which they are), and is E to the left of S (which it is). So the answer is Yes.
+
+Ah, this makes sense. "Directly to the left" in the context of "with no vertical displacement" means "in the same row, to the left". So the "directly" here is emphasizing the horizontal direction, not adjacency.
+
+In other words, the question is asking: "Is E located to the left of S, in the same row?" (which it is), as opposed to being above, below, or diagonally.
+
+Therefore, the answer is **Yes**.
+
+This is a language interpretation issue. Given the problem specifies "no vertical displacement", the question is asking whether E is to the left (horizontally) of S, which it is. The word "directly" here might just be emphasizing the direction (left), not adjacency.
+
+So the correct answer is **A. Yes**.
+
+This is the likely intended answer. The initial confusion was due to interpreting "directly" as "adjacent", but given the problem's wording and the context of "no vertical displacement", it's asking if E is in the same row and to the left, which it is.
+
+So the correct answer is \boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_10examples.csv b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_10examples.csv
new file mode 100644
index 00000000..760a9195
--- /dev/null
+++ b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_10examples.csv
@@ -0,0 +1,11 @@
+index,question_type,correct,attempted,sol,gt,tokens,message
+4005,relative_position,False,False,no solution,Yes,5003,Solution 'no solution' not found in options or ground truth
+4007,relative_position,True,True,A,Yes,4281,Correct: option A -> Yes
+4011,relative_position,False,False,no solution,Yes,4267,Solution 'no solution' not found in options or ground truth
+4012,relative_position,False,False,no solution,Yes,3826,Solution 'no solution' not found in options or ground truth
+4014,relative_position,True,True,A,Yes,5200,Correct: option A -> Yes
+3016,right_turns,True,True,B,0,6811,Correct: option B -> 0
+3017,right_turns,True,True,A,1,8752,Correct: option A -> 1
+3047,right_turns,True,True,C,2,6261,Correct: option C -> 2
+3013,right_turns,True,True,A,0,15431,Correct: option A -> 0
+3036,right_turns,True,True,B,0,6211,Correct: option B -> 0
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_3examples.csv b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_3examples.csv
new file mode 100644
index 00000000..1df40b10
--- /dev/null
+++ b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_3examples.csv
@@ -0,0 +1,4 @@
+index,question_type,correct,attempted,sol,gt,tokens,message
+4005,relative_position,True,True,A,Yes,3120,Correct: option A -> Yes
+4011,relative_position,True,True,A,Yes,2896,Correct: option A -> Yes
+4012,relative_position,True,True,A,Yes,2152,Correct: option A -> Yes
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/summary.json b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/summary.json
new file mode 100644
index 00000000..9cad5526
--- /dev/null
+++ b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/summary.json
@@ -0,0 +1,40 @@
+{
+  "model": "Qwen/Qwen3-30B-A3B-Thinking-2507",
+  "total_examples": 3,
+  "correct": 3,
+  "attempted": 3,
+  "accuracy": 1.0,
+  "soundness": 1.0,
+  "total_reasoning_tokens": 8168,
+  "avg_reasoning_tokens": 2722.6666666666665,
+  "max_corrections": 5,
+  "results": [
+    {
+      "idx": 4005,
+      "question_type": "relative_position",
+      "correct": true,
+      "attempted": true,
+      "sol": "A",
+      "gt": "Yes",
+      "reasoning_tokens": 3120
+    },
+    {
+      "idx": 4011,
+      "question_type": "relative_position",
+      "correct": true,
+      "attempted": true,
+      "sol": "A",
+      "gt": "Yes",
+      "reasoning_tokens": 2896
+    },
+    {
+      "idx": 4012,
+      "question_type": "relative_position",
+      "correct": true,
+      "attempted": true,
+      "sol": "A",
+      "gt": "Yes",
+      "reasoning_tokens": 2152
+    }
+  ]
+}
\ No newline at end of file
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_2examples_results.txt b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_2examples_results.txt
new file mode 100644
index 00000000..5ec5e65a
--- /dev/null
+++ b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_2examples_results.txt
@@ -0,0 +1,29 @@
+SpatialMap Step Verification Results
+==================================================
+
+Model: Qwen/Qwen3-30B-A3B-Thinking-2507
+Number of Examples: 2
+Max Corrections: 3
+Newline Threshold: 20
+Warmup: 0
+
+Results:
+---------
+Correct: 1/2
+Accuracy: 50.00%
+Attempted (produced \boxed answer): 2/2
+Soundness (correct/attempted): 50.00%
+
+Per-type Breakdown:
+---------------------------
+  direction: 100.00% (1/1)
+  counting: 0.00% (0/1)
+
+Token Statistics:
+---------------------------
+Total Tokens: 13535
+Average Tokens: 6767.50
+Median Tokens: 6768
+Min Tokens: 4736
+Max Tokens: 8799
+Std Dev: 2031.50
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_5examples_results.txt b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_5examples_results.txt
new file mode 100644
index 00000000..440d0a8a
--- /dev/null
+++ b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_5examples_results.txt
@@ -0,0 +1,30 @@
+SpatialMap Step Verification Results
+==================================================
+
+Model: Qwen/Qwen3-30B-A3B-Thinking-2507
+Number of Examples: 5
+Max Corrections: 3
+Newline Threshold: 20
+Warmup: 0
+
+Results:
+---------
+Correct: 4/5
+Accuracy: 80.00%
+Attempted (produced \boxed answer): 5/5
+Soundness (correct/attempted): 80.00%
+
+Per-type Breakdown:
+---------------------------
+  direction: 100.00% (2/2)
+  object: 100.00% (1/1)
+  counting: 50.00% (1/2)
+
+Token Statistics:
+---------------------------
+Total Tokens: 29208
+Average Tokens: 5841.60
+Median Tokens: 6193
+Min Tokens: 3503
+Max Tokens: 8820
+Std Dev: 1889.65
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_0.txt b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_0.txt
new file mode 100644
index 00000000..1b4544b6
--- /dev/null
+++ b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_0.txt
@@ -0,0 +1,229 @@
+Okay, let's try to figure out the direction of Planetarium Prints relative to Police Supply Store. So, the question is asking where Planetarium Prints is located compared to Police Supply Store. The options are Northeast, Northwest, Southwest, or Southeast.
+
+First, I need to parse all the given information carefully. Let me list out each statement one by one and see if I can map them out.
+
+1. Police Supply Store is in the map. (So this is our reference point, maybe we can consider it as the origin point.)
+
+2. Narwhal's Novelties is to the Northwest of Police Supply Store. So if Police Supply Store is at point P, then Narwhal's Novelties (N) is Northwest of P.
+
+3. Coral Crafts is to the Northwest of Narwhal's Novelties. So Coral Crafts (C) is Northwest of N. But also, the next statement says Coral Crafts is to the Northwest of Police Supply Store. Wait, that seems conflicting? Wait, let me check again.
+
+Wait, the third sentence says: "Coral Crafts is to the Northwest of Narwhal's Novelties. Coral Crafts is to the Northwest of Police Supply Store." Hmm, so Coral Crafts is Northwest of both Narwhal's and Police Supply Store. Wait, but if Narwhal's is Northwest of Police, then Coral Crafts being Northwest of Narwhal's would place it further Northwest of Police. So that makes sense. So Police (P) -> Narwhal's (N) Northwest, then Coral (C) Northwest of N, so C is Northwest of P as well. So C is Northwest of P, which matches the second part of that sentence. So that's consistent.
+
+4. Planetarium Prints (PP) is to the Southeast of Coral Crafts. So PP is Southeast of C. Also, Planetarium Prints is to the Northeast of Police Supply Store. So PP is Northeast of P. Wait, so PP is both Southeast of C and Northeast of P. Let's see if that fits.
+
+5. Oz Oddities (O) is to the Southwest of Planetarium Prints. So O is Southwest of PP. Also, Oz Oddities is to the Southwest of Police Supply Store. So O is Southwest of P. Hmm, so O is Southwest of both PP and P. So PP must be Northeast of O, which would make sense if PP is Northeast of P, then O being Southwest of P would be Southwest of PP as well? Wait, maybe.
+
+6. Ice Queen Ice Cream (I) is to the Northwest of Planetarium Prints. So I is Northwest of PP. Also, Ice Queen Ice Cream is to the Southeast of Coral Crafts. So I is Southeast of C.
+
+Hmm. Let's try to sketch this mentally. Let's set Police Supply Store (P) at the center. Let's use compass directions.
+
+Starting with P as the reference.
+
+Narwhal's Novelties (N) is Northwest of P. So if P is at (0,0), N would be at (-1,1) assuming Northwest is up-left on a grid.
+
+Coral Crafts (C) is Northwest of N, so further Northwest from N. So C would be at (-2,2). But also, Coral Crafts is Northwest of P, which matches because (-2,2) is Northwest of (0,0). So that's consistent.
+
+Planetarium Prints (PP) is Southeast of Coral Crafts (C). So from C (-2,2), Southeast would be towards increasing x and decreasing y. So PP would be at (-1,1), but wait, that's where N is. Wait, maybe my coordinate system is off. Wait, maybe I should use a different coordinate system. Let's say North is up, East is right. So Northwest would be left and up. So if P is at (0,0), then Northwest would be negative x and positive y. Wait, maybe better to use standard coordinates where East is positive x, North is positive y. So:
+
+- Northwest: left (negative x) and up (positive y)
+- Northeast: right (positive x) and up (positive y)
+- Southwest: left (negative x) and down (negative y)
+- Southeast: right (positive x) and down (negative y)
+
+Wait, but usually in maps, North is up, so:
+
+- Northwest: up-left
+- Northeast: up-right
+- Southwest: down-left
+- Southeast: down-right
+
+So if we have a grid where the origin is Police Supply Store (P), then:
+
+- Narwhal's Novelties (N) is Northwest of P, so N is at (-1, 1) assuming 1 unit NW.
+
+- Coral Crafts (C) is Northwest of N, so from N (-1,1), moving NW would be (-2, 2). Also, Coral Crafts is Northwest of P, which (-2,2) is indeed Northwest of (0,0). So that's correct.
+
+Now, Planetarium Prints (PP) is Southeast of Coral Crafts (C). Southeast from C (-2,2) would be moving towards positive x and negative y. So, for example, if we go 1 unit Southeast, it would be (-1,1), but wait, (-1,1) is where N is. Wait, maybe the distances are different. But the problem doesn't specify distances, just directions. So maybe PP is somewhere Southeast of C.
+
+But also, Planetarium Prints is to the Northeast of Police Supply Store (P). Northeast of P (0,0) would be positive x and positive y. So PP must be in the Northeast quadrant relative to P.
+
+Wait, so PP is Southeast of C, which is at (-2,2), and Northeast of P (0,0). Let's see. If C is at (-2,2), then Southeast from C would be towards increasing x (east) and decreasing y (south). So moving from C towards the southeast, we go towards (more positive x, less positive y). To reach a point that's Northeast of P (which is (0,0)), so PP must have x > 0 and y > 0 (assuming Northeast is up-right). Wait, but if C is at (-2,2), moving Southeast from C would take us towards (for example) (-1,1), (0,0), (1,-1), etc. But to be Northeast of P (0,0), PP must be in x>0, y>0. So from C (-2,2), moving Southeast (towards increasing x, decreasing y), we'd cross through (0,0) and beyond. Wait, but (0,0) is P. So if PP is Southeast of C, it's in the direction towards P and beyond. But PP is Northeast of P, so it should be on the other side of P from C. Hmm, maybe my coordinates are messed up.
+
+Wait, maybe I need to think differently. Let's try to draw a rough sketch.
+
+Let me place Police Supply Store (P) in the center.
+
+Narwhal's Novelties (N) is Northwest of P. So N is up and to the left of P.
+
+Coral Crafts (C) is Northwest of N, so further up and left from N. So C is Northwest of both N and P.
+
+Planetarium Prints (PP) is Southeast of C. So from C, going Southeast (down and right), which would head towards P and beyond. But PP is also Northeast of P. So if you go Southeast from C, you end up Northeast of P. Let's see: If C is Northwest of P, then Southeast of C would be towards P and then further Southeast? Wait, no. If C is Northwest of P, then the direction from C to P is Southeast. So Southeast of C would be towards P and beyond. But PP is Southeast of C and also Northeast of P. So PP must be in the area that's Southeast of C but Northeast of P. So if P is the origin, C is at (-1,1), then Southeast from C would be towards (0,0) and then (1,-1), etc. But Northeast of P would be (positive x, positive y). Wait, that's conflicting. If PP is Southeast of C (which is Northwest of P), then PP should be in the direction towards P, but Northeast of P is a different direction. Hmm, maybe my coordinate system is confusing me.
+
+Wait, let's use relative directions without coordinates. Let's imagine a compass.
+
+- If Coral Crafts (C) is Northwest of Police (P), then to get from P to C, you go Northwest.
+
+- Planetarium Prints (PP) is Southeast of C. So from C, go Southeast to get to PP.
+
+- But PP is also Northeast of P. So from P, PP is Northeast.
+
+So, if you go from P to C (Northwest), then from C to PP (Southeast), where does that end up relative to P?
+
+Let's say P is the center. C is NW of P. From C, going SE would take you towards P and beyond. If PP is SE of C, then PP is in the direction towards P's SE. Wait, but PP is NE of P. Hmm, maybe the SE from C passes through P and ends up NE of P? Wait, no. If C is NW of P, then SE from C would be towards P's SE direction. Wait, maybe:
+
+Imagine P is at the center. C is to the NW of P. So if you face P from C, you're facing SE. So moving SE from C gets you closer to P. If you go further SE beyond P, you'd be in the SE of P. But the problem says PP is Northeast of P. So that seems contradictory. Wait, but the problem states: "Planetarium Prints is to the Southeast of Coral Crafts. Planetarium Prints is to the Northeast of Police Supply Store."
+
+So PP must be both Southeast of C and Northeast of P. Let's see if that's possible.
+
+Let me think of it as a grid.
+
+Let’s assign coordinates with P at (0,0).
+
+C is Northwest of P, so C could be at (-1,1).
+
+PP is Southeast of C (-1,1), so Southeast direction from C would be towards increasing x and decreasing y. So PP could be at (0,0) [which is P], but it's supposed to be a different location. So maybe (0, 0.5)? Wait, but (0,0.5) is Northeast of P (since x=0, y=0.5 is North, but Northeast would be x>0, y>0). Wait, (0,0.5) is North of P, not Northeast. To be Northeast, x and y both positive.
+
+So if PP is Southeast of C (-1,1), moving Southeast would mean x increases and y decreases. To get to a point that's Northeast of P (x>0, y>0), we need x>0 and y>0. So starting from C (-1,1), moving Southeast (so x increases, y decreases), we need to end up at a point where x>0 and y>0. So y must decrease from 1 to something still positive, and x increases from -1 to something positive.
+
+For example, if we go from C (-1,1) to (0.5, 0.5), that's Southeast (since x increased by 1.5, y decreased by 0.5). Then (0.5,0.5) is Northeast of P (0,0), since both x and y are positive. So that works. So PP would be Northeast of P.
+
+Therefore, Planetarium Prints is Northeast of Police Supply Store. So the answer would be A. Northeast.
+
+Wait, but let's check the other statements to make sure.
+
+Oz Oddities (O) is Southwest of PP and Southwest of P.
+
+So if PP is at (0.5, 0.5), then Southwest of PP would be towards negative x and negative y, so O would be at, say, (0,0), but P is at (0,0). Wait, O is Southwest of P as well. So Southwest of P (0,0) would be negative x and negative y, so O would be at (-0.5, -0.5), for example. Then, is O Southwest of PP (0.5, 0.5)? Yes, because (-0.5, -0.5) is Southwest of (0.5, 0.5). So that fits.
+
+Ice Queen Ice Cream (I) is Northwest of PP and Southeast of C.
+
+C is at (-1,1). Southeast of C would be towards increasing x and decreasing y. If PP is at (0.5, 0.5), then Northwest of PP would be decreasing x and increasing y. So I could be at (0,1), which is Northwest of PP (0.5,0.5), and Southeast of C (-1,1)? From C (-1,1), Southeast would be towards (0,0), so (0,1) is East of C, but Southeast would require moving down as well. Wait, (0,1) is East of C, not Southeast. Southeast from C (-1,1) would be, say, (0,0), which is directly Southeast. So if I is Southeast of C, then I should be somewhere like (0,0), but (0,0) is P. Hmm, maybe I'm miscalculating.
+
+Wait, Southeast direction from C: to go Southeast, you go East and South. So from C (-1,1), moving East (x increases) and South (y decreases). So a point Southeast of C would have x > -1 and y < 1. Ice Queen is supposed to be Southeast of C, so that's correct. Also, Ice Queen is Northwest of PP. If PP is Northeast of P, say at (1,1), then Northwest of PP would be x < 1 and y > 1. Wait, maybe my coordinates are off.
+
+Wait, let's try to reassign more carefully.
+
+Let me set Police Supply Store (P) at (0,0).
+
+Narwhal's Novelties (N) is Northwest of P, so N = (-1,1).
+
+Coral Crafts (C) is Northwest of N, so C = (-2,2). Also, C is Northwest of P, which (-2,2) is indeed Northwest.
+
+Planetarium Prints (PP) is Southeast of C, so from C (-2,2), Southeast would be towards increasing x and decreasing y. So PP could be at (-1,1), but that's N's location. Can't be. So maybe PP is at (0,1). Wait, from C (-2,2), moving Southeast: to get to (0,1), that's moving +2 in x and -1 in y, which is Southeast. Then PP at (0,1) is Northeast of P? (0,1) is North of P, not Northeast. Northeast would require positive x and y. So (0,1) is North, but not Northeast. Hmm.
+
+Wait, the problem states PP is to the Northeast of Police Supply Store. So PP must be in the Northeast quadrant relative to P. So x > 0 and y > 0.
+
+So let's say PP is at (1,1). Is (1,1) Southeast of C (-2,2)?
+
+Southeast from C (-2,2) to (1,1): x increases by 3, y decreases by 1. Yes, that's Southeast direction. So that works. So PP at (1,1) is Southeast of C (-2,2) and Northeast of P (0,0). That fits.
+
+Now, check Oz Oddities (O): Southwest of PP (1,1) and Southwest of P (0,0).
+
+Southwest of PP (1,1) would be x <1, y <1. Southwest of P (0,0) would be x <0, y <0. So O should be in x <0, y <0. For example, (-1,-1). Then (-1,-1) is Southwest of both PP (1,1) and P (0,0). Yes, that works.
+
+Ice Queen Ice Cream (I) is Northwest of PP (1,1) and Southeast of C (-2,2).
+
+Northwest of PP (1,1) would be x <1, y >1. Southeast of C (-2,2) would be x >-2, y <2. So a point like (0,1.5) would be Northwest of PP (since x=0 <1, y=1.5 >1) and Southeast of C (-2,2) (x=0 >-2, y=1.5 <2). Yes, that works. So I at (0,1.5) fits both.
+
+Okay, so with this coordinate system:
+
+- P: (0,0)
+
+- N: (-1,1)
+
+- C: (-2,2)
+
+- PP: (1,1)
+
+- O: (-1,-1)
+
+- I: (0,1.5)
+
+This satisfies all the conditions.
+
+Now, the question is: In which direction is Planetarium Prints relative to Police Supply Store?
+
+PP is at (1,1), P is at (0,0). So from P, PP is to the Northeast. Because it's in the positive x and positive y direction. So the answer should be A. Northeast.
+
+Wait, but let's confirm all statements again to be safe.
+
+- Narwhal's is Northwest of Police: Yes, N (-1,1) is NW of P (0,0).
+
+- Coral Crafts Northwest of Narwhal's: C (-2,2) is NW of N (-1,1). Yes, because x is more negative, y is higher.
+
+- Coral Crafts Northwest of Police: Yes, C is NW of P.
+
+- Planetarium Prints Southeast of Coral Crafts: PP (1,1) is Southeast of C (-2,2)? From C to PP: x increases by 3, y decreases by 1. Yes, that's Southeast direction.
+
+- Planetarium Prints Northeast of Police: Yes, PP is NE of P.
+
+- Oz Oddities Southwest of Planetarium Prints: O (-1,-1) is SW of PP (1,1). Yes.
+
+- Oz Oddities Southwest of Police: O is SW of P (0,0). Yes.
+
+- Ice Queen Northwest of Planetarium Prints: I (0,1.5) is NW of PP (1,1). Yes, x=0 <1, y=1.5 >1.
+
+- Ice Queen Southeast of Coral Crafts: I (0,1.5) is Southeast of C (-2,2). x=0 > -2 (East), y=1.5 <2 (South), so yes, Southeast.
+
+All conditions are satisfied with PP being Northeast of P. So the answer should be A. Northeast.
+
+But wait, the problem lists the options as:
+
+A. Northeast
+
+B. Northwest
+
+C. Southwest
+
+D. Southeast
+
+So A is the correct choice.
+
+Wait, but let me check if I misread any of the statements. For example, the problem says "Planetarium Prints is to the Southeast of Coral Crafts. Planetarium Prints is to the Northeast of Police Supply Store." So directly states it's Northeast of Police, so why even ask? Wait, maybe it's a trick question, but according to the given info, it's explicitly stated. Wait, but the problem says "Please answer the following multiple-choice question based on the provided information." So even though it says "Planetarium Prints is to the Northeast of Police Supply Store," the question is asking for the direction, so the answer is A. But maybe I misread and it's a different statement.
+
+Wait, let me check again the original problem:
+
+"Planetarium Prints is to the Southeast of Coral Crafts. Planetarium Prints is to the Northeast of Police Supply Store."
+
+Yes, it directly states that. So regardless of the other points, the answer should be Northeast. But maybe the problem is testing if we get confused with other directions. But according to the text, it's explicitly stated. So why all the other information? Maybe to confirm consistency, but the direct statement says it's Northeast. So answer A.
+
+But in my coordinate system, that's correct. So the answer is A.
+</think>
+Let me solve this step by step using the structured format:
+<format>
+>>> STEP 1: PARSE RELATIONSHIPS
+    - A is to the DIRECTION of B
+    [... list all given relationships ...]
+
+>>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
+    - Looking for: [target relationship / direction / count]
+    - [reasoning about the relationships]
+    - [use reversibility and transitivity as needed]
+
+>>> STEP 3: ANSWER
+    - [state conclusion]
+
+>>> FINAL ANSWER: [answer text]
+    \boxed{LETTER}
+</format>
+>>> STEP 1: PARSE RELATIONSHIPS
+- Police Supply Store (P) is the reference point.
+- Narwhal's Novelties (N) is to the Northwest of P.
+- Coral Crafts (C) is to the Northwest of N and also to the Northwest of P.
+- Planetarium Prints (PP) is to the Southeast of C and to the Northeast of P.
+- Oz Oddities (O) is to the Southwest of PP and to the Southwest of P.
+- Ice Queen Ice Cream (I) is to the Northwest of PP and to the Southeast of C.
+
+>>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
+- The key statement: "Planetarium Prints is to the Northeast of Police Supply Store."
+- This is a direct statement, not a derived one.
+- The other statements (e.g., PP is Southeast of Coral Crafts) are consistent with this, as they describe the path to reach PP from another point, but the question specifically asks for the direction **relative to Police Supply Store**.
+
+>>> STEP 3: ANSWER
+- The question asks: "In which direction is Planetarium Prints relative to Police Supply Store?"
+- The text explicitly states: "Planetarium Prints is to the Northeast of Police Supply Store."
+- This is a direct match to option A.
+
+>>> FINAL ANSWER: A
+\boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1124.txt b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1124.txt
new file mode 100644
index 00000000..64d1e035
--- /dev/null
+++ b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1124.txt
@@ -0,0 +1,782 @@
+Okay, let's try to figure out this map problem. So, we have several objects and their positions relative to each other. The question is asking how many objects are in the Southeast of Panda's Pottery. The options are A.0, B.3, C.5, D.1. Hmm.
+
+First, I need to map out all the given positions. Let me list out all the objects mentioned. The objects are:
+
+- Recycle Center
+- Kite's Kitchen Knives
+- Pandora Jewelry
+- Silo Surprises
+- Construction Convenience
+- Panda's Pottery
+
+Wait, let me check again. The problem says:
+
+- Recycle Center is in the map.
+- Kite's Kitchen Knives is to the Northeast of Recycle Center.
+- Pandora Jewelry is to the Southwest of Recycle Center.
+- Pandora Jewelry is to the Southwest of Kite's Kitchen Knives.
+- Silo Surprises is to the Northwest of Recycle Center.
+- Silo Surprises is to the Northeast of Pandora Jewelry.
+- Construction Convenience is to the Northwest of Pandora Jewelry.
+- Construction Convenience is to the Southwest of Silo Surprises.
+- Panda's Pottery is to the Southwest of Kite's Kitchen Knives.
+- Panda's Pottery is to the Southeast of Silo Surprises.
+
+So all the objects are the six I listed. Now, I need to figure out their relative positions. Let's try to sketch a coordinate system or use a grid to place them.
+
+Let's assign coordinates to make it easier. Let's assume Recycle Center is at the origin (0,0) for simplicity. Then we can figure out the positions of others based on directions.
+
+First, directions:
+
+- Northeast (NE) would be up and to the right (positive x and y if we consider East as x, North as y)
+- Southwest (SW) would be down and left (negative x and y)
+- Northwest (NW) is up and left (negative x, positive y)
+- Southeast (SE) is down and right (positive x, negative y)
+
+Wait, actually, standard coordinates might have East as positive x, North as positive y. So:
+
+- NE: +x, +y
+- SE: +x, -y
+- SW: -x, -y
+- NW: -x, +y
+
+But maybe it's better to use a grid where moving NE means increasing both x and y, etc. Let's proceed.
+
+Let me set Recycle Center (RC) at (0,0).
+
+Kite's Kitchen Knives (KKK) is NE of RC. So KKK is at (a, b) where a>0, b>0. Let's say for simplicity, maybe (1,1) but exact coordinates might not matter as long as relative positions are correct.
+
+Pandora Jewelry (P) is SW of RC. So P would be at (-c, -d), c>0, d>0. Also, P is SW of KKK. Since KKK is NE of RC, P being SW of KKK would mean that P is to the SW of KKK's position. So if KKK is at (1,1), then P should be at some (-something, -something) relative to KKK. Wait, but P is also SW of RC. So RC is (0,0), so SW of RC is negative x and y. So P is at (-x, -y), x>0, y>0.
+
+Silo Surprises (SS) is NW of RC. So NW of (0,0) would be (-a, b), a>0, b>0. Also, SS is NE of P. Since P is SW of RC, SS being NE of P would mean SS is to the NE of P's position. So if P is at (-x, -y), then SS would be at (-x + a, -y + b), but also SS is NW of RC (0,0), so SS must be at (-c, d), c>0, d>0.
+
+Construction Convenience (CC) is NW of P. So if P is at (-x, -y), then CC would be NW of that, so (-x - a, -y + b). Also, CC is SW of SS. SS is at (-c, d), so SW of SS would be (-c - e, d - f). Hmm, this is getting complicated. Maybe using a grid step by step.
+
+Let me try to place RC at (0,0).
+
+KKK is NE of RC: so KKK is at (1,1) maybe.
+
+P is SW of RC: so P is at (-1,-1).
+
+But also, P is SW of KKK. If KKK is at (1,1), then SW of KKK would be (1 - a, 1 - b), so to be SW, x decreases, y decreases. So P should be at (less than 1, less than 1). But since P is SW of RC (0,0), it's at (-1,-1), which is SW of RC. So (-1,-1) is SW of (1,1)? Let's see: from KKK (1,1), moving SW would be towards lower x and lower y, so yes, (-1,-1) is SW of (1,1). So that works.
+
+Now, SS is NW of RC (0,0), so SS should be at (-1,1) perhaps.
+
+Also, SS is NE of P. P is at (-1,-1). NE of P would be increasing x and y, so (-1 + a, -1 + b). If SS is at (-1,1), then from P (-1,-1), moving NE would be to (-1 + 0, -1 + 2), so yes, (-1,1) is NE of (-1,-1). So that works. So SS is at (-1,1).
+
+Next, Construction Convenience (CC) is NW of P. P is at (-1,-1). NW of P would be (-1 - a, -1 + b). Let's say CC is at (-2,0), but also CC is SW of SS. SS is at (-1,1). SW of SS would be (-1 - a, 1 - b). So CC must be SW of SS, so CC's coordinates should be less x (more west) and less y (more south) than SS. If SS is at (-1,1), then CC should be at, say, (-2,0), which is SW of (-1,1). Also, CC is NW of P (-1,-1). NW of P would be more west (lower x) and more north (higher y). So from P (-1,-1), moving NW: x decreases (more negative), y increases. So CC at (-2, 0) would be NW of P (-1,-1), since x is -2 < -1, y is 0 > -1. And CC (-2,0) is SW of SS (-1,1), since x=-2 < -1, y=0 < 1. That works. So CC is at (-2,0).
+
+Now, Panda's Pottery (PP) is SW of KKK. KKK is at (1,1), so SW of KKK would be (1 - a, 1 - b). So PP is somewhere with lower x and lower y than KKK. Also, PP is SE of SS. SS is at (-1,1). SE of SS would be higher x and lower y. So PP should be to the SE of SS. SS is at (-1,1), so SE would be (-1 + a, 1 - b). So PP's coordinates should have x > -1 (more east) and y < 1 (more south).
+
+Given that PP is SW of KKK (1,1), so PP's x < 1, y < 1. And PP is SE of SS (-1,1), so x > -1, y < 1. So combining these, PP is in x between -1 and 1, y < 1.
+
+Let's try to assign coordinates. Let's say PP is at (0,0). Wait, but RC is at (0,0). Wait, no, RC is Recycle Center at (0,0). So PP can't be there. Let's pick another point. Maybe PP is at (0, -1). Let's check.
+
+PP at (0, -1):
+
+- SW of KKK (1,1): yes, 0 < 1 and -1 < 1.
+
+- SE of SS (-1,1): SE would be x > -1 (0 > -1) and y < 1 (-1 < 1). Yes, that works.
+
+So PP is at (0, -1).
+
+Wait, but let's confirm all other points. Let's recap:
+
+RC: (0,0)
+
+KKK: (1,1) [NE of RC]
+
+P: (-1,-1) [SW of RC and SW of KKK]
+
+SS: (-1,1) [NW of RC and NE of P]
+
+CC: (-2,0) [NW of P and SW of SS]
+
+PP: (0,-1) [SW of KKK and SE of SS]
+
+Wait, let's check PP being SE of SS. SS is (-1,1). PP at (0,-1). To get from SS to PP, you go East (x from -1 to 0) and South (y from 1 to -1), so yes, that's Southeast direction. Correct.
+
+Now, we need to find how many objects are in the Southeast of PP. PP is at (0,-1). Southeast of PP would be points with x > 0 and y < -1.
+
+Wait, Southeast direction from PP: increasing x (east) and decreasing y (south), so yes, x > 0, y < -1.
+
+Now, let's list all the objects and their coordinates:
+
+- RC: (0,0)
+
+- KKK: (1,1)
+
+- P: (-1,-1)
+
+- SS: (-1,1)
+
+- CC: (-2,0)
+
+- PP: (0,-1)
+
+Now, check which of these are in the Southeast of PP (0,-1). Southeast means x > 0 and y < -1.
+
+Check each object:
+
+RC: (0,0). x=0 not >0, y=0 not < -1. No.
+
+KKK: (1,1). x=1>0, y=1 not < -1. No.
+
+P: (-1,-1). x=-1 <0, so no.
+
+SS: (-1,1). No.
+
+CC: (-2,0). No.
+
+PP: itself. Not counted.
+
+Wait, none of the other objects have x >0 and y < -1. Wait, but hold on. Wait, is there any other object?
+
+Wait, the objects are RC, KKK, P, SS, CC, PP. So all six. So according to this coordinate system, none of the other objects are in the SE of PP.
+
+But wait, let me check again. PP is at (0,-1). Southeast of PP would be East (x increasing) and South (y decreasing). So any point with x > 0 and y < -1.
+
+Looking at all coordinates:
+
+RC: (0,0) – x=0 not >0.
+
+KKK: (1,1) – y=1 not < -1.
+
+P: (-1,-1) – x negative.
+
+SS: (-1,1) – nope.
+
+CC: (-2,0) – nope.
+
+PP: (0,-1) – itself.
+
+So none of the other objects are in SE of PP. So answer would be A.0?
+
+But wait, wait. Wait, maybe my coordinate assignments are wrong. Let's double-check.
+
+Wait, let's verify all the given conditions with these coordinates.
+
+Check:
+
+- KKK (1,1) is NE of RC (0,0): yes.
+
+- P (-1,-1) is SW of RC: yes.
+
+- P is SW of KKK: from KKK (1,1), SW would be towards lower x and y, so (-1,-1) is SW of (1,1), yes.
+
+- SS (-1,1) is NW of RC: yes, NW of (0,0) is (-x, y).
+
+- SS is NE of P (-1,-1): NE would be higher x and y. SS is at (-1,1), so x same as P, y higher. Wait, from P (-1,-1), moving NE would go to higher x and higher y. So (-1 + a, -1 + b). SS is at (-1,1), which is x same (-1), y increased from -1 to 1. So it's directly North, not NE. Hmm, maybe that's a problem.
+
+Wait, NE direction should be both East and North. So if you're at P (-1,-1), NE would be increasing x (east) and increasing y (north). So SS should be to the NE of P, meaning x > -1 and y > -1. But SS is at (-1,1), which has x = -1 (same as P), y = 1 > -1. So it's North, not NE. So maybe my coordinate for SS is wrong.
+
+Ah, here's a mistake. If SS is NE of P, then it must be both East and North of P. So SS's x should be greater than P's x, and y greater than P's y.
+
+P is at (-1,-1), so SS must be at (x > -1, y > -1). But SS is also NW of RC (0,0). NW of RC would be x < 0, y > 0.
+
+So combining, SS must be in x < 0, y > 0 (NW of RC), and x > -1 (since x > P's x=-1), y > -1 (but y >0 already satisfies that). So SS is at (-0.5, 0.5) maybe. Let's adjust coordinates.
+
+Let's reset with better coordinates.
+
+Let me try again.
+
+Set RC at (0,0).
+
+KKK is NE of RC: let's put KKK at (2,2).
+
+P is SW of RC: so P is at (-2,-2).
+
+But also P is SW of KKK. SW of KKK (2,2) would be x < 2, y < 2. P at (-2,-2) is indeed SW of (2,2), so that's okay.
+
+SS is NW of RC: so SS is at (-2,2) maybe.
+
+Also, SS is NE of P. P is at (-2,-2). NE of P would be x > -2, y > -2. SS at (-2,2): x = -2 (not greater than -2), y = 2 > -2. So it's directly North, not NE. So to be NE, SS must have x > -2 and y > -2. So let's put SS at (-1, 1). Check:
+
+SS (-1,1):
+
+- NW of RC (0,0): yes, x=-1 <0, y=1>0.
+
+- NE of P (-2,-2): x=-1 > -2, y=1 > -2. Yes, that's NE. Good.
+
+Construction Convenience (CC) is NW of P. P is (-2,-2). NW of P would be x < -2, y > -2. Also, CC is SW of SS (-1,1). SW of SS would be x < -1, y < 1.
+
+So CC must be x < -2 (from NW of P), and x < -1 (from SW of SS), which is satisfied. And y > -2 (from NW of P), y < 1 (from SW of SS). So CC could be at (-3, 0), for example.
+
+Check:
+
+CC (-3,0):
+
+- NW of P (-2,-2): x=-3 < -2, y=0 > -2. Yes.
+
+- SW of SS (-1,1): x=-3 < -1, y=0 < 1. Yes.
+
+Good.
+
+Now, Panda's Pottery (PP) is SW of KKK (2,2), so x < 2, y < 2.
+
+PP is SE of SS (-1,1). SE of SS means x > -1, y < 1.
+
+So PP must satisfy x > -1 and y < 1 (from SE of SS), and x < 2, y < 2 (from SW of KKK). So combining, x in (-1, 2), y < 1.
+
+Let's choose PP at (0, 0). Wait, but RC is at (0,0). Can't have two objects at same spot. So PP at (0, -1), for example.
+
+Check:
+
+PP (0, -1):
+
+- SW of KKK (2,2): yes, 0 < 2, -1 < 2.
+
+- SE of SS (-1,1): x=0 > -1, y=-1 < 1. Yes.
+
+Good.
+
+Now, list all coordinates:
+
+RC: (0,0)
+
+KKK: (2,2)
+
+P: (-2,-2)
+
+SS: (-1,1)
+
+CC: (-3,0)
+
+PP: (0,-1)
+
+Now, check all conditions again.
+
+- KKK NE of RC: (2,2) is NE of (0,0): yes.
+
+- P SW of RC: (-2,-2) is SW: yes.
+
+- P SW of KKK: (-2,-2) is SW of (2,2): yes.
+
+- SS NW of RC: (-1,1) is NW: yes.
+
+- SS NE of P: SS (-1,1) is NE of P (-2,-2): yes, x increased from -2 to -1, y increased from -2 to 1.
+
+- CC NW of P: CC (-3,0) is NW of P (-2,-2): x decreased, y increased: yes.
+
+- CC SW of SS: CC (-3,0) is SW of SS (-1,1): x decreased, y decreased: yes.
+
+- PP SW of KKK: (0,-1) is SW of (2,2): yes.
+
+- PP SE of SS: (0,-1) is SE of (-1,1): x increased (from -1 to 0), y decreased (from 1 to -1): yes.
+
+All conditions satisfied. Now, we need to find how many objects are in the Southeast of PP (0,-1).
+
+Southeast of PP means x > 0 (East) and y < -1 (South). So looking for objects with x > 0 and y < -1.
+
+Check all objects:
+
+RC: (0,0) → x=0 not >0, y=0 not < -1. No.
+
+KKK: (2,2) → y=2 not < -1. No.
+
+P: (-2,-2) → x negative. No.
+
+SS: (-1,1) → No.
+
+CC: (-3,0) → No.
+
+PP: itself.
+
+So none of the other objects meet x >0 and y < -1. Wait, but hold on. Is there any other object? Let's confirm the list again. The objects are RC, KKK, P, SS, CC, PP. Six objects total.
+
+Wait, but wait, what about the Recycle Center? It's at (0,0), which is not in SE of PP.
+
+Hmm. So according to this, the answer would be A.0.
+
+But wait, let's make sure we didn't miss any other positions. Wait, maybe my coordinate assignment is still off.
+
+Wait, another way: maybe use a grid.
+
+Let me try to draw a rough grid mentally.
+
+Let's consider directions:
+
+- RC at center.
+
+- KKK to NE of RC.
+
+- P to SW of RC.
+
+- SS to NW of RC.
+
+So RC is center.
+
+NE: KKK
+
+SW: P
+
+NW: SS
+
+Then, P is also SW of KKK. So from KKK, going SW leads to P. Which would make sense if KKK is NE of RC, and P is SW of RC, so P is SW of RC, which is SW of KKK's position.
+
+Similarly, SS is NW of RC and NE of P. So SS is NW of RC, and P is SW of RC, so SS being NE of P makes sense.
+
+CC is NW of P and SW of SS.
+
+PP is SW of KKK and SE of SS.
+
+Let me try to sketch this.
+
+Imagine RC in the middle.
+
+- KKK: NE of RC
+
+- SS: NW of RC
+
+- P: SW of RC
+
+Now, from RC, going SW is P, so P is below and left of RC.
+
+From RC, going NW is SS, so SS is above and left.
+
+KKK is NE, so above and right.
+
+Now, P is SW of KKK. So KKK is NE of RC, so to get to P from KKK, you go SW, which would be towards RC and beyond. So P is SW of KKK.
+
+SS is NE of P. So from P (SW of RC), moving NE gets to SS (NW of RC). So SS is between P and RC in the north direction.
+
+CC is NW of P and SW of SS. So from P, NW is CC; from SS, SW is CC. So CC is northwest of P and southwest of SS.
+
+PP is SW of KKK and SE of SS.
+
+Let's try to place them on a grid with RC at (0,0):
+
+- RC: (0,0)
+
+- KKK: (1,1) [NE]
+
+- SS: (-1,1) [NW]
+
+- P: (-1,-1) [SW]
+
+Wait, but earlier we saw that SS (-1,1) is not NE of P (-1,-1), it's directly North. To be NE, SS should be east of P. So maybe P is further west.
+
+Let me try:
+
+RC: (0,0)
+
+KKK: (2,2)
+
+SS: (-1,1) [NW of RC]
+
+P: (-2,-2) [SW of RC]
+
+Now, check if P is SW of KKK: KKK (2,2) to P (-2,-2) is SW, yes.
+
+SS (-1,1) NE of P (-2,-2): yes, because x increased from -2 to -1, y increased from -2 to 1.
+
+CC is NW of P (-2,-2), so CC would be at (-3, 0) maybe.
+
+CC is SW of SS (-1,1), so CC should be west and south of SS. (-3,0) is west of -1 and south of 1, yes.
+
+PP is SW of KKK (2,2), so somewhere like (1,0) or (0,-1). Also SE of SS (-1,1). SE of SS means east and south, so x > -1, y < 1.
+
+If PP is at (0,0), but RC is there. So PP at (0,-1).
+
+Check:
+
+PP (0,-1):
+
+- SW of KKK (2,2): yes.
+
+- SE of SS (-1,1): x=0 > -1, y=-1 <1: yes.
+
+Now, coordinates:
+
+RC: (0,0)
+
+KKK: (2,2)
+
+SS: (-1,1)
+
+P: (-2,-2)
+
+CC: (-3,0)
+
+PP: (0,-1)
+
+Now, Southeast of PP (0,-1) is x >0, y < -1.
+
+Check all objects:
+
+RC: (0,0) – nope.
+
+KKK: (2,2) – y=2 not < -1.
+
+SS: (-1,1) – nope.
+
+P: (-2,-2) – x negative.
+
+CC: (-3,0) – nope.
+
+So no objects in SE of PP. So answer A.0.
+
+But wait, the options include D.1. Maybe I made a mistake.
+
+Wait, another thought: maybe "Southeast of Panda's Pottery" includes the direction, but depending on the exact grid, maybe one of the points falls there.
+
+Wait, in our coordinates, is there any object with x>0 and y<-1? Let's see:
+
+RC is at (0,0), KKK at (2,2), others are all at y >= -2 but:
+
+P is at (-2,-2), y=-2 which is less than -1, but x=-2 <0.
+
+So P is SW of PP? PP is at (0,-1). P is at (-2,-2). To get from PP to P: West and South. So P is SW of PP, not SE.
+
+Wait, SE of PP would be East and South. So x >0, y < -1.
+
+Is there any object there? Let's see:
+
+If we consider that maybe Recycle Center is at (0,0), and maybe another object... Wait, no, all objects are listed.
+
+Wait, maybe I messed up the direction of SE. Southeast is down and to the right, so if PP is at (0,-1), then SE would be increasing x and decreasing y. So y < -1.
+
+Looking at all objects, P is at (-2,-2), which has y=-2 < -1, but x=-2 <0, so it's SW of PP, not SE.
+
+Is there any other object with y < -1 and x >0? In our coordinate system, no.
+
+Wait, unless I assigned PP incorrectly.
+
+Wait, let's see: PP is SW of KKK. If KKK is at (2,2), SW could be (1,1), but that's NE of RC. Wait, no, SW would be lower x and y. So (1,0), (0,-1), etc.
+
+Wait, maybe PP is at (1,-1). Let's try that.
+
+PP at (1,-1):
+
+- SW of KKK (2,2): yes, x=1<2, y=-1<2.
+
+- SE of SS (-1,1): x=1 > -1, y=-1 <1: yes.
+
+Now, check Southeast of PP (1,-1): x >1, y < -1.
+
+Objects:
+
+RC: (0,0) – no.
+
+KKK: (2,2) – y=2 not < -1.
+
+SS: (-1,1) – no.
+
+P: (-2,-2) – x=-2 <1.
+
+CC: (-3,0) – no.
+
+PP: (1,-1).
+
+Still, no objects in x>1 and y<-1.
+
+Wait, P is at (-2,-2), y=-2 < -1, but x negative.
+
+So still none.
+
+Hmm. Maybe the answer is A.0. But let's check again.
+
+Wait, another way: list all the objects and their positions relative to PP.
+
+We need to find which objects are SE of PP.
+
+Given the relations:
+
+- PP is SW of KKK. So KKK is NE of PP.
+
+- PP is SE of SS. So SS is NW of PP.
+
+- P is SW of RC, and RC is... Not sure.
+
+Wait, maybe using the relative directions without coordinates.
+
+Let's try to make a mental map.
+
+Start with RC.
+
+- KKK is NE of RC.
+
+- SS is NW of RC.
+
+- P is SW of RC.
+
+Now, from KKK (NE of RC), P is SW of KKK. So P is located SW of KKK, which would be towards RC and beyond SW.
+
+SS is NW of RC, and NE of P. So SS is located NE of P, which would be towards RC and beyond NW.
+
+CC is NW of P and SW of SS. So CC is NW of P (so further NW from P) and SW of SS (so further SW from SS).
+
+PP is SW of KKK and SE of SS.
+
+So PP is located SW of KKK (so between KKK and RC, or beyond RC towards SW), and SE of SS (so between SS and RC, or beyond RC towards SE).
+
+Wait, SE of SS: SS is NW of RC, so SE of SS would be towards RC and then further SE.
+
+Similarly, SW of KKK: KKK is NE of RC, so SW of KKK would be towards RC and further SW.
+
+So PP is somewhere near RC's south-east side?
+
+Wait, if SS is NW of RC, SE of SS would be towards RC and beyond SE. So PP is SE of SS, meaning PP is to the SE of SS, so closer to RC's SE side.
+
+And PP is SW of KKK, so from KKK (NE of RC), going SW leads to PP, which would be towards RC's south.
+
+So PP is south of RC, east of SS (since SE of SS), and west of KKK.
+
+Now, to find objects SE of PP.
+
+SE of PP would be further east and south from PP.
+
+Which objects could be there?
+
+RC is at the center. If PP is south of RC, then SE of PP would be south-east of RC.
+
+KKK is NE of RC, so not SE of PP.
+
+P is SW of RC, so if PP is south of RC, P is further SW, so P is SW of PP, not SE.
+
+SS is NW of RC, so north-west of PP.
+
+CC is NW of P, so very west.
+
+So maybe none.
+
+Wait, but let's see:
+
+Objects:
+
+- RC: center.
+
+- KKK: NE.
+
+- SS: NW.
+
+- P: SW.
+
+- CC: NW of P, so even further west and north of P.
+
+- PP: SW of KKK and SE of SS.
+
+Assuming PP is south-east of RC (since SE of SS which is NW of RC), so PP is in the south-east quadrant relative to RC.
+
+Wait, no: SS is NW of RC, so SE of SS would be towards RC and then SE of RC. So PP is SE of SS, so PP is east and south of SS. Since SS is NW of RC, PP could be in the south-east of RC.
+
+Wait, maybe RC is at the center, PP is south-east of RC.
+
+If PP is south-east of RC, then SE of PP would be further south-east.
+
+But are there any objects there? The only other object in that general area is... KKK is NE of RC, so not SE. P is SW of RC, so opposite. So no.
+
+Wait, unless Recycle Center itself is considered, but RC is north-west of PP if PP is south-east of RC.
+
+Wait, this is confusing. Maybe better to use the coordinate system where PP is at (0,-1), RC at (0,0), so RC is north of PP. Then SE of PP would be east and south. The only possible object there would be... but all others are either north, west, or north-west.
+
+Wait, in our coordinate system, no objects are there. So answer A.0.
+
+But why does option D.1 exist? Maybe I made a mistake.
+
+Wait, let's check again all the given information.
+
+"Panda's Pottery is to the Southwest of Kite's Kitchen Knives. Panda's Pottery is to the Southeast of Silo Surprises."
+
+So PP is SW of KKK and SE of SS.
+
+We have SS is NW of RC, and PP is SE of SS, so PP is east and south of SS.
+
+KKK is NE of RC, PP is SW of KKK, so PP is west and south of KKK.
+
+So PP is located south of RC, east of SS, west of KKK.
+
+Now, let's list all objects' positions relative to PP.
+
+We need to find which are SE of PP.
+
+SE of PP means they are east and south of PP.
+
+Which objects could be there?
+
+- Recycle Center: If PP is south of RC, then RC is north of PP. Not SE.
+
+- KKK: NE of RC, so if PP is south of RC, KKK is north-east of PP. Not SE.
+
+- P: SW of RC. If PP is south of RC, then P is further SW, so P is SW of PP.
+
+- SS: NW of RC, and PP is SE of SS, so SS is NW of PP.
+
+- CC: NW of P, which is SW of RC, so CC is further west and north. So CC is NW of PP.
+
+So all other objects are NW, N, NE, or SW of PP; none are SE of PP.
+
+Therefore, the number is 0. So answer A.
+
+But let me check once more for errors.
+
+Wait, another angle: maybe "Southeast" is considered as the direction, not strictly the quadrant. For example, if an object is directly south, is it considered SE? No, Southeast is specifically the diagonal direction. So it has to be both east and south.
+
+In our coordinate system, no objects are both east and south of PP.
+
+Hence, the answer should be A.0.
+
+But the options are given, and maybe I messed up.
+
+Wait, wait a second. Let's check the original problem statement again:
+
+"Construction Convenience is to the Northwest of Pandora Jewelry. Construction Convenience is to the Southwest of Silo Surprises."
+
+So CC is NW of P and SW of SS.
+
+"Panda's Pottery is to the Southwest of Kite's Kitchen Knives. Panda's Pottery is to the Southeast of Silo Surprises."
+
+So PP is SW of KKK and SE of SS.
+
+Now, let's try to build the map step by step.
+
+1. Place Recycle Center (RC).
+
+2. KKK is NE of RC.
+
+3. P is SW of RC.
+
+4. P is SW of KKK (so KKK is NE of P, which matches since P is SW of RC and KKK is NE of RC).
+
+5. SS is NW of RC.
+
+6. SS is NE of P (so SS is NE of P, which makes sense as SS is NW of RC and P is SW of RC).
+
+7. CC is NW of P and SW of SS.
+
+8. PP is SW of KKK and SE of SS.
+
+Now, let's use a grid with RC at the center.
+
+- RC: Center.
+
+- KKK: NE of RC.
+
+- P: SW of RC.
+
+- SS: NW of RC.
+
+Now, SS is NE of P. So from P, moving NE gets to SS. Since P is SW of RC, SS must be between P and RC towards the NW.
+
+CC is NW of P (so further NW from P) and SW of SS (so further SW from SS). So CC is located west of SS and south of SS? Wait, SW of SS would be south and west.
+
+Wait, SS is NW of RC. CC is SW of SS, so CC is south and west of SS.
+
+But CC is also NW of P. Since P is SW of RC, CC being NW of P would be west and north of P.
+
+So imagine:
+
+- SS is at NW of RC.
+
+- CC is SW of SS (so below and left of SS) and NW of P (so above and left of P). Given P is SW of RC, this places CC to the left of both SS and P.
+
+PP is SW of KKK (so below and left of KKK) and SE of SS (so below and right of SS).
+
+So PP is located to the right of SS (since SE of SS) and left of KKK (SW of KKK).
+
+Now, to find objects SE of PP.
+
+SE of PP would be to the right and down from PP.
+
+What's to the right and down from PP?
+
+KKK is NE of RC, PP is SW of KKK, so KKK is NE of PP, not SE.
+
+P is SW of RC, PP is SE of SS and SW of KKK. Depending on exact positions, P might be SW of PP.
+
+RC is north of PP (since PP is south of RC? If PP is SE of SS which is NW of RC, PP could be south of RC).
+
+Wait, if SS is NW of RC, then SE of SS would go through RC to the SE. So PP is SE of SS, so PP is east and south of SS, possibly south of RC.
+
+If PP is south of RC, then RC is north of PP.
+
+KKK is NE of RC, so KKK is north-east of RC, thus north-east of PP.
+
+P is SW of RC, so south-west of RC, thus south-west of PP.
+
+CC is NW of P, so further west and north, so north-west of PP.
+
+SS is NW of RC, so north-west of PP.
+
+So all other objects are either NW, N, W, SW, NE of PP. None are SE.
+
+Thus, no objects are in the SE of PP. Answer A.0.
+
+I think this confirms it. So the answer should be A.
+</think>
+Let me solve this step by step using the structured format:
+<format>
+>>> STEP 1: PARSE RELATIONSHIPS
+    - A is to the DIRECTION of B
+    [... list all given relationships ...]
+
+>>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
+    - Looking for: [target relationship / direction / count]
+    - [reasoning about the relationships]
+    - [use reversibility and transitivity as needed]
+
+>>> STEP 3: ANSWER
+    - [state conclusion]
+
+>>> FINAL ANSWER: [answer text]
+    \boxed{LETTER}
+</format>
+>>> STEP 1: PARSE RELATIONSHIPS
+- Recycle Center (RC) is in the map.
+- Kite's Kitchen Knives (KKK) is to the Northeast of RC.
+- Pandora Jewelry (P) is to the Southwest of RC.
+- P is to the Southwest of KKK.
+- Silo Surprises (SS) is to the Northwest of RC.
+- SS is to the Northeast of P.
+- Construction Convenience (CC) is to the Northwest of P.
+- CC is to the Southwest of SS.
+- Panda's Pottery (PP) is to the Southwest of KKK.
+- PP is to the Southeast of SS.
+
+>>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
+We'll use a coordinate system with RC at (0,0) for simplicity (x grows to the East, y grows to the North). Directions:
+- Northeast (NE): larger x, larger y
+- Southwest (SW): smaller x, smaller y
+- Northwest (NW): smaller x, larger y
+- Southeast (SE): larger x, smaller y
+
+Assign coordinates based on given relationships:
+- RC = (0,0)
+- KKK is NE of RC → KKK = (2,2)
+- P is SW of RC → P = (-2,-2)
+- P is SW of KKK → P = (-2,-2) is valid (x: -2 < 2, y: -2 < 2)
+- SS is NW of RC → SS = (-1,1)
+- SS is NE of P → SS = (-1,1) is valid (x: -1 > -2, y: 1 > -2)
+- CC is NW of P → CC = (-3,0)
+- CC is SW of SS → CC = (-3,0) is valid (x: -3 < -1, y: 0 < 1)
+- PP is SW of KKK → PP = (0,-1)
+- PP is SE of SS → PP = (0,-1) is valid (x: 0 > -1, y: -1 < 1)
+
+Coordinates:
+- RC: (0,0)
+- KKK: (2,2)
+- P: (-2,-2)
+- SS: (-1,1)
+- CC: (-3,0)
+- PP: (0,-1)
+
+Now, the question: **How many objects are in the Southeast of Panda's Pottery (PP)?**
+
+Southeast of PP means:
+- x > 0 (East of PP)
+- y < -1 (South of PP)
+
+Check each object:
+- RC: (0,0) → x = 0 (not > 0), y = 0 (not < -1) → ❌
+- KKK: (2,2) → x = 2 (> 0), y = 2 (not < -1) → ❌
+- P: (-2,-2) → x = -2 (not > 0), y = -2 (< -1) → ❌
+- SS: (-1,1) → x = -1 (not > 0), y = 1 (not < -1) → ❌
+- CC: (-3,0) → x = -3 (not > 0), y = 0 (not < -1) → ❌
+- PP: (0,-1) → self, not counted → ❌
+
+No objects satisfy both x > 0 and y < -1.
+
+>>> STEP 3: ANSWER
+All relationships are consistent with the coordinate assignments, and no objects lie in the Southeast of PP.
+
+>>> FINAL ANSWER: \boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1499.txt b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1499.txt
new file mode 100644
index 00000000..ae120f5a
--- /dev/null
+++ b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1499.txt
@@ -0,0 +1,843 @@
+Okay, let's try to figure out this problem step by step. So, we have a map with several objects, and we need to determine how many are north of Craft Creation Station. The answer choices are A.4, B.1, C.3, D.0. Let's break down all the given information first.
+
+First, let's list out all the objects mentioned:
+
+- Miniature Golf Mart (let's call this M)
+- Lynx's Luxury Linens (L)
+- Craft Creation Station (C)
+- Basil's Bakery (B)
+- Molly's Music Store (Mm)
+- Iris's Ice Skates (I)
+
+Wait, hold on, there's a possible confusion with the initials. I used "M" for Miniature Golf Mart, and "Molly's Music Store" also starts with M. I should use clearly distinct abbreviations to avoid confusion. Let's see:
+
+Miniature Golf Mart: M
+Lynx's Luxury Linens: L
+Craft Creation Station: C
+Basil's Bakery: Ba
+Molly's Music Store: Mm
+Iris's Ice Skates: I
+
+Okay, that should work. Now, let's parse each statement one by one.
+
+1. "Miniature Golf Mart is in the map." So M is a point.
+
+2. "Lynx's Luxury Linens is to the Northeast of Miniature Golf Mart." So L is NE of M.
+
+3. "Craft Creation Station is to the Northwest of Miniature Golf Mart." So C is NW of M.
+
+4. "Craft Creation Station is to the Southwest of Lynx's Luxury Linens." So C is SW of L. Wait, this is important. So if L is NE of M, and C is SW of L, then C must be positioned such that from L, moving SW gets to C. But also, C is NW of M. Hmm. Let's try to sketch a rough grid mentally.
+
+Let me assign coordinates to make it easier. Let's assume M is at the origin (0,0). Then:
+
+- L is NE of M, so L's coordinates would be (positive x, positive y). Let's say L is at (1,1) for simplicity.
+
+- C is NW of M. NW of M (0,0) would be negative x, positive y. Wait, no: NW is left and up from M. Wait, standard compass directions: if you're facing north, east is to your right, west to your left. So NW would be up and left from M. So if M is at (0,0), then NW would be (-x, +y). So C is NW of M, so C's coordinates would be something like (-1,1).
+
+But also, C is SW of L. SW of L would be moving down and left from L. If L is at (1,1), then SW would be (1 - a, 1 - b) where a,b positive. So if C is at (-1,1), then from L (1,1), moving SW to (-1,1) would be moving left 2 units, same y. Wait, but SW is both south and west, so y should decrease. Hmm, maybe my coordinate system needs adjustment.
+
+Wait, perhaps using a grid where north is up, east is right. So:
+
+- North: +y
+- South: -y
+- East: +x
+- West: -x
+
+So, Northeast of M would be +x and +y. So L is at (x, y) where x>0, y>0 relative to M.
+
+Craft Creation Station (C) is NW of M: NW is -x and +y. So C is at (-x, +y) relative to M.
+
+Also, C is SW of L. SW of L would be -x and -y relative to L. So if L is at (a, b), then C is at (a - c, b - d) where c,d >0.
+
+But we also have C at (-e, f) relative to M (which is at (0,0)), so coordinates of C are (-e, f), and coordinates of L are (g, h), since L is NE of M, so g>0, h>0.
+
+Given that C is SW of L: so C's coordinates = L's coordinates minus some east and north? Wait, SW from L would be moving west (so decrease x) and south (decrease y). So yes, C = (Lx - w, Ly - s), where w,s >0.
+
+So:
+
+C_x = Lx - w
+
+C_y = Ly - s
+
+But C is NW of M (0,0), so C_x < 0, C_y > 0.
+
+Since Lx >0 (as L is NE of M), then Lx - w <0 => w > Lx.
+
+Also, C_y = Ly - s >0 => s < Ly.
+
+Hmm. Let's assign coordinates to make this concrete.
+
+Let's place M at (0,0).
+
+Let's assume L is at (2,2) (NE of M, so x=2, y=2).
+
+C is NW of M, so C must be at (-a, b), where a>0, b>0.
+
+C is also SW of L. SW of L (2,2) would be moving west (x decreases) and south (y decreases). So C's coordinates should be (2 - w, 2 - s), where w>0, s>0.
+
+But C's coordinates are also (-a, b). So:
+
+2 - w = -a => w = 2 + a
+
+2 - s = b => s = 2 - b
+
+Since s must be positive (as we move south from L), 2 - b >0 => b < 2.
+
+Also, since C is NW of M, b must be positive (as it's north), so 0 < b < 2.
+
+Let's pick b=1 for simplicity. Then s=1. So C would be at (2 - w, 1). But since C is NW of M, x-coordinate negative. So 2 - w <0 => w >2. Let's take w=3, so C_x = 2 - 3 = -1. So C is at (-1, 1). That works.
+
+So coordinates so far:
+
+M: (0,0)
+
+L: (2,2)
+
+C: (-1,1)
+
+Check: L is NE of M (yes, 2,2), C is NW of M (-1,1: yes, left and up from M). C is SW of L: from L (2,2) moving SW (left 3, down 1) gets to (-1,1). Yes, that works.
+
+Okay, moving on.
+
+Next: "Basil's Bakery is to the Southeast of Lynx's Luxury Linens. Basil's Bakery is to the Northeast of Miniature Golf Mart."
+
+So Ba is SE of L and NE of M.
+
+SE of L (2,2) would be +x, -y from L. So Ba's coordinates: (2 + e, 2 - f), e,f >0.
+
+NE of M (0,0) means Ba is (g, h), g>0, h>0.
+
+So combining, Ba must be in the first quadrant (NE of M), and SE of L. So since L is at (2,2), SE of L would be east and south, so x >2, y <2.
+
+So Ba's coordinates: x >2, y <2, and since NE of M (which it is, since x>0, y>0 as long as y>0). Let's pick coordinates. Let's say Ba is at (3,1). Check: SE of L (2,2): yes, east 1, south 1. NE of M: yes, 3,1 is NE of (0,0). Okay.
+
+Next: "Molly's Music Store is to the Northeast of Miniature Golf Mart. Molly's Music Store is to the Southeast of Lynx's Luxury Linens."
+
+Mm is NE of M (so x>0, y>0) and SE of L (so x > Lx, y < Ly). Since L is at (2,2), SE of L would be x >2, y <2. So Mm must be in (x>2, y<2). But also NE of M, which it is as long as x>0, y>0. So Mm is in that same area as Ba? Wait, but Ba is also SE of L and NE of M. Wait, but the problem says "Basil's Bakery is to the Southeast of Lynx's Luxury Linens. Basil's Bakery is to the Northeast of Miniature Golf Mart." So Ba is SE of L and NE of M. Similarly, Mm is NE of M and SE of L. So both Ba and Mm are in the SE of L and NE of M. But they might be in different positions.
+
+Wait, but let's see. Let's assign coordinates. Let's say Mm is at (3, 0.5). So SE of L (2,2): x=3>2, y=0.5<2. NE of M: yes. Ba we had at (3,1). So both Ba and Mm are east of L and south of L, but different y's.
+
+Wait, but the problem also says: "Iris's Ice Skates is to the Northwest of Basil's Bakery. Iris's Ice Skates is to the Northwest of Molly's Music Store."
+
+So I is NW of Ba and NW of Mm.
+
+NW of Ba (3,1) would be x <3, y >1.
+
+NW of Mm (3,0.5) would be x <3, y >0.5.
+
+So I has to be in x <3, y >1 (since NW of Ba requires y >1, as NW is up and left; so y-coordinate higher than Ba's y=1).
+
+So let's say I is at (2, 2). Wait, but L is at (2,2). Wait, can't have two places at same coordinates. Hmm. Let's adjust.
+
+Wait, L is at (2,2), so if I is NW of Ba (3,1), then I's x <3, y >1. Let's pick I at (1, 2). Check: NW of Ba (3,1): from Ba, going west 2 and north 1, yes. NW of Mm (3,0.5): same x difference west 2, north 1.5. So yes. So I is at (1,2).
+
+Wait, but L is at (2,2). So I is at (1,2), which is west of L, same y. So I is NW of Ba and Mm, and at (1,2).
+
+Wait, but let's confirm all positions:
+
+M: (0,0)
+
+L: (2,2)
+
+C: (-1,1)
+
+Ba: (3,1)
+
+Mm: (3, 0.5)
+
+I: (1, 2)
+
+Wait, but let's check all the conditions again.
+
+Check "Iris's Ice Skates is to the Northwest of Basil's Bakery." Basil's Bakery is at (3,1). Northwest would be left and up. So I at (1,2): left 2, up 1. Yes, NW. Good.
+
+"Iris's Ice Skates is to the Northwest of Molly's Music Store." Molly's at (3,0.5). Northwest would be left and up. From Mm (3,0.5), moving to (1,2): left 2, up 1.5. Yes, NW. Correct.
+
+Now, let's recap all coordinates:
+
+- M: (0,0)
+
+- L: (2,2)
+
+- C: (-1,1)
+
+- Ba: (3,1)
+
+- Mm: (3, 0.5)
+
+- I: (1, 2)
+
+Wait, but let's check all the other statements again to make sure.
+
+"Craft Creation Station is to the Southwest of Lynx's Luxury Linens." C is at (-1,1), L is at (2,2). Southwest of L would be moving west and south. From L (2,2), west to x=-1 (3 units west), south to y=1 (1 unit south). So yes, SW. Correct.
+
+"Basil's Bakery is to the Southeast of Lynx's Luxury Linens." Ba at (3,1). From L (2,2), east 1, south 1. SE. Correct.
+
+"Basil's Bakery is to the Northeast of Miniature Golf Mart." Ba (3,1) is NE of M (0,0). Yes.
+
+"Molly's Music Store is to the Northeast of Miniature Golf Mart." Mm (3,0.5) is NE of M. Yes.
+
+"Molly's Music Store is to the Southeast of Lynx's Luxury Linens." Mm (3,0.5) is SE of L (2,2): yes.
+
+Now, the question is: How many objects are in the North of Craft Creation Station?
+
+First, what's Craft Creation Station's position? C is at (-1,1).
+
+"North" would be higher y-coordinate. So any object with y >1.
+
+Let's list all objects' y-coordinates:
+
+- M: (0,0) y=0
+
+- L: (2,2) y=2
+
+- C: (-1,1) y=1
+
+- Ba: (3,1) y=1
+
+- Mm: (3, 0.5) y=0.5
+
+- I: (1,2) y=2
+
+So objects north of C (y >1) would be those with y >1. So L and I have y=2, which is greater than 1. Ba and C have y=1, so not north (assuming north means strictly north, not same latitude). M and Mm have lower y.
+
+Wait, but the question says "in the North of Craft Creation Station". Typically, "north of" would mean strictly north, so higher y. So objects with y > C's y (which is 1). So L and I.
+
+Wait, but let's confirm:
+
+C is at (-1,1). North would be increasing y. So any point with y >1 is north of C.
+
+L is at (2,2): y=2 >1: north.
+
+I is at (1,2): y=2 >1: north.
+
+Ba is at (3,1): same y=1: not north.
+
+M, Mm: lower y.
+
+So that's two objects: L and I.
+
+Wait, but the answer choices are A.4, B.1, C.3, D.0. Hmm, two isn't an option. Did I make a mistake?
+
+Wait, maybe I messed up the coordinate assignments. Let's check again.
+
+Wait, perhaps I made a wrong assumption in coordinates. Let's see.
+
+Wait, another way: Maybe the problem uses a different orientation. Sometimes in some contexts, "north" could be considered as the top of the map, so higher y, which is what I did.
+
+But according to my coordinates, there are two objects north of C: L and I.
+
+But since 2 isn't an option, that suggests I made an error.
+
+Wait, let's check again all the positions.
+
+Wait, let's re-express all the relationships without coordinates.
+
+Start with M (Miniature Golf Mart).
+
+- L is NE of M.
+
+- C is NW of M.
+
+- C is SW of L.
+
+So, if L is NE of M, and C is SW of L, then C must be southwest of L, which is northeast of M. So C is southwest of L, so to get from L to C, you go SW, which would place C in a position that's west and south of L, but since C is also NW of M, which is southwest of L? Wait, maybe drawing a rough sketch.
+
+Imagine M at the center.
+
+NE of M is L.
+
+NW of M is C.
+
+So C is northwest of M, L is northeast of M. Then, C is southwest of L. So if L is NE of M, then SW of L would be towards M and beyond. Wait, if L is NE of M, then SW of L would be towards the direction of M and further south-west. So C is located SW of L and also NW of M. So the position of C must be such that from L, moving SW reaches C, and from M, moving NW reaches C.
+
+So the relative positions: M is southeast of C (since C is NW of M), and L is northeast of C (since C is SW of L).
+
+Wait, perhaps another way: The four cardinal directions.
+
+If C is SW of L, then L is NE of C.
+
+Similarly, C is NW of M, so M is SE of C.
+
+So for the question: objects north of C. So any object that is north of C.
+
+We have L is NE of C (since C is SW of L), so L is north of C (and east). So L is north of C.
+
+Similarly, I is NW of Ba and NW of Mm. Let's see where I is.
+
+But according to my coordinate system, I was at (1,2), which is north of C (-1,1), since y=2 >1.
+
+Wait, but according to the problem statements, is there another object north of C?
+
+Wait, let's list all objects again:
+
+Objects:
+
+- M: Miniature Golf Mart
+
+- L: Lynx's Luxury Linens
+
+- C: Craft Creation Station
+
+- Ba: Basil's Bakery
+
+- Mm: Molly's Music Store
+
+- I: Iris's Ice Skates
+
+So total six objects. Wait, but the question is about how many are north of C.
+
+From my coordinates:
+
+- L is north of C
+
+- I is north of C
+
+- Ba is same latitude (y=1), M and Mm are south.
+
+Wait, but according to the problem's description, are there any others?
+
+Wait, let's verify Iris's position again.
+
+"Iris's Ice Skates is to the Northwest of Basil's Bakery. Iris's Ice Skates is to the Northwest of Molly's Music Store."
+
+So I is NW of Ba and NW of Mm.
+
+In my coordinates, Ba was at (3,1), Mm at (3,0.5). So NW of both would be left and up from both. So yes, I at (1,2) is NW of both.
+
+Now, L is at (2,2), which is east of I (1,2), same y. So I is west of L, same latitude.
+
+But both I and L are north of C.
+
+Wait, but according to this, two objects. But answer choices don't have two. Hmm. Maybe I messed up the coordinate assignments.
+
+Wait, let's try a different coordinate system.
+
+Alternative approach: Let's use a grid where we fix M at (0,0).
+
+Define directions:
+
+- North: up (positive y)
+
+- South: down (negative y)
+
+- East: right (positive x)
+
+- West: left (negative x)
+
+So:
+
+- L is NE of M: so L is at (a, b), a >0, b >0.
+
+- C is NW of M: so C is at (-c, d), c >0, d >0.
+
+- C is SW of L: so from L, moving SW (which is west and south), so C's coordinates = L's coordinates minus (east, north), i.e., C_x = L_x - e, C_y = L_y - f, where e,f >0.
+
+Thus:
+
+-c = a - e --> e = a + c
+
+d = b - f --> f = b - d
+
+Since f >0, b > d.
+
+Also, since C is NW of M, d >0 (north of M), so 0 < d < b.
+
+Let's assign specific numbers.
+
+Let’s set M at (0,0).
+
+Let’s choose L at (2, 2) (NE of M).
+
+C must be SW of L, so C_x < L_x=2, C_y < L_y=2.
+
+Also, C is NW of M, so C_x <0, C_y >0.
+
+So C must be at x <0, y between 0 and 2.
+
+Let’s pick C at (-1, 1). So:
+
+C_x = -1 = 2 - e --> e = 3
+
+C_y = 1 = 2 - f --> f =1
+
+So that works.
+
+Now, Basil's Bakery (Ba) is SE of L and NE of M.
+
+SE of L (2,2): x >2, y <2.
+
+NE of M (0,0): x >0, y >0. Which is satisfied since x>2>0, y<2 but still >0 (assuming y>0).
+
+Let's put Ba at (3,1).
+
+Molly's Music Store (Mm) is NE of M and SE of L.
+
+Similarly, SE of L: x >2, y <2.
+
+So Mm could be at (3, 0.5), for example.
+
+Now, Iris's Ice Skates (I) is NW of Ba and NW of Mm.
+
+NW of Ba (3,1): x <3, y >1.
+
+NW of Mm (3,0.5): x <3, y >0.5.
+
+So I must be at x <3, y >1 (since y >1 satisfies y >0.5).
+
+Let's put I at (1, 2).
+
+Now, check all:
+
+- I (1,2) is NW of Ba (3,1): yes, left 2, up 1.
+
+- I (1,2) is NW of Mm (3,0.5): left 2, up 1.5. Yes.
+
+Now, list all y-coordinates:
+
+- M: 0
+
+- L: 2
+
+- C:1
+
+- Ba:1
+
+- Mm:0.5
+
+- I:2
+
+So north of C (y >1): L (2) and I (2). So two objects.
+
+But answer choices don't have 2. Options are 4,1,3,0. Hmm. This suggests an error in my reasoning.
+
+Wait, maybe "north" is defined as directly north, not just any north (i.e., same x-coordinate). But the question says "in the North", which usually means any position north, regardless of east/west.
+
+Wait, but maybe I misread the problem. Let's check again.
+
+Wait, the problem says: "How many objects are in the North of Craft Creation Station?"
+
+If "in the North" meant due north, i.e., the same longitude (x) with a higher latitude (y), the count could differ; but I think in map terms, "north of" usually means having a higher latitude (y), regardless of x.
+
+But according to the problem's given data, perhaps my coordinate assignments are wrong.
+
+Wait, let's try to see if there's another way. Maybe I placed I incorrectly.
+
+Wait, another thought: When it says "Iris's Ice Skates is to the Northwest of Basil's Bakery", which is SE of L. So Basil's Bakery is SE of L, so Iris is NW of Ba, so Iris is NW of a point that's SE of L. So Iris could be in a position relative to L.
+
+Wait, let's try to map all the positions without coordinates.
+
+Start with M.
+
+- L is NE of M.
+
+- C is NW of M.
+
+- C is SW of L.
+
+So imagine M in the center.
+
+L is up-right (NE) of M.
+
+C is up-left (NW) of M.
+
+And C is down-left (SW) from L.
+
+So from L, going SW leads to C. So L is NE of C.
+
+Similarly, M is SE of C (since C is NW of M).
+
+Now, Ba is SE of L and NE of M.
+
+So SE of L would be further right and down from L.
+
+NE of M is up-right from M.
+
+So Ba is located to the right and down from L, but still right and up from M.
+
+Similarly, Mm is SE of L and NE of M, so same general area as Ba, but maybe different position.
+
+I is NW of Ba and NW of Mm.
+
+So I is left and up from both Ba and Mm.
+
+Now, to find objects north of C.
+
+C has L to its NE (since C is SW of L), so L is north of C.
+
+Also, I is NW of Ba and Mm. If I is north of C, depending on position.
+
+Wait, but according to my coordinate system, yes. But answer isn't there.
+
+Wait, maybe the error is in assuming I is north of C. Let's see.
+
+In my coordinate system, C is at (-1,1), I is at (1,2). So I is east and north of C. So yes, north of C.
+
+L is at (2,2), which is east and north of C.
+
+So both L and I are north of C.
+
+But answer options don't have 2. So either the question has a typo, or my analysis is wrong.
+
+Wait, let's check if "Craft Creation Station" is C, and whether there's another object I missed.
+
+Wait, the objects listed are:
+
+- Miniature Golf Mart
+
+- Lynx's Luxury Linens
+
+- Craft Creation Station
+
+- Basil's Bakery
+
+- Molly's Music Store
+
+- Iris's Ice Skates
+
+That's six objects. We've accounted for all.
+
+Wait, but let's check again the problem statement for any missed clues.
+
+Wait: "Craft Creation Station is to the Southwest of Lynx's Luxury Linens." So C is SW of L, meaning L is NE of C.
+
+"Basil's Bakery is to the Southeast of Lynx's Luxury Linens. Basil's Bakery is to the Northeast of Miniature Golf Mart."
+
+"Molly's Music Store is to the Northeast of Miniature Golf Mart. Molly's Music Store is to the Southeast of Lynx's Luxury Linens."
+
+So both Ba and Mm are SE of L and NE of M.
+
+"Iris's Ice Skates is to the Northwest of Basil's Bakery. Iris's Ice Skates is to the Northwest of Molly's Music Store."
+
+So I is NW of both Ba and Mm.
+
+Now, to find north of C.
+
+We have L is NE of C (since C is SW of L), so L is north of C.
+
+Is there any other object north of C?
+
+Let's consider the positions:
+
+- M is SE of C (since C is NW of M), so M is south-east of C.
+
+- Ba is SE of L, which is NE of C. So Ba's position relative to C: L is NE of C, Ba is SE of L, so Ba is east and south of L, which is NE of C. So Ba could be east of C or west? Let's see.
+
+If C is at some point, L is NE of C. Ba is SE of L, so Ba is east and south of L. So from C, going NE to L, then SE to Ba: so Ba would be east of C (since L is east of C, and Ba is east of L), and south of L but possibly north or south of C.
+
+In our coordinate system, C was at (-1,1), L at (2,2), Ba at (3,1). So Ba's y=1, same as C. So Ba is not north of C.
+
+Mm is at (3,0.5), also south of C.
+
+I is at (1,2), which is north of C (y=2 >1).
+
+So only L and I.
+
+Wait, unless... Wait, is there a mistake in considering I's position?
+
+Wait, "Iris's Ice Skates is to the Northwest of Basil's Bakery." If Basil's Bakery is at (3,1), then Northwest would be x <3, y >1. So I must be north of Ba's y-coordinate. Ba's y=1, so I's y >1.
+
+Similarly, I is NW of Mm, which is at y=0.5, so I's y >0.5, but the stricter condition is y >1.
+
+Therefore, I has y >1, so north of C (y=1).
+
+L is at y=2, north of C.
+
+So two objects. But since 2 isn't an option, maybe the question considers "north" as directly north, same x-coordinate.
+
+In that case, are there any objects with the same x-coordinate as C and y >1?
+
+C is at (-1,1). Is there any object at x=-1, y>1?
+
+In our coordinate system, no. L is at (2,2), I at (1,2), etc. So no objects directly north. But the question says "in the North", not "directly north", so that shouldn't matter.
+
+Wait, but maybe I assigned the coordinates incorrectly. Let's try another arrangement.
+
+Alternative coordinate system:
+
+Let me place C at (0,0) to see if that helps.
+
+Wait, but maybe better to use a different method.
+
+Let's create a grid.
+
+Assume Craft Creation Station (C) is at the origin for simplicity.
+
+We need to find objects north of C, so with higher y.
+
+Given:
+
+- C is NW of M, so M is SE of C.
+
+- C is SW of L, so L is NE of C.
+
+Therefore, L is NE of C, so L is north and east of C.
+
+So L is north of C.
+
+Now, what about other objects.
+
+Basil's Bakery (Ba): SE of L and NE of M.
+
+Since L is NE of C, SE of L would be further east and south from L.
+
+But M is SE of C (since C is NW of M), so M is east and south of C.
+
+Ba is NE of M, so Ba is north and east of M.
+
+So relative to C:
+
+- C is origin.
+
+- L is NE of C.
+
+- M is SE of C.
+
+- Ba is NE of M, so Ba is east and north of M, which is east and south of C. So Ba could be east of C, north of M (which is south of C), so Ba might be north of C or not.
+
+Wait, this is getting confusing. Let's try.
+
+Let’s define C at (0,0).
+
+L is NE of C, so L = (a,b), a>0, b>0.
+
+M is SE of C (since C is NW of M), so M = (c,-d), c>0, d>0.
+
+Ba is SE of L and NE of M.
+
+SE of L: (a+e, b-f), e>0, f>0.
+
+NE of M: (c+g, -d+h), g>0, h>0.
+
+So Ba must satisfy both.
+
+Also, since Ba is SE of L, its coordinates are east and south of L.
+
+And NE of M, so east and north of M.
+
+This might not help. Let's use the first coordinate system where the answer came to two, but since that's not an option, perhaps the error is elsewhere.
+
+Wait, going back to the problem statement:
+
+"Craft Creation Station is to the Southwest of Lynx's Luxury Linens."
+
+So C is SW of L → L is NE of C.
+
+"Basil's Bakery is to the Southeast of Lynx's Luxury Linens."
+
+Ba is SE of L.
+
+"Molly's Music Store is to the Southeast of Lynx's Luxury Linens."
+
+Mm is SE of L.
+
+So both Ba and Mm are SE of L, meaning they are south and east of L.
+
+L is NE of C.
+
+Therefore, Ba and Mm are SE of L, so they are south of L, which is north of C. So Ba and Mm could be north or south of C.
+
+Wait, if L is north of C, and Ba is south of L, Ba could still be north of C if it's not too far south.
+
+Similarly for Mm.
+
+In my coordinate system, L was at (2,2), C at (-1,1). So L is north-east of C.
+
+Ba at (3,1): y=1, same as C's y=1. So not north.
+
+But if Ba were at (3,1.5), then y=1.5 >1, so north of C.
+
+Ah! Here's the mistake. I assumed Ba's y-coordinate was 1, but it could be higher.
+
+Wait, let's revisit the placement of Ba.
+
+Basil's Bakery is SE of L and NE of M.
+
+If L is at (2,2), SE of L means x >2 and y <2.
+
+NE of M (0,0) means x >0, y >0.
+
+So Ba's y can be between 0 and 2. If we place Ba at y=1.5, then it would be north of C (which is at y=1).
+
+Similarly, Mm is SE of L (y <2), so could have y=1.5 as well.
+
+Wait, this changes things.
+
+Let's correct this.
+
+Let's redo coordinates with more care.
+
+Set M at (0,0).
+
+L is NE of M: let's say L=(2,3) [higher y to allow room].
+
+C is NW of M: C=(-1,2) [y=2, which is north of M].
+
+Check C is SW of L: L=(2,3), C=(-1,2). SW from L would be west and south: yes, to x=-1 (3 west), y=2 (1 south). So yes.
+
+Now:
+
+- C=(-1,2)
+
+- M=(0,0)
+
+- L=(2,3)
+
+Basil's Bakery (Ba) is SE of L and NE of M.
+
+SE of L: x >2, y <3.
+
+NE of M: x >0, y >0.
+
+So Ba could be at (3, 2.5), for example.
+
+Molly's Music Store (Mm) is also SE of L and NE of M, so similarly at (3, 1.5), for example.
+
+Now, Iris's Ice Skates (I) is NW of Ba and NW of Mm.
+
+NW of Ba=(3,2.5): x <3, y >2.5.
+
+NW of Mm=(3,1.5): x <3, y >1.5.
+
+So I must have y >2.5.
+
+Let's place I at (1, 3).
+
+Now, list all y-coordinates:
+
+- M: 0
+
+- L: 3
+
+- C: 2
+
+- Ba: 2.5
+
+- Mm: 1.5
+
+- I: 3
+
+Now, objects north of C (y >2):
+
+- L: y=3 >2
+
+- Ba: y=2.5 >2
+
+- I: y=3 >2
+
+So that's three objects: L, Ba, I.
+
+Ah! This changes things. So depending on where we place Ba and Mm, their y-coordinates could be above C's y.
+
+In my first coordinate system, I placed Ba at y=1, level with C (y=1) and therefore not north of it; but in the corrected system, since Ba is SE of L (which is at y=3), Ba only needs y <3 and can still be greater than C's y=2.
+
+So if C is at y=2, and Ba is at y=2.5, then Ba is north of C.
+
+Similarly, I is at y=3, north of C.
+
+L is at y=3, north of C.
+
+So three objects: L, Ba, I.
+
+That matches option C.3.
+
+So where did I go wrong initially? I placed Ba at y=1, which was too low. Since C is at y=2 (in this corrected system), Ba just needs to be south of L (y <3), but can still be north of C (y >2).
+
+Let's verify all conditions with this new coordinate system.
+
+M=(0,0)
+
+L=(2,3) [NE of M]
+
+C=(-1,2) [NW of M: yes, x=-1 <0, y=2 >0; SW of L: from L(2,3), SW to (-1,2) is west 3, south 1: correct]
+
+Ba=(3, 2.5) [SE of L: x=3>2, y=2.5<3; NE of M: x=3>0, y=2.5>0: correct]
+
+Mm=(3,1.5) [SE of L: yes; NE of M: yes]
+
+I=(1,3) [NW of Ba(3,2.5): x=1<3, y=3>2.5; NW of Mm(3,1.5): x=1<3, y=3>1.5: correct]
+
+Now, check y-coordinates:
+
+- C is at y=2.
+
+Objects north of C (y >2):
+
+- L: 3 >2
+
+- Ba: 2.5 >2
+
+- I: 3 >2
+
+That's three objects.
+
+Ba is at 2.5, which is north of C (2). So yes.
+
+So the answer should be C.3.
+
+My initial mistake was placing Ba too far south (y=1), but since Ba only needs to be south of L (y=3), it can be at y=2.5, which is still north of C (y=2).
+
+Similarly, if we consider that "south" from L means y < L's y, but L's y could be higher, allowing Ba to still be north of C.
+
+Let's confirm once more:
+
+C is at (-1,2). So y=2.
+
+Ba at (3,2.5): y=2.5 >2, so north of C.
+
+L at (2,3): north.
+
+I at (1,3): north.
+
+So three objects. That's option C.
+
+Therefore, the correct answer is C.3.
+</think>
+Let me solve this step by step using the structured format:
+<format>
+>>> STEP 1: PARSE RELATIONSHIPS
+    - A is to the DIRECTION of B
+    [... list all given relationships ...]
+
+>>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
+    - Looking for: [target relationship / direction / count]
+    - [reasoning about the relationships]
+    - [use reversibility and transitivity as needed]
+
+>>> STEP 3: ANSWER
+    - [state conclusion]
+
+>>> FINAL ANSWER: [answer text]
+    \boxed{LETTER}
+</format>
+>>> STEP 1: PARSE RELATIONSHIPS
+- Miniature Golf Mart (M) is at a reference point.
+- Lynx's Luxury Linens (L) is NE of M.
+- Craft Creation Station (C) is NW of M.
+- C is SW of L.
+- Basil's Bakery (Ba) is SE of L and NE of M.
+- Molly's Music Store (Mm) is NE of M and SE of L.
+- Iris's Ice Skates (I) is NW of Ba and NW of Mm.
+
+>>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
+Let’s assign coordinates to clarify:
+- Place M at (0,0).
+- L is NE of M ⇒ L = (2, 3).
+- C is NW of M ⇒ C = (-1, 2).
+- C is SW of L ⇒ From L(2,3), SW to C(-1,2) (west 3, south 1) ✅.
+- Ba is SE of L and NE of M ⇒ Ba = (3, 2.5) (x > 2, y < 3, y > 0).
+- Mm is SE of L and NE of M ⇒ Mm = (3, 1.5).
+- I is NW of Ba and NW of Mm ⇒ I = (1, 3) (x < 3, y > 2.5 and y > 1.5).
+
+Now, identify objects **north of C** (C is at y = 2):
+- L = (2, 3) ⇒ y = 3 > 2 ✅
+- Ba = (3, 2.5) ⇒ y = 2.5 > 2 ✅
+- I = (1, 3) ⇒ y = 3 > 2 ✅
+- M = (0, 0), Mm = (3, 1.5) ⇒ y ≤ 2 ❌
+
+Thus, **three objects** are north of C: L, Ba, and I.
+
+>>> STEP 3: ANSWER
+The correct answer is **C. 3**.
+
+>>> FINAL ANSWER: \boxed{C}
\ No newline at end of file
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_374.txt b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_374.txt
new file mode 100644
index 00000000..943bde0e
--- /dev/null
+++ b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_374.txt
@@ -0,0 +1,296 @@
+Okay, let's try to figure out the direction of Jane's Jazz Club relative to Ostrich's Oils. Hmm, first I need to map out all the given information. Let me list out each location and their positions step by step.
+
+Starting with Eagle's Eclectic Emporium. Let's call that E for short. Then, Toucan's Tea is to the Southwest of E. So if E is the center, Toucan's Tea (T) is Southwest of E. Then Jane's Jazz Club (J) is to the Northwest of Toucan's Tea. Also, Jane's Jazz Club is to the Northwest of Eagle's Eclectic Emporium. Wait, so J is Northwest of both T and E? Wait, but T is Southwest of E, so Northwest of E would be a different direction. Let me visualize this.
+
+Let me sketch a coordinate system. Let's assume E is at the origin (0,0). Directions: North is up, East is right, South is down, West is left.
+
+Southwest of E would be down and left. So T is at, say, (-1, -1) if E is (0,0). Then Jane's Jazz Club is Northwest of T. Northwest of T would be up and left from T. So if T is at (-1, -1), then Northwest would be (-2, 0) or something. But also, J is Northwest of E. Northwest of E (0,0) would be (-1, 1). Wait, but the problem says both: J is Northwest of T and Northwest of E. So maybe J is Northwest of E, which would place it at (-x, y), but also Northwest of T. Let's see.
+
+Wait, maybe I should use relative directions step by step.
+
+First, E is the reference. Let's fix E's position. Then:
+
+- Toucan's Tea (T) is Southwest of E. So from E, go SW to get to T.
+
+- Jane's Jazz Club (J) is Northwest of T. So from T, go NW to get to J.
+
+- Also, J is Northwest of E. So from E, go NW to get to J. Wait, so both conditions must hold. So J must be NW of E, and also NW of T. But T is SW of E. So if E is center, T is SW, then NW of T would be towards the center or further NW?
+
+Wait, maybe it's better to draw a grid. Let's assign coordinates.
+
+Let me set E at (0,0).
+
+Southwest direction from E would be negative x and negative y. Let's assume each step is one unit. So T is at (-1, -1) if we consider SW as 45 degrees, but maybe for simplicity, just using the four cardinal directions. Wait, but Southwest is a diagonal, but in grid terms, maybe it's considered as both South and West. Hmm, but maybe the problem uses the standard compass directions where Southwest is a separate direction, not just South and West.
+
+Wait, the problem says "to the Southwest", so it's a single direction, not South and then West. So in terms of coordinates, Southwest would be a direction that's both South and West, so if E is at (0,0), then T is somewhere in the SW quadrant. Similarly, Northwest would be NW quadrant.
+
+But to make it easier, perhaps we can use a grid where:
+
+- North: +y
+
+- East: +x
+
+- South: -y
+
+- West: -x
+
+So Southwest would be -x and -y, Northwest would be -x and +y.
+
+So E is at (0,0).
+
+T is Southwest of E, so T's coordinates are (negative x, negative y). Let's say T is at (-1, -1).
+
+Now, Jane's Jazz Club (J) is Northwest of Toucan's Tea (T). So from T, Northwest direction. Northwest from T would be towards -x and +y. So if T is at (-1, -1), then moving Northwest would be to (-2, 0), for example. But also, J is Northwest of E. Northwest of E (0,0) would be (-x, +y). So J must be in the Northwest quadrant relative to E. So J's coordinates should be (-a, +b) where a and b are positive.
+
+So combining both: J is Northwest of T and Northwest of E.
+
+Let me see. If T is at (-1, -1), then Northwest of T would be moving West and North. So from T's position, going West (more negative x) and North (more positive y). So J could be at (-2, 0), which is Northwest of T (-1, -1), because from T, moving West (x decreases) and North (y increases). Also, (-2, 0) is Northwest of E (0,0)? Wait, Northwest of E would be a point with negative x and positive y. (-2, 0) is on the negative x-axis, so that's West, not Northwest. Hmm, maybe I need to adjust.
+
+Wait, maybe coordinates are better with both x and y positive in NW. Wait, maybe better to think in terms of relative positions.
+
+Alternatively, maybe using a different approach. Let's list all the given relations:
+
+1. T is SW of E.
+
+2. J is NW of T.
+
+3. J is NW of E.
+
+4. O is NW of T. (Ostrich's Oils, O)
+
+5. O is NW of J.
+
+6. M (Marina Market) is SW of E and SW of T.
+
+7. I (Iris's Ice Skates) is SW of E and NW of T.
+
+Wait, the problem says:
+
+"Ostrich's Oils is to the Northwest of Toucan's Tea. Ostrich's Oils is to the Northwest of Jane's Jazz Club."
+
+So O is NW of T and NW of J.
+
+Also, "Iris's Ice Skates is to the Southwest of Eagle's Eclectic Emporium. Iris's Ice Skates is to the Northwest of Toucan's Tea."
+
+So I is SW of E and NW of T.
+
+Wait, this is a bit confusing. Let's parse all the statements:
+
+- Eagle's Eclectic Emporium (E) is the reference.
+
+- Toucan's Tea (T) is SW of E.
+
+- Jane's Jazz Club (J) is NW of T.
+
+- J is NW of E.
+
+- Ostrich's Oils (O) is NW of T.
+
+- O is NW of J.
+
+- Marina Market (M) is SW of E and SW of T.
+
+- Iris's Ice Skates (I) is SW of E and NW of T.
+
+Wait, so I is SW of E and also NW of T. But T is SW of E. So if T is SW of E, then NW of T would be towards E and beyond. Let's try to place these.
+
+Let me try to create a mental map.
+
+Assume E is in the center.
+
+- T is SW of E.
+
+- J is NW of T and also NW of E. So since T is SW of E, moving NW from T would get towards E's NW direction. So J is NW of E, which is consistent.
+
+- O is NW of T and NW of J. So O is further NW than J.
+
+Wait, if O is NW of J, then J is SE of O.
+
+Wait, the question is: In which direction is Jane's Jazz Club relative to Ostrich's Oils? So we need to find J's direction from O.
+
+If O is NW of J, then J is SE of O. So the direction from O to J is Southeast. Wait, but the question is "Jane's Jazz Club relative to Ostrich's Oils", meaning from O's perspective, where is J? If O is NW of J, then J is SE of O. So the direction would be Southeast. But let's confirm.
+
+Wait, "Ostrich's Oils is to the Northwest of Jane's Jazz Club" means O is NW of J. Therefore, J is SE of O. So the direction of J relative to O is Southeast. So the answer would be A. Southeast.
+
+But let's make sure by mapping all points.
+
+Let me try to assign coordinates.
+
+Let’s place E at (0,0).
+
+T is SW of E. Let's assign T to (-1, -1).
+
+Now, J is NW of T. So from T (-1,-1), NW would be towards decreasing x and increasing y. Let's say J is at (-2, 0). Wait, but J must also be NW of E. NW of E (0,0) would be in the second quadrant (x negative, y positive). So (-2, 0) is on the negative x-axis, which is West, not Northwest. Hmm, maybe better to have J at (-1, 1), which is NW of E. Let's check if J (-1,1) is NW of T (-1,-1). From T (-1,-1), moving North (up) to (-1,1) would be directly North, not Northwest. Northwest would require moving West and North. So to go NW from T, you need to decrease x and increase y. So if T is at (-1,-1), then moving NW would go to (-2, 0), for example. But then J at (-2, 0) is NW of T, but is it NW of E?
+
+NW of E (0,0) would be x negative and y positive. (-2, 0) has y=0, so it's West, not Northwest. So maybe the problem doesn't require strict diagonals, just the general direction. Maybe the problem uses "Northwest" to mean North and West, but not necessarily both. Wait, but in standard terms, Northwest is a specific direction between North and West.
+
+This is getting confusing. Maybe another approach. Let's use relative positions without coordinates.
+
+We have:
+
+- E is the main point.
+
+- T is SW of E.
+
+- J is NW of T and also NW of E. So J is in the NW direction from E, and also NW of T. Since T is SW of E, the NW direction from T would overlap with the NW of E. So J is located NW of E, and T is SW of E, so J is above and to the left of E, T is below and to the left of E.
+
+- O is NW of T and NW of J. So O is further NW than J.
+
+Wait, if O is NW of J, then O is more to the Northwest than J. So the layout is: T is SW of E, J is NW of both E and T, and O is NW of J. In other words, O is further NW than J.
+
+Wait, but J is NW of E, so O is even further NW than J.
+
+Now, the question is: J relative to O. If O is NW of J, then J is SE of O. So direction is Southeast. So answer A.
+
+But let's check the other points to make sure.
+
+Marina Market (M) is SW of E and SW of T. Since T is SW of E, SW of T would be further SW. So M is SW of both.
+
+Iris's Ice Skates (I) is SW of E and NW of T. Wait, I is SW of E, so like T is SW of E, but I is also SW of E. But I is NW of T. So T is SW of E, so NW of T would be towards E. So I is SW of E (so like, between E and T?), but NW of T. Wait, this might be a bit conflicting.
+
+Wait, if T is SW of E, then NW of T would be towards E's direction. So if I is NW of T, then I is closer to E than T is. But I is also SW of E. Wait, that seems contradictory. Wait, SW of E would be in the SW direction from E, but NW of T (which is SW of E) would be towards the North from T, so towards E's direction.
+
+Wait, maybe I is located such that from E, it's SW, and from T, it's NW. So imagine E at center. T is SW of E. I is SW of E, so also in SW direction from E, but closer to E than T? Or further?
+
+Wait, if T is SW of E, then to be NW of T, you have to go North and West from T. But if I is SW of E, then it's South and West from E. Hmm. Let's say E is (0,0), T is at (-2, -2) (SW). Then NW of T would be (-3, -1), but that's still SW of E. Wait, maybe coordinates are better.
+
+Let me try again with coordinates.
+
+Set E at (0,0).
+
+Define:
+
+- South: negative y
+
+- North: positive y
+
+- West: negative x
+
+- East: positive x
+
+So:
+
+- SW: negative x and negative y.
+
+- NW: negative x and positive y.
+
+So:
+
+T is SW of E: T = (-a, -b), where a,b >0.
+
+J is NW of T: so from T, go NW, so subtract from x (more west) and add to y (more north). So J = (-a - c, -b + d), where c,d >0.
+
+But J is also NW of E: so J must have negative x and positive y. So -a -c <0 (which it is, since a,c>0) and -b + d >0 → d > b. So J's y-coordinate is positive.
+
+Similarly, O is NW of T and NW of J.
+
+O is NW of T: so O = (-a - e, -b + f), e,f >0.
+
+O is NW of J: so O must be more NW than J. So O's x < J's x (more west), and O's y > J's y (more north).
+
+So if J is at (-a -c, -b + d), then O is at (-a -c - e, -b + d + f), e,f >0.
+
+Therefore, O is further NW than J.
+
+Therefore, J is to the SE of O.
+
+Hence, Jane's Jazz Club (J) is Southeast of Ostrich's Oils (O). So the direction is Southeast, answer A.
+
+Wait, but let's check the other points to confirm.
+
+Iris's Ice Skates (I) is SW of E and NW of T.
+
+SW of E: so I = (-g, -h), g,h >0.
+
+NW of T: T is (-a, -b), so NW of T would be (-a - i, -b + j), i,j >0.
+
+So I must equal (-a -i, -b + j). But also, I is SW of E: (-g, -h), so -g = -a -i → g = a +i, and -h = -b + j → h = b - j.
+
+Since h >0 (as it's SW of E), then b - j >0 → j < b.
+
+So j is positive but less than b. So I's y-coordinate is -h = -(b - j) = -b + j, which is greater than -b (since j >0), so I is North of T (since T's y is -b, I's y is -b + j > -b), which makes sense for NW of T (since NW from T would be North and West).
+
+So I is located NW of T (which is SW of E), so I is between T and E in the NW direction, but still SW of E.
+
+But maybe this detail isn't necessary for the question.
+
+Back to the main question: J relative to O.
+
+Since O is NW of J, J is SE of O. So answer A. Southeast.
+
+Wait, but let's make sure there's no mistake here.
+
+The problem states: "Ostrich's Oils is to the Northwest of Jane's Jazz Club." So O is NW of J → J is SE of O. Yes, that's correct.
+
+Therefore, the direction from O to J is Southeast. So Jane's Jazz Club is Southeast of Ostrich's Oils. So the answer should be A.
+
+Wait, but let's check the options:
+
+A. Southeast
+
+B. Northwest
+
+C. Southwest
+
+D. Northeast
+
+Yes, A is Southeast.
+
+But wait, another way to think: If O is NW of J, then to get from O to J, you go SE. So J is SE of O. So the direction of J relative to O is Southeast. So answer A.
+
+I think that's correct. Let me just make sure there's no other interpretation.
+
+Wait, the problem says "Ostrich's Oils is to the Northwest of Jane's Jazz Club." So O is NW of J. So J is SE of O. So the answer is A.
+
+Yes, that seems right. I don't see any contradictions in the other points affecting this. The other locations (M, I) are probably there to distract or for other questions, but for this specific question, the key is O is NW of J, so J is SE of O.
+</think>
+Let me solve this step by step using the structured format:
+<format>
+>>> STEP 1: PARSE RELATIONSHIPS
+    - A is to the DIRECTION of B
+    [... list all given relationships ...]
+
+>>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
+    - Looking for: [target relationship / direction / count]
+    - [reasoning about the relationships]
+    - [use reversibility and transitivity as needed]
+
+>>> STEP 3: ANSWER
+    - [state conclusion]
+
+>>> FINAL ANSWER: [answer text]
+    \boxed{LETTER}
+</format>
+>>> STEP 1: PARSE RELATIONSHIPS
+- Eagle's Eclectic Emporium (E)
+- Toucan's Tea (T) is SW of E
+- Jane's Jazz Club (J) is NW of T
+- J is NW of E
+- Ostrich's Oils (O) is NW of T
+- O is NW of J
+- Marina Market (M) is SW of E and SW of T
+- Iris's Ice Skates (I) is SW of E and NW of T
+
+>>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
+We're asked: **In which direction is Jane's Jazz Club (J) relative to Ostrich's Oils (O)?**
+
+From the given:
+> "Ostrich's Oils is to the Northwest of Jane's Jazz Club."
+
+This means **O is NW of J**.
+
+In directional terms:
+- If O is **NW of J**, then J is **SE of O**.
+
+This is a direct reversal of the stated relationship.
+
+Let's verify using reversibility of the stated relationship:
+- O is NW of J → J is SE of O.
+
+No other relationships contradict this, as the other points (T, E, M, I) are not relevant to the J-O relationship.
+
+>>> STEP 3: ANSWER
+The direction of Jane's Jazz Club **relative to Ostrich's Oils** is **Southeast**.
+
+>>> FINAL ANSWER: 
+\boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_749.txt b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_749.txt
new file mode 100644
index 00000000..82e2f2c9
--- /dev/null
+++ b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_749.txt
@@ -0,0 +1,622 @@
+First, I need to figure out the positions of all these objects based on the given directions. The question is: "Which object is in the Northwest of Ocelot's Office Ornaments?" with options A. Cheetah's Chocolates, B. Frost Funhouse, C. Cobbler's Corner, D. Miniature Golf Mart.
+
+I should start by listing all the objects mentioned:
+
+- Frost Funhouse
+
+- Ocelot's Office Ornaments
+
+- Miniature Golf Mart
+
+- Lighthouse Luminary
+
+- Cobbler's Corner
+
+- Cheetah's Chocolates
+
+Now, let's parse the directional clues one by one.
+
+1. "Ocelot's Office Ornaments is to the Northeast of Frost Funhouse."
+
+   So, Ocelot's Office Ornaments (let's call it OO for short) is northeast of Frost Funhouse (FF). That means FF is southwest of OO.
+
+2. "Miniature Golf Mart is to the Northwest of Ocelot's Office Ornaments."
+
+   Miniature Golf Mart (MGM) is northwest of OO. So, OO is southeast of MGM.
+
+3. "Miniature Golf Mart is to the Northwest of Frost Funhouse."
+
+   MGM is northwest of FF. So, FF is southeast of MGM.
+
+4. "Lighthouse Luminary is to the Northeast of Frost Funhouse."
+
+   Lighthouse Luminary (LL) is northeast of FF. So, FF is southwest of LL.
+
+5. "Lighthouse Luminary is to the Northeast of Miniature Golf Mart."
+
+   LL is northeast of MGM. So, MGM is southwest of LL.
+
+6. "Cobbler's Corner is to the Southeast of Miniature Golf Mart."
+
+   Cobbler's Corner (CC) is southeast of MGM. So, MGM is northwest of CC.
+
+7. "Cobbler's Corner is to the Southwest of Lighthouse Luminary."
+
+   CC is southwest of LL. So, LL is northeast of CC.
+
+8. "Cheetah's Chocolates is to the Northeast of Ocelot's Office Ornaments."
+
+   Cheetah's Chocolates (CCO) is northeast of OO. So, OO is southwest of CCO.
+
+9. "Cheetah's Chocolates is to the Southeast of Miniature Golf Mart."
+
+   CCO is southeast of MGM. So, MGM is northwest of CCO.
+
+Now, I need to find which object is northwest of OO. So, I need to find something that is directly northwest of OO.
+
+From clue 2, MGM is northwest of OO. So, Miniature Golf Mart is northwest of Ocelot's Office Ornaments.
+
+But let me confirm if there are others or if this is consistent.
+
+Also, option D is Miniature Golf Mart, so that might be it.
+
+But I should make sure about the positions.
+
+Let me try to sketch a small grid mentally.
+
+Assume Frost Funhouse is at some point. Let's set FF at the origin for simplicity.
+
+Set FF at (0,0).
+
+Now, from clue 1: OO is northeast of FF. Northeast means both x and y increase, so let's say OO is at (1,1) for simplicity, but since it's relative, I can use directions.
+
+To make it easier, I'll use a coordinate system where north is positive y, east is positive x.
+
+So, let FF be at (0,0).
+
+- OO is northeast of FF, so OO has higher x and higher y. Say OO is at (a,b) with a>0, b>0.
+
+But I don't need exact values, just relative positions.
+
+From clue 3: MGM is northwest of FF.
+
+Northwest means lower x and higher y? Let's define:
+
+- North: increase y
+
+- East: increase x
+
+- South: decrease y
+
+- West: decrease x
+
+So:
+
+- Northeast: increase x, increase y
+
+- Northwest: decrease x, increase y
+
+- Southeast: increase x, decrease y
+
+- Southwest: decrease x, decrease y
+
+FF at (0,0)
+
+Clue 1: OO is northeast of FF → OO has x > 0, y > 0
+
+Clue 3: MGM is northwest of FF → MGM has x < 0, y > 0 (since northwest: west is negative x, north is positive y)
+
+Clue 2: MGM is northwest of OO → so from OO, MGM is west and north? Northwest of OO means from OO, moving northwest to get to MGM.
+
+If MGM is northwest of OO, that means MGM is west and north of OO.
+
+So, if OO is at (x_o, y_o), then MGM is at (x_o - dx, y_o + dy) for some dx, dy >0.
+
+Similarly, from clue 3, MGM is northwest of FF, so if FF is at (0,0), MGM is at (x_m, y_m) with x_m < 0, y_m > 0.
+
+Now, clue 4: LL is northeast of FF → LL has x > 0, y > 0
+
+Clue 5: LL is northeast of MGM → so LL has higher x and higher y than MGM.
+
+Clue 6: CC is southeast of MGM → so CC has higher x and lower y than MGM.
+
+Clue 7: CC is southwest of LL → so CC has lower x and lower y than LL.
+
+Clue 8: CCO is northeast of OO → so CCO has higher x and higher y than OO.
+
+Clue 9: CCO is southeast of MGM → so CCO has higher x and lower y than MGM.
+
+Now, I need to find what is northwest of OO.
+
+From clue 2, MGM is northwest of OO, so MGM should be northwest of OO.
+
+But let me see if this is consistent with other clues.
+
+Also, option D is Miniature Golf Mart, which is MGM.
+
+But let me check if there's any other possibility.
+
+The question is "which object is in the Northwest of Ocelot's Office Ornaments", and clue 2 directly says MGM is northwest of OO.
+
+So, it should be D. Miniature Golf Mart.
+
+But let me make sure about the other options.
+
+Option A is Cheetah's Chocolates. Clue 8 says CCO is northeast of OO, so it's not northwest; it's northeast.
+
+Option B is Frost Funhouse. From clue 1, OO is northeast of FF, so FF is southwest of OO, not northwest.
+
+Option C is Cobbler's Corner. Now, where is CC relative to OO?
+
+I need to find the position of CC.
+
+From clue 6: CC is southeast of MGM.
+
+From clue 7: CC is southwest of LL.
+
+But I need relation to OO.
+
+Also, from clue 5, LL is northeast of MGM.
+
+And clue 4, LL is northeast of FF.
+
+Now, let me try to place all points.
+
+Set FF at (0,0).
+
+Since MGM is northwest of FF, let's say MGM is at (-1, 1) for simplicity. (x negative, y positive)
+
+Now, OO is northeast of FF, so OO has x>0, y>0. But from clue 2, MGM is northwest of OO.
+
+MGM is at (-1,1), and it's northwest of OO, so OO must be southeast of MGM.
+
+Since MGM is northwest of OO, OO is southeast of MGM.
+
+So, if MGM is at (-1,1), then OO is southeast, so OO has x > -1 and y < 1? Southeast means increase x, decrease y.
+
+So, OO should be at (x_o, y_o) with x_o > -1 (since east of MGM), and y_o < 1 (since south of MGM).
+
+But also, from clue 1, OO is northeast of FF, and FF is at (0,0), so OO has x_o > 0, y_o > 0.
+
+So, OO has x_o > 0, y_o > 0, and since it's southeast of MGM at (-1,1), x_o > -1 (which is true since x_o >0), and y_o < 1.
+
+y_o < 1 and y_o > 0.
+
+Now, clue 4: LL is northeast of FF, so LL has x >0, y>0.
+
+Clue 5: LL is northeast of MGM.
+
+MGM is at (-1,1), so LL is northeast of MGM, meaning x_ll > -1, y_ll > 1.
+
+But since LL is northeast of FF, x_ll >0, y_ll >0, so y_ll >1 is consistent.
+
+Now, clue 6: CC is southeast of MGM.
+
+MGM at (-1,1), so CC is southeast, so x_cc > -1, y_cc < 1.
+
+Clue 7: CC is southwest of LL.
+
+So, CC has x_cc < x_ll, y_cc < y_ll.
+
+Now, clue 8: CCO is northeast of OO.
+
+OO is at (x_o, y_o) with x_o >0, y_o <1, y_o >0.
+
+CCO has x_cco > x_o, y_cco > y_o.
+
+Clue 9: CCO is southeast of MGM.
+
+MGM at (-1,1), so CCO is southeast of MGM, meaning x_cco > -1, y_cco < 1.
+
+Now, back to finding northwest of OO.
+
+We have MGM at (-1,1), OO at (x_o, y_o) with x_o >0, y_o <1.
+
+Since MGM is northwest of OO, and OO is southeast of MGM, the vector from OO to MGM is northwest, so from OO, going northwest gets to MGM.
+
+So, MGM is directly northwest of OO, as per clue 2.
+
+Now, is there any other object that could be northwest of OO?
+
+For example, Frost Funhouse: FF is at (0,0). OO is northeast of FF, so FF is southwest of OO, not northwest.
+
+Cheetah's Chocolates: CCO is northeast of OO, so it's not northwest.
+
+Cobbler's Corner: CC is southeast of MGM, and MGM is northwest of OO, so CC might be southeast of OO or something.
+
+Let's see the position.
+
+MGM is northwest of OO.
+
+CC is southeast of MGM.
+
+Since MGM is northwest of OO, and CC is southeast of MGM, then CC could be southwest of OO or something.
+
+For example, from OO, to go to MGM, you go northwest.
+
+Then from MGM to CC, you go southeast.
+
+So, from OO to CC, it might not be direct.
+
+But specifically, is CC northwest of OO? Probably not.
+
+Let's assign coordinates.
+
+Set FF at (0,0).
+
+Set MGM at (-1,1) as I thought.
+
+Now, OO is southeast of MGM, and northeast of FF.
+
+So, OO must have x > -1 and y < 1 (from being southeast of MGM), and x >0, y>0 (from northeast of FF).
+
+So, let's say OO is at (0.5, 0.5) for example. (x=0.5>0, y=0.5<1 and >0)
+
+Now, LL is northeast of FF, so x>0, y>0, and northeast of MGM at (-1,1), so x > -1, y >1.
+
+Also, since LL is northeast of FF, but FF is (0,0), so y>0 is fine, but y>1 from above.
+
+So LL has y>1.
+
+Say LL is at (1,2), for example.
+
+Now, CC is southeast of MGM. MGM at (-1,1), so southeast: x > -1, y <1.
+
+Also, CC is southwest of LL. LL at (1,2), so southwest means x <1, y<2.
+
+Now, CC has y<1 from above, and y<2 is automatic since y<1<2.
+
+x > -1 and x <1.
+
+Also, from clue 7, CC is southwest of LL, which is consistent.
+
+Now, CCO: northeast of OO.
+
+OO at (0.5,0.5), so CCO has x >0.5, y>0.5.
+
+Also, southeast of MGM: MGM at (-1,1), so CCO has x > -1, y <1.
+
+So CCO has x >0.5, y>0.5, and y<1.
+
+Say CCO at (0.6, 0.6) for example.
+
+Now, the question: which is northwest of OO.
+
+OO is at (0.5,0.5)
+
+Northwest of OO would be points with x < 0.5 and y > 0.5.
+
+Now, look at the objects:
+
+- FF: (0,0) — x=0 <0.5, but y=0 <0.5, whereas northwest requires y>0.5; FF is south of OO, not north. So not northwest.
+
+- OO itself: not.
+
+- MGM: (-1,1) — x=-1 <0.5, y=1 >0.5, so yes, northwest.
+
+- LL: (1,2) — x=1 >0.5, y=2>0.5, so northeast, not northwest.
+
+- CC: let's say we have it. CC is southeast of MGM, so x > -1, y<1. And from above, x<1, y<1.
+
+For example, CC could be at (0,0.5), but y=0.5, which is not greater than 0.5 for northwest? Northwest of OO: y must be >0.5.
+
+If CC is at (0,0.5), y=0.5 not greater than 0.5, so not north, it's same latitude or south.
+
+But typically, "northwest" implies strictly northwest, so y > y_o and x < x_o.
+
+Similarly, in directions, it's usually not the same point.
+
+So for OO at (0.5,0.5), northwest: x < 0.5 and y > 0.5.
+
+CC: y <1, but y could be less than or equal to 0.5.
+
+For example, if CC is at (0,0.4), y=0.4 <0.5, so south, not north.
+
+If CC is at (0,0.6), but y<1, and y>0.5? But from clue, CC is southeast of MGM, MGM y=1, so y_cc <1, but it could be above or below 0.5.
+
+In my coordinate, OO y=0.5, so for CC to be northwest of OO, it needs y>0.5.
+
+But CC is southeast of MGM at y=1, so y_cc <1, but it could be 0.6, for example.
+
+However, from clue 7, CC is southwest of LL.
+
+LL is at (1,2), so y_cc <2, which is true, but also, since LL is northeast of MGM, and MGM y=1, LL y>1, so y_ll >1.
+
+CC is southwest of LL, so y_cc < y_ll, which is true since y_cc <1 < y_ll if y_ll>1.
+
+But y_cc could be, say, 0.6, which is greater than 0.5.
+
+But is it necessarily northwest of OO?
+
+Not necessarily, because OO is at (0.5,0.5), and CC could be at (0.4, 0.6), for example, which would be northwest of OO? x=0.4<0.5, y=0.6>0.5, so yes, it could be.
+
+But in this case, CC might be northwest of OO, but is it?
+
+In my example, if CC is at (0.4,0.6), and OO at (0.5,0.5), then yes, CC is northwest of OO.
+
+But is that consistent with all clues?
+
+Let me check the clues.
+
+We have clue 6: CC is southeast of MGM.
+
+MGM at (-1,1), CC at (0.4,0.6): x=0.4 > -1 (east), y=0.6 <1 (south), so southeast, yes.
+
+Clue 7: CC is southwest of LL. LL at (1,2), CC at (0.4,0.6): x=0.4<1, y=0.6<2, so southwest, yes.
+
+Now, is CC northwest of OO? OO at (0.5,0.5), CC at (0.4,0.6): x=0.4<0.5, y=0.6>0.5, so yes, northwest.
+
+But clue 2 says MGM is northwest of OO, and MGM is at (-1,1), which is also northwest.
+
+So both MGM and CC could be northwest of OO, but that can't be right for the question, as it should be one answer.
+
+Perhaps I have a mistake.
+
+In this setup, MGM is at (-1,1), which is more northwest than CC.
+
+But the question is "which object is in the Northwest", implying there might be one, or we need to see the direct one.
+
+But clue 2 directly states that MGM is northwest of OO, so it should be MGM.
+
+In my coordinate, CC is also northwest of OO in this example, but is CC necessarily northwest?
+
+Let me see the relative positions.
+
+From the clues, is there a constraint that prevents CC from being northwest of OO?
+
+For example, from clue 9: CCO is southeast of MGM.
+
+CCO is at (0.6,0.6) in my example.
+
+CC is at (0.4,0.6), CCO at (0.6,0.6), so same y, but CCO is southeast of MGM, which is fine.
+
+But for CC, it might not be directly northwest of OO; it depends on positions.
+
+However, the question is based on the given directions, and clue 2 explicitly says MGM is northwest of OO, so that should be the answer.
+
+Moreover, in the options, D is Miniature Golf Mart, which is MGM.
+
+Now, let's see why CC might not be northwest.
+
+Perhaps from the other clues.
+
+Another clue: "Cobbler's Corner is to the Southwest of Lighthouse Luminary." and "Lighthouse Luminary is to the Northeast of Miniature Golf Mart."
+
+But I think I have that.
+
+Perhaps I should list all the relations.
+
+Let me make a directed graph or something.
+
+From the clues:
+
+- OO is NE of FF → FF is SW of OO
+
+- MGM is NW of OO → OO is SE of MGM
+
+- MGM is NW of FF → FF is SE of MGM
+
+- LL is NE of FF → FF is SW of LL
+
+- LL is NE of MGM → MGM is SW of LL
+
+- CC is SE of MGM → MGM is NW of CC
+
+- CC is SW of LL → LL is NE of CC
+
+- CCO is NE of OO → OO is SW of CCO
+
+- CCO is SE of MGM → MGM is NW of CCO
+
+Now, for "northwest of OO", we need something that is NW of OO.
+
+From above, MGM is NW of OO, directly.
+
+Is there another?
+
+For example, is FF NW of OO? But FF is SW of OO, from clue 1.
+
+CC: CC is SE of MGM, and MGM is NW of OO, so CC is SE of something that is NW of OO.
+
+If MGM is NW of OO, then SE of MGM would be, relative to OO, it could be south or east, but not northwest.
+
+For instance, if OO is at origin, MGM is at (-1,1), then SE of MGM would be, say, (0,0) or (0,0.5), which for OO at (0,0), SE of MGM at (-1,1) is (x> -1, y<1), so for OO at (0,0), points like (0.5,0.5) would be NE of OO, not NW.
+
+In my earlier coordinate, OO at (0.5,0.5), MGM at (-1,1), SE of MGM: x> -1, y<1, so for example (0,0.4) or (0.4,0.6).
+
+(0.4,0.6) compared to OO (0.5,0.5): x=0.4<0.5, y=0.6>0.5, so NW.
+
+But is (0.4,0.6) directly NW? Yes, but is it necessarily the case?
+
+The issue is that the position of CC is not fixed; it could be at different places.
+
+For example, CC could be at (0.2, 0.4), which has y=0.4 < 0.5, so south of OO, not north.
+
+Or at (0.4, 0.6), which is north and west.
+
+But the clue doesn't specify that CC must be north of OO; it could be south.
+
+Similarly, for the question, we need to see what is definitely northwest.
+
+But clue 2 says MGM is northwest of OO, so it is definitely northwest.
+
+For CC, it might or might not be northwest, depending on positions.
+
+For example, if CC is at (0.2, 0.4), then it is southwest of OO, since x=0.2<0.5 and y=0.4<0.5.
+
+If at (0.4,0.6), it is northwest.
+
+But the clue doesn't constrain it to be north of OO, so it could be south.
+
+Therefore, CC is not necessarily northwest of OO; it could be in other directions.
+
+Whereas MGM is explicitly northwest of OO.
+
+Similarly, CCO is northeast of OO, so definitely not northwest.
+
+FF is southwest, as established.
+
+So only MGM is directly stated to be northwest of OO.
+
+Moreover, in the options, D is Miniature Golf Mart.
+
+But let me check the answer choices.
+
+Options:
+
+A. Cheetah's Chocolates — this is northeast, from clue 8.
+
+B. Frost Funhouse — southwest, from clue 1.
+
+C. Cobbler's Corner — could be various, but not necessarily northwest.
+
+D. Miniature Golf Mart — northwest, from clue 2.
+
+Also, the clue says "Miniature Golf Mart is to the Northwest of Ocelot's Office Ornaments." directly.
+
+So, it should be D.
+
+But in the problem, it says "Miniature Golf Mart is to the Northwest of Ocelot's Office Ornaments." which is clue 2.
+
+And the question is "which object is in the Northwest of Ocelot's Office Ornaments?"
+
+So, it's directly given.
+
+Perhaps I misread the clues.
+
+Let me read the original text.
+
+"Ocelot's Office Ornaments is to the Northeast of Frost Funhouse. Miniature Golf Mart is to the Northwest of Ocelot's Office Ornaments. Miniature Golf Mart is to the Northwest of Frost Funhouse."
+
+Yes, so Miniature Golf Mart is northwest of Ocelot's Office Ornaments.
+
+So, the object northwest of OO is Miniature Golf Mart.
+
+Therefore, the answer should be D.
+
+But why are there other clues? Maybe to distract or to confirm.
+
+For example, "Miniature Golf Mart is to the Northwest of Frost Funhouse" is consistent, as we have.
+
+And the other clues about Lighthouse and Cobbler's might be for other questions, but not relevant here.
+
+Similarly, Cheetah's Chocolates is northeast, not northwest.
+
+So, I think D is correct.
+
+But let me see if there's a trick.
+
+Another thing: the question says "in the Northwest", and "to the Northwest" might be interpreted as the direction, but in context, it's clear.
+
+Perhaps for Cobbler's Corner, but no.
+
+Let me list all the "is to the" statements.
+
+- OO is NE of FF
+
+- MGM is NW of OO
+
+- MGM is NW of FF
+
+- LL is NE of FF
+
+- LL is NE of MGM
+
+- CC is SE of MGM
+
+- CC is SW of LL
+
+- CCO is NE of OO
+
+- CCO is SE of MGM
+
+Now, for OO, the objects relative to it:
+
+- FF: SW (since OO is NE of FF)
+
+- MGM: NW (directly)
+
+- LL: ? Not directly given, but from LL is NE of FF and FF is SW of OO, so LL could be NE or something.
+
+For example, if FF is SW of OO, and LL is NE of FF, then LL could be NE of OO or not, depending.
+
+But not relevant.
+
+- CC: not directly given relative to OO.
+
+- CCO: NE
+
+So only MGM is specified as NW of OO.
+
+Therefore, the answer must be Miniature Golf Mart.
+
+So, option D.
+
+But in the options, it's listed as D. Miniature Golf Mart.
+
+Now, just to be thorough, let's see if from the clues, CC could be NW, but as I thought, it's not necessary.
+
+For example, from LL is NE of MGM, and CC is SW of LL, so CC is SW of LL, and LL is NE of MGM, so CC is not necessarily related to OO directly.
+
+But in terms of direction, since MGM is NW of OO, and CC is SE of MGM, then CC is generally south-east of OO, not north-west.
+
+For instance, if you go from OO to MGM (NW), then from MGM to CC (SE), so from OO to CC, you go NW then SE, which might cancel out, but typically, it would be roughly in the same area or south-east.
+
+In vector terms, if OO to MGM is vector (-a,b) for a>0,b>0, then MGM to CC is vector (c,-d) for c>0,d>0, so OO to CC is (-a+c, b-d), which could have x negative or positive, y positive or negative, so it could be NW, NE, SE, SW, depending on values.
+
+But in the context of the map, with the given clues, it might not be specified, but since the question has a specific answer, and MGM is directly given, it should be D.
+
+Moreover, in multiple-choice, D is the direct one.
+
+Perhaps the problem is that "Miniature Golf Mart" is listed, and it's correct.
+
+Another thing: in the text, it says "Miniature Golf Mart is to the Northwest of Ocelot's Office Ornaments." so it's clear.
+
+I think I'm overcomplicating.
+
+So, the answer should be D. Miniature Golf Mart.
+</think>
+Let me solve this step by step using the structured format:
+<format>
+>>> STEP 1: PARSE RELATIONSHIPS
+    - A is to the DIRECTION of B
+    [... list all given relationships ...]
+
+>>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
+    - Looking for: [target relationship / direction / count]
+    - [reasoning about the relationships]
+    - [use reversibility and transitivity as needed]
+
+>>> STEP 3: ANSWER
+    - [state conclusion]
+
+>>> FINAL ANSWER: [answer text]
+    \boxed{LETTER}
+</format>
+>>> STEP 1: PARSE RELATIONSHIPS
+- Ocelot's Office Ornaments is to the Northeast of Frost Funhouse → Frost Funhouse is to the Southwest of Ocelot's Office Ornaments.
+- Miniature Golf Mart is to the Northwest of Ocelot's Office Ornaments → Ocelot's Office Ornaments is to the Southeast of Miniature Golf Mart.
+- Miniature Golf Mart is to the Northwest of Frost Funhouse → Frost Funhouse is to the Southeast of Miniature Golf Mart.
+- Lighthouse Luminary is to the Northeast of Frost Funhouse → Frost Funhouse is to the Southwest of Lighthouse Luminary.
+- Lighthouse Luminary is to the Northeast of Miniature Golf Mart → Miniature Golf Mart is to the Southwest of Lighthouse Luminary.
+- Cobbler's Corner is to the Southeast of Miniature Golf Mart → Miniature Golf Mart is to the Northwest of Cobbler's Corner.
+- Cobbler's Corner is to the Southwest of Lighthouse Luminary → Lighthouse Luminary is to the Northeast of Cobbler's Corner.
+- Cheetah's Chocolates is to the Northeast of Ocelot's Office Ornaments → Ocelot's Office Ornaments is to the Southwest of Cheetah's Chocolates.
+- Cheetah's Chocolates is to the Southeast of Miniature Golf Mart → Miniature Golf Mart is to the Northwest of Cheetah's Chocolates.
+
+>>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
+- The question asks: "Which object is in the Northwest of Ocelot's Office Ornaments?"
+- From the parsed relationships, the direct statement is: "Miniature Golf Mart is to the Northwest of Ocelot's Office Ornaments."
+- This means Miniature Golf Mart is positioned directly northwest of Ocelot's Office Ornaments.
+- Now, check the options:
+  - A. Cheetah's Chocolates: Cheetah's Chocolates is to the Northeast of Ocelot's Office Ornaments (from "Cheetah's Chocolates is to the Northeast of Ocelot's Office Ornaments"), so it is northeast, not northwest.
+  - B. Frost Funhouse: Frost Funhouse is to the Southwest of Ocelot's Office Ornaments (from "Ocelot's Office Ornaments is to the Northeast of Frost Funhouse"), so it is southwest, not northwest.
+  - C. Cobbler's Corner: Cobbler's Corner is to the Southeast of Miniature Golf Mart (from "Cobbler's Corner is to the Southeast of Miniature Golf Mart"). Since Miniature Golf Mart is northwest of Ocelot's Office Ornaments, Cobbler's Corner is southeast of Miniature Golf Mart, meaning it is generally southeast or south of Ocelot's Office Ornaments, not northwest. It is not directly northwest.
+  - D. Miniature Golf Mart: Directly stated as northwest of Ocelot's Office Ornaments.
+- The other clues (e.g., about Lighthouse Luminary, Cobbler's Corner, Cheetah's Chocolates) are consistent with the positions but do not change the direct relationship given for Miniature Golf Mart.
+
+>>> STEP 3: ANSWER
+- The object directly in the Northwest of Ocelot's Office Ornaments is Miniature Golf Mart.
+
+>>> FINAL ANSWER: D
+\boxed{D}
\ No newline at end of file
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_2examples.csv b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_2examples.csv
new file mode 100644
index 00000000..f7b5eb14
--- /dev/null
+++ b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_2examples.csv
@@ -0,0 +1,3 @@
+index,question_type,correct,attempted,sol,gt,tokens,num_relations,verified_claims,message
+0,direction,True,True,A,Northeast,4736,9,11,Correct: option A -> Northeast
+1499,counting,False,True,C,0,8799,9,17,"Incorrect: expected '0', got '3' (option C)"
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_5examples.csv b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_5examples.csv
new file mode 100644
index 00000000..4400c9f9
--- /dev/null
+++ b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_5examples.csv
@@ -0,0 +1,6 @@
+index,question_type,correct,attempted,sol,gt,tokens,num_relations,verified_claims,message
+0,direction,True,True,A,Northeast,3503,9,1,Correct: option A -> Northeast
+374,direction,True,True,A,Southeast,4142,9,1,Correct: option A -> Southeast
+749,object,True,True,D,Miniature Golf Mart,6550,9,11,Correct: option D -> Miniature Golf Mart
+1124,counting,True,True,A,0,8820,9,0,Correct: option A -> 0
+1499,counting,False,True,C,0,6193,9,0,"Incorrect: expected '0', got '3' (option C)"
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/summary.json b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/summary.json
new file mode 100644
index 00000000..2b63a65a
--- /dev/null
+++ b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/summary.json
@@ -0,0 +1,49 @@
+{
+  "model": "Qwen/Qwen3-30B-A3B-Thinking-2507",
+  "total_examples": 2,
+  "correct": 1,
+  "attempted": 2,
+  "accuracy": 0.5,
+  "soundness": 0.5,
+  "total_reasoning_tokens": 13535,
+  "avg_reasoning_tokens": 6767.5,
+  "max_corrections": 3,
+  "stats_by_type": {
+    "direction": {
+      "total": 1,
+      "correct": 1
+    },
+    "object": {
+      "total": 0,
+      "correct": 0
+    },
+    "counting": {
+      "total": 1,
+      "correct": 0
+    }
+  },
+  "results": [
+    {
+      "idx": 0,
+      "question_type": "direction",
+      "correct": true,
+      "attempted": true,
+      "sol": "A",
+      "gt": "Northeast",
+      "reasoning_tokens": 4736,
+      "num_relations": 9,
+      "verified_claims": 11
+    },
+    {
+      "idx": 1499,
+      "question_type": "counting",
+      "correct": false,
+      "attempted": true,
+      "sol": "C",
+      "gt": "0",
+      "reasoning_tokens": 8799,
+      "num_relations": 9,
+      "verified_claims": 17
+    }
+  ]
+}
\ No newline at end of file
diff --git a/examples/TTSwithVerification/maze_stepverifier.py b/examples/TTSwithVerification/maze_stepverifier.py
index cb9f386d..5adb4202 100644
--- a/examples/TTSwithVerification/maze_stepverifier.py
+++ b/examples/TTSwithVerification/maze_stepverifier.py
@@ -92,20 +92,20 @@ def count_tokens(text: str, tokenizer) -> int:
     return len(tokens)
 
 
-def get_question_type_from_index(idx: int) -> str:
-    """Determine question type based on index range.
+# def get_question_type_from_index(idx: int) -> str:
+#     """Determine question type based on index range.
     
-    Dataset structure:
-    - 3000-3499: right turns
-    - 3500-3999: total turns
-    - 4000-4500: relative position
-    """
-    if idx < 3500:
-        return "right_turns"
-    elif idx < 4000:
-        return "total_turns"
-    else:
-        return "relative_position"
+#     Dataset structure:
+#     - 3000-3499: right turns
+#     - 3500-3999: total turns
+#     - 4000-4500: relative position
+#     """
+#     if idx < 3500:
+#         return "right_turns"
+#     elif idx < 4000:
+#         return "total_turns"
+#     else:
+#         return "relative_position"
 
 
 def init_llm_server(model_name, max_tokens=32768, port=8000):
diff --git a/examples/TTSwithVerification/spatialmap_stepverifier.py b/examples/TTSwithVerification/spatialmap_stepverifier.py
index 4dc33fff..8d0f454f 100644
--- a/examples/TTSwithVerification/spatialmap_stepverifier.py
+++ b/examples/TTSwithVerification/spatialmap_stepverifier.py
@@ -1,11 +1,15 @@
 """
-SpatialMap experiment with step-by-step verification using StepVerifierSpatialMapMonitor.
+SpatialMap experiment with thinking-phase step verification.
 
-Uses the new monitor-based architecture that integrates with stream_completion.
+Uses ThinkingPhaseStepVerifierSpatialMapMonitor which:
+  - Verifies the model's directional claims during <think> via side-streams
+  - Injects a structured step format after </think> (no meta-prompt needed)
+  - Verifies each step as the model fills in the structured template
 """
 
 import argparse
 import asyncio
+import csv
 import json
 import logging
 import os
@@ -17,7 +21,7 @@
 from transformers import AutoTokenizer
 
 from interwhen import stream_completion
-from interwhen.monitors import StepVerifierSpatialMapMonitor
+from interwhen.monitors import ThinkingPhaseStepVerifierSpatialMapMonitor
 
 logging.basicConfig(level=logging.INFO, format='%(message)s')
 logger = logging.getLogger(__name__)
@@ -42,6 +46,7 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/SpatialM
     dirs = {
         "base": output_base,
         "reasoning": os.path.join(output_base, "Reasoning_output"),
+        "csv_saved": os.path.join(output_base, "csv_saved"),
     }
     
     for dir_path in dirs.values():
@@ -50,11 +55,6 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/SpatialM
     return dirs
 
 
-def remove_last_paragraph(s: str) -> str:
-    """Remove the last instruction paragraph from the prompt."""
-    return s[:-143] if len(s) > 143 else s
-
-
 def get_question_type(idx: int) -> str:
     """Determine question type based on index range.
     
@@ -71,174 +71,16 @@ def get_question_type(idx: int) -> str:
         return "counting"
 
 
-def build_meta_prompt_from_example(example):
-    """Build prompt with structured output format instructions."""
-    
-    # Get the description
-    description = example.get("prompt")
-    description = str(description)
-    description = remove_last_paragraph(description)
-    
-    pre_prompt = """You are a spatial reasoning expert. Given a description of objects on a map and their relative positions, analyze the spatial relationships step by step.
-
-CRITICAL INSTRUCTION: DO NOT use abbreviations or initials for entity names. Always use the COMPLETE FULL NAME of each entity exactly as given in the problem. For example, write "Police Supply Store" not "PSS" or "PS".
+def build_simple_prompt(example):
+    """Build a simple user prompt from the spatial map example.
 
-DIRECTION DEFINITIONS (Diagonal Directions):
-- Northwest = up and to the left (row decreases, col decreases)
-- Northeast = up and to the right (row decreases, col increases)  
-- Southwest = down and to the left (row increases, col decreases)
-- Southeast = down and to the right (row increases, col increases)
-
-CARDINAL DIRECTIONS (for questions asking about North/South/East/West):
-- North = directly up - requires BOTH Northwest AND Northeast relationships to be confirmed
-- South = directly down - requires BOTH Southwest AND Southeast relationships to be confirmed
-- West = directly left - requires BOTH Northwest AND Southwest relationships to be confirmed
-- East = directly right - requires BOTH Northeast AND Southeast relationships to be confirmed
-
-IMPORTANT: In this dataset, only diagonal relationships (NW/NE/SW/SE) are given. An object can ONLY be in a pure cardinal direction (N/S/E/W) if BOTH required diagonal relationships exist.
-
-IMPORTANT RULES:
-- Directions are TRANSITIVE: If A is Northwest of B, and B is Northwest of C, then A is Northwest of C.
-- Directions are REVERSIBLE: If A is Northwest of B, then B is Southeast of A.
-- Opposite pairs: Northwest ↔ Southeast, Northeast ↔ Southwest
-
-STRUCTURED OUTPUT FORMAT:
-
-═══════════════════════════════════════════════════════════════════════════════
-EXAMPLE 1: Direction Finding (Q0)
-═══════════════════════════════════════════════════════════════════════════════
-
-Map Description:
-Police Supply Store is in the map. Narwhal's Novelties is to the Northwest of Police Supply Store. Coral Crafts is to the Northwest of Narwhal's Novelties. Coral Crafts is to the Northwest of Police Supply Store. Planetarium Prints is to the Southeast of Coral Crafts. Planetarium Prints is to the Northeast of Police Supply Store. Oz Oddities is to the Southwest of Planetarium Prints. Oz Oddities is to the Southwest of Police Supply Store. Ice Queen Ice Cream is to the Northwest of Planetarium Prints. Ice Queen Ice Cream is to the Southeast of Coral Crafts.
-
-Question: In which direction is Planetarium Prints relative to Police Supply Store?
-
-### Final Answer
-
->>> STEP 1: PARSE RELATIONSHIPS
-    - Narwhal's Novelties is to the Northwest of Police Supply Store
-    - Coral Crafts is to the Northwest of Narwhal's Novelties
-    - Coral Crafts is to the Northwest of Police Supply Store
-    - Planetarium Prints is to the Southeast of Coral Crafts
-    - Planetarium Prints is to the Northeast of Police Supply Store
-    - Oz Oddities is to the Southwest of Planetarium Prints
-    - Oz Oddities is to the Southwest of Police Supply Store
-    - Ice Queen Ice Cream is to the Northwest of Planetarium Prints
-    - Ice Queen Ice Cream is to the Southeast of Coral Crafts
-
->>> STEP 2: FIND DIRECT RELATIONSHIP
-    - Looking for: Planetarium Prints relative to Police Supply Store
-    - Direct relationship found: "Planetarium Prints is to the Northeast of Police Supply Store"
-
->>> STEP 3: ANSWER
-    - Planetarium Prints is to the NORTHEAST of Police Supply Store.
-    
->>> FINAL ANSWER: Northeast
-    \\boxed{A}
-
-═══════════════════════════════════════════════════════════════════════════════
-EXAMPLE 2: Object Finding (Q1)
-═══════════════════════════════════════════════════════════════════════════════
-
-Map Description:
-Quail's Quilts is in the map. Olive's Oils is to the Southeast of Quail's Quilts. Lumber's Marketplace is to the Northeast of Olive's Oils. Lumber's Marketplace is to the Northeast of Quail's Quilts. Stingray Shoes is to the Northeast of Quail's Quilts. Stingray Shoes is to the Northwest of Lumber's Marketplace. Elephant's Electronics is to the Northeast of Olive's Oils. Elephant's Electronics is to the Northeast of Lumber's Marketplace. Blossom Boutique is to the Northwest of Elephant's Electronics. Blossom Boutique is to the Southeast of Stingray Shoes.
-
-Question: Which object is in the Southwest of Lumber's Marketplace?
-
-### Final Answer
-
->>> STEP 1: PARSE RELATIONSHIPS
-    - Olive's Oils is to the Southeast of Quail's Quilts
-    - Lumber's Marketplace is to the Northeast of Olive's Oils
-    - Lumber's Marketplace is to the Northeast of Quail's Quilts
-    - Stingray Shoes is to the Northeast of Quail's Quilts
-    - Stingray Shoes is to the Northwest of Lumber's Marketplace
-    - Elephant's Electronics is to the Northeast of Olive's Oils
-    - Elephant's Electronics is to the Northeast of Lumber's Marketplace
-    - Blossom Boutique is to the Northwest of Elephant's Electronics
-    - Blossom Boutique is to the Southeast of Stingray Shoes
-
->>> STEP 2: FIND OBJECTS IN SOUTHWEST OF Lumber's Marketplace
-    - Using reversibility: if Lumber's Marketplace is to the Northeast of X, then X is to the Southwest of Lumber's Marketplace.
-    - Scanning relationships for "Lumber's Marketplace is to the Northeast of X":
-    - "Lumber's Marketplace is to the Northeast of Olive's Oils" → Olive's Oils is SOUTHWEST of Lumber's Marketplace ✓
-    - "Lumber's Marketplace is to the Northeast of Quail's Quilts" → Quail's Quilts is SOUTHWEST of Lumber's Marketplace ✓
-    - Other objects:
-    - Stingray Shoes is Northwest of Lumber's Marketplace → NOT Southwest
-    - Elephant's Electronics is Northeast of Lumber's Marketplace → NOT Southwest
-    - Blossom Boutique: no direct relationship to Lumber's Marketplace given
-    - Objects in Southwest of Lumber's Marketplace: Olive's Oils, Quail's Quilts
-    - Checking options: Quail's Quilts matches option D.
-
->>> STEP 3: ANSWER
-    - Quail's Quilts is in the Southwest of Lumber's Marketplace.
-    
->>> FINAL ANSWER: Quail's Quilts
-    \\boxed{D}
-
-═══════════════════════════════════════════════════════════════════════════════
-EXAMPLE 3: Counting (Q2)
-═══════════════════════════════════════════════════════════════════════════════
-
-Map Description:
-Tremor Toys is in the map. Fresh Foods is to the Northeast of Tremor Toys. Salmon Sushi is to the Northeast of Fresh Foods. Salmon Sushi is to the Northeast of Tremor Toys. Recycle Center is to the Northeast of Fresh Foods. Recycle Center is to the Southeast of Salmon Sushi. Wolf's Wardrobe is to the Southeast of Fresh Foods. Wolf's Wardrobe is to the Southeast of Tremor Toys. Mantis's Maps is to the Southeast of Salmon Sushi. Mantis's Maps is to the Southeast of Fresh Foods.
-
-Question: How many objects are in the Southwest of Mantis's Maps?
-
-### Final Answer
-
->>> STEP 1: PARSE RELATIONSHIPS
-    - Fresh Foods is to the Northeast of Tremor Toys
-    - Salmon Sushi is to the Northeast of Fresh Foods
-    - Salmon Sushi is to the Northeast of Tremor Toys
-    - Recycle Center is to the Northeast of Fresh Foods
-    - Recycle Center is to the Southeast of Salmon Sushi
-    - Wolf's Wardrobe is to the Southeast of Fresh Foods
-    - Wolf's Wardrobe is to the Southeast of Tremor Toys
-    - Mantis's Maps is to the Southeast of Salmon Sushi
-    - Mantis's Maps is to the Southeast of Fresh Foods
-
->>> STEP 2: COUNT OBJECTS IN SOUTHWEST OF Mantis's Maps
-    - Using reversibility: if Mantis's Maps is to the Southeast of X, then X is to the Northwest of Mantis's Maps (NOT Southwest!).
-    - For X to be Southwest of Mantis's Maps, we need: "Mantis's Maps is to the Northeast of X" or "X is to the Southwest of Mantis's Maps".
-    - Scanning ALL relationships involving Mantis's Maps:
-    - Mantis's Maps is to the Southeast of Salmon Sushi → Salmon Sushi is NORTHWEST of Mantis's Maps (not Southwest)
-    - Mantis's Maps is to the Southeast of Fresh Foods → Fresh Foods is NORTHWEST of Mantis's Maps (not Southwest)
-    - No other relationships mention Mantis's Maps directly.
-    - Checking each object for SOUTHWEST relationship to Mantis's Maps:
-    - Tremor Toys: No direct relationship to Mantis's Maps given. Cannot determine.
-    - Fresh Foods: Northwest of Mantis's Maps (not Southwest)
-    - Salmon Sushi: Northwest of Mantis's Maps (not Southwest)
-    - Recycle Center: No direct relationship to Mantis's Maps given. Cannot determine.
-    - Wolf's Wardrobe: No direct relationship to Mantis's Maps given. Cannot determine.
-    - Count of objects confirmed to be Southwest of Mantis's Maps: 0
-    - But wait - let me check if we can use transitivity:
-    - Wolf's Wardrobe is Southeast of Tremor Toys
-    - Mantis's Maps is Southeast of Fresh Foods, Fresh Foods is Northeast of Tremor Toys
-    - So Mantis's Maps is "more east and south" than Tremor Toys, but exact direction unclear.
-    - Using only DIRECT relationships where we can confirm Southwest: 0 objects.
-    - Checking the options: If 0 is not available, we need to reconsider.
-    - Options available: A. 5, B. 3, C. 2, D. 1
-    - Re-examining with transitivity for Southwest (row increase, col decrease from Mantis's Maps):
-    - For Tremor Toys to be SW of Mantis's Maps: Tremor Toys must be south and west of Mantis's Maps.
-    - Tremor Toys → Fresh Foods (NE) → Mantis's Maps (SE of Fresh Foods)
-    - So Tremor Toys is southwest of Fresh Foods, and Mantis's Maps is southeast of Fresh Foods.
-    - This means Tremor Toys is west of Mantis's Maps, but row comparison is unclear.
-    - Since only 1 object (Tremor Toys) could potentially be SW based on chain reasoning, answer is D. 1.
-
->>> STEP 3: ANSWER
-    - There is 1 object in the Southwest of Mantis's Maps.
-    
->>> FINAL ANSWER: 1
-    \\boxed{D}
-
-═══════════════════════════════════════════════════════════════════════════════
-
-REMINDER: Use the COMPLETE FULL NAME of each entity. DO NOT abbreviate or use initials.
-
-Now solve the following spatial reasoning problem using the EXACT same format."""
-    
-    return pre_prompt, description
+    No system / meta prompt is used — the structured step format is
+    injected by the monitor after ``</think>``.
+    """
+    description = str(example.get("prompt", ""))
+    # Trim the trailing boilerplate instruction paragraph the dataset appends (assumed to be exactly the last 143 characters)
+    description_trimmed = description[:-143] if len(description) > 143 else description
+    return description_trimmed
 
 
 def extract_solution(text: str) -> str:
@@ -293,6 +135,16 @@ def save_output(idx: int, output: str, output_dir: str):
         f.write(output)
     logger.info(f"Saved output to {filepath}")
 
+
+def save_prompt(idx, prompt_with_answer, reason_dir):
+    """Save reasoning trace to file."""
+    os.makedirs(reason_dir, exist_ok=True)
+    filename = os.path.join(reason_dir, f"reason_{idx}.txt")
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write(prompt_with_answer)
+    logger.info(f"Saved reasoning trace to {filename}")
+
+
 def evaluate_spatialmap_answer(answer, options, ground_truth):
     """
     Evaluate a SpatialMap MCQ answer and return (is_correct, extracted_answer, message).
@@ -350,7 +202,15 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
                         help="Maximum number of correction attempts per example")
     parser.add_argument("--port", type=int, default=8000, help="vLLM server port")
     parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logging")
+    parser.add_argument("--newline_threshold", type=int, default=20,
+                        help="Number of \\n\\n in thinking before triggering side verification")
+    parser.add_argument("--warmup", type=int, default=0,
+                        help="Number of \\n\\n to skip before starting side-chain verification (warmup period)")
     args = parser.parse_args()
+
+    logger.info(f"Thinking-phase verification: always on")
+    logger.info(f"  Newline threshold: {args.newline_threshold}")
+    logger.info(f"  Warmup: {args.warmup}")
     
     if args.debug:
         logging.getLogger().setLevel(logging.DEBUG)
@@ -385,6 +245,9 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
     total_correct = 0
     total_examples = 0
     total_reasoning_tokens = 0
+    num_attempted = 0  # examples where a \boxed{} answer was produced (not "no solution")
+    reasoning_token_counts = []
+    per_example_results = []  # list of dicts for CSV
     
     # Per-type stats
     stats_by_type = {
@@ -395,9 +258,9 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
     
     for idx in indices:
         example = dataset[idx]
-        system_prompt, user_prompt = build_meta_prompt_from_example(example)
+        user_prompt = build_simple_prompt(example)
         if str(example.get("ground_truth", "")).strip() == "Q4":
-                target_options = ["A", "B"]
+            target_options = ["A", "B"]
         else:
             target_options = ["A", "B", "C", "D"] 
         keys = "|".join(map(re.escape, target_options))
@@ -409,18 +272,26 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
         # Determine question type
         question_type = get_question_type(idx)
         
-        # Build full prompt
-        full_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n<think>\n"
+        # Build simple prompt — no system/meta prompt; structure injected by monitor
+        full_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n<think>\n"
         
         logger.info(f"\n{'='*60}")
         logger.info(f"Example {idx} ({question_type})")
         logger.info(f"{'='*60}")
         
-        # Create the monitor with the problem text
-        monitor = StepVerifierSpatialMapMonitor.from_prompt(
+        # Always use ThinkingPhaseStepVerifierSpatialMapMonitor:
+        # Phase 1 — verifies during <think> via side-streams
+        # Phase 2a — injects structured step format after </think>
+        # Phase 2b — verifies structured output as model fills it in
+        monitor = ThinkingPhaseStepVerifierSpatialMapMonitor(
+            name="spatialmap_thinking_verifier",
             problem_text=user_prompt,
+            llm_server=llm_server,
+            prompt=full_prompt,
+            newline_threshold=args.newline_threshold,
             max_corrections=args.max_corrections,
-            name="spatialmap_step_verifier"
+            answer_start_token="</think>",
+            warmup_newlines=args.warmup,
         )
         
         logger.info(f"Z3 solver initialized with {len(monitor.z3_solver.parsed_relations)} relations")
@@ -441,14 +312,25 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
             traceback.print_exc()
             continue
         
+        # Save reasoning trace
+        save_prompt(int(idx), answer, reason_dir)
+        logger.info(f"Raw final output:\n{answer}")
+
         # Count generated tokens
         reasoning_tokens = count_tokens(answer, tokenizer)
         total_reasoning_tokens += reasoning_tokens
+        reasoning_token_counts.append(reasoning_tokens)
+        logger.info(f"Generated tokens in this example: {reasoning_tokens}")
         
         # Evaluate the answer
         gt_sol = str(example.get("ground_truth", "")).strip()
         is_correct, extracted_answer, message = evaluate_spatialmap_answer(answer, options, gt_sol)
         
+        # "attempted" = model produced a real \boxed{} answer (not "no solution")
+        attempted = (extracted_answer is not None and extracted_answer.strip().lower() != "no solution")
+        if attempted:
+            num_attempted += 1
+        
         if extracted_answer:
             logger.info(f"Extracted answer: {extracted_answer}")
         logger.info(message)
@@ -459,14 +341,13 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
             
         total_examples += 1
         stats_by_type[question_type]["total"] += 1
-        # Save output
-        save_output(idx, answer, reason_dir)
         
         # Log result
         result = {
             'idx': int(idx),
             'question_type': question_type,
             'correct': is_correct,
+            'attempted': attempted,
             'sol': extracted_answer,
             'gt': gt_sol,
             'reasoning_tokens': reasoning_tokens,
@@ -475,12 +356,26 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
         }
         results.append(result)
         
-        logger.info(f"Result: sol={extracted_answer}, gt={gt_sol}, correct={is_correct}")
+        per_example_results.append({
+            "index": int(idx),
+            "question_type": question_type,
+            "correct": is_correct,
+            "attempted": attempted,
+            "sol": extracted_answer if extracted_answer else "",
+            "gt": gt_sol,
+            "tokens": reasoning_tokens,
+            "num_relations": len(monitor.z3_solver.parsed_relations),
+            "verified_claims": len(monitor.verified_claims),
+            "message": message,
+        })
+        
+        logger.info(f"Result: sol={extracted_answer}, gt={gt_sol}, correct={is_correct}, attempted={attempted}")
         logger.info(f"Verified claims: {len(monitor.verified_claims)}")
         logger.info(f"Reasoning tokens: {reasoning_tokens}")
     
     # Compute final metrics
     accuracy = total_correct / total_examples if total_examples > 0 else 0
+    soundness = total_correct / num_attempted if num_attempted > 0 else 0  # correct / attempted
     avg_reasoning_tokens = total_reasoning_tokens / total_examples if total_examples > 0 else 0
     
     logger.info(f"\n{'='*60}")
@@ -488,7 +383,9 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
     logger.info(f"{'='*60}")
     logger.info(f"Total examples: {total_examples}")
     logger.info(f"Correct: {total_correct}")
+    logger.info(f"Attempted (produced \\boxed answer): {num_attempted}/{total_examples}")
     logger.info(f"Accuracy: {accuracy:.4f} ({total_correct}/{total_examples})")
+    logger.info(f"Soundness: {soundness:.4f} ({total_correct}/{num_attempted})")
     logger.info(f"Total reasoning tokens: {total_reasoning_tokens}")
     logger.info(f"Avg reasoning tokens: {avg_reasoning_tokens:.1f}")
     
@@ -499,12 +396,27 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
             acc = stats["correct"] / stats["total"]
             logger.info(f"  {qtype}: {acc:.4f} ({stats['correct']}/{stats['total']})")
     
+    print(f"\nFinal Accuracy: {total_correct}/{total_examples} ({accuracy:.2%})")
+    print(f"Soundness: {total_correct}/{num_attempted} ({soundness:.2%})")
+    print(f"Average Reasoning Tokens: {avg_reasoning_tokens:.2f}")
+    print(f"Total Reasoning Tokens: {total_reasoning_tokens}")
+    
+    # Save per-example CSV
+    csv_file = os.path.join(output_dirs["csv_saved"], f"results_{total_examples}examples.csv")
+    with open(csv_file, 'w', newline='') as f:
+        writer = csv.DictWriter(f, fieldnames=["index", "question_type", "correct", "attempted", "sol", "gt", "tokens", "num_relations", "verified_claims", "message"])
+        writer.writeheader()
+        writer.writerows(per_example_results)
+    logger.info(f"Per-example CSV saved to {csv_file}")
+    
     # Save summary
     summary = {
         'model': args.model,
         'total_examples': total_examples,
         'correct': total_correct,
+        'attempted': num_attempted,
         'accuracy': accuracy,
+        'soundness': soundness,
         'total_reasoning_tokens': total_reasoning_tokens,
         'avg_reasoning_tokens': avg_reasoning_tokens,
         'max_corrections': args.max_corrections,
@@ -516,3 +428,39 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
     with open(summary_path, 'w') as f:
         json.dump(summary, f, indent=2)
     logger.info(f"\nSaved summary to {summary_path}")
+    
+    # Save results summary to a text file
+    results_file = os.path.join(output_dirs["base"], f"EAT_{total_examples}examples_results.txt")
+    with open(results_file, 'w') as f:
+        f.write(f"SpatialMap Step Verification Results\n")
+        f.write(f"{'='*50}\n\n")
+        f.write(f"Model: {args.model}\n")
+        f.write(f"Number of Examples: {total_examples}\n")
+        f.write(f"Max Corrections: {args.max_corrections}\n")
+        f.write(f"Newline Threshold: {args.newline_threshold}\n")
+        f.write(f"Warmup: {args.warmup}\n")
+        f.write(f"\n")
+        f.write(f"Results:\n")
+        f.write(f"---------\n")
+        f.write(f"Correct: {total_correct}/{total_examples}\n")
+        f.write(f"Accuracy: {accuracy:.2%}\n")
+        f.write(f"Attempted (produced \\boxed answer): {num_attempted}/{total_examples}\n")
+        f.write(f"Soundness (correct/attempted): {soundness:.2%}\n\n")
+        f.write(f"Per-type Breakdown:\n")
+        f.write(f"---------------------------\n")
+        for qtype, stats in stats_by_type.items():
+            if stats["total"] > 0:
+                acc = stats["correct"] / stats["total"]
+                f.write(f"  {qtype}: {acc:.2%} ({stats['correct']}/{stats['total']})\n")
+        f.write(f"\nToken Statistics:\n")
+        f.write(f"---------------------------\n")
+        f.write(f"Total Tokens: {total_reasoning_tokens}\n")
+        f.write(f"Average Tokens: {avg_reasoning_tokens:.2f}\n")
+        if reasoning_token_counts:
+            f.write(f"Median Tokens: {float(np.median(reasoning_token_counts)):.0f}\n")
+            f.write(f"Min Tokens: {min(reasoning_token_counts)}\n")
+            f.write(f"Max Tokens: {max(reasoning_token_counts)}\n")
+            f.write(f"Std Dev: {np.std(reasoning_token_counts):.2f}\n")
+    
+    logger.info(f"Results saved to {results_file}")
+    print(f"Results saved to {results_file}")
diff --git a/interwhen/monitors/__init__.py b/interwhen/monitors/__init__.py
index daf42436..2a366822 100644
--- a/interwhen/monitors/__init__.py
+++ b/interwhen/monitors/__init__.py
@@ -2,4 +2,4 @@
 from .k_stable import KstableAnswerMCQMonitor, KstableAnswerGame24Monitor
 from .stepVerifier import StepVerifierGame24Monitor, StepVerifierMazeMonitor, StepVerifierSpatialMapMonitor
 from .earlyStopping import EATMonitor, DEERMonitor
-from .thinkingPhaseVerifier import ThinkingPhaseStepVerifierGame24Monitor, ThinkingPhaseStepVerifierMazeMonitor
\ No newline at end of file
+from .thinkingPhaseVerifier import ThinkingPhaseStepVerifierGame24Monitor, ThinkingPhaseStepVerifierMazeMonitor, ThinkingPhaseStepVerifierSpatialMapMonitor
\ No newline at end of file
diff --git a/interwhen/monitors/thinkingPhaseVerifier.py b/interwhen/monitors/thinkingPhaseVerifier.py
index 6e09bc69..cf8f656f 100644
--- a/interwhen/monitors/thinkingPhaseVerifier.py
+++ b/interwhen/monitors/thinkingPhaseVerifier.py
@@ -66,7 +66,7 @@
 import re
 import json
 import logging
-from typing import List, Tuple, Optional
+from typing import Dict, List, Set, Tuple, Optional
 from copy import deepcopy
 
 import httpx
@@ -81,6 +81,11 @@
     verify_locate_section, format_maze_feedback, format_locate_feedback,
     DIRECTION_DELTAS, compute_relative_direction,
 )
+from ..utils.spatialmap_verifier import (
+    SpatialMapZ3Solver, extract_step2_claims,
+    parse_directional_claims_from_text,
+    verify_spatialmap_step, format_spatialmap_feedback,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -1619,3 +1624,591 @@ async def fix(self, generated_text: str, event_info: dict, fix_method=None):
             f"  -> Appending [VERIFIER FEEDBACK] so model retries."
         )
         return event_info["generated_text"] + event_info["feedback"]
+
+
+# =====================================================================
+#  SpatialMap Thinking-Phase Prompts
+# =====================================================================
+
+
+def _build_spatialmap_format_block() -> str:
+    """
+    Return the ``<format>...</format>`` template that lays out the
+    structured answer expected for SpatialMap tasks: three ``>>> STEP``
+    sections followed by a ``>>> FINAL ANSWER`` line and a boxed letter.
+
+    The block has no trailing newline; callers add their own separators.
+    """
+    return "\n".join((
+        "<format>",
+        ">>> STEP 1: PARSE RELATIONSHIPS",
+        "    - A is to the DIRECTION of B",
+        "    [... list all given relationships ...]",
+        "",
+        ">>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS",
+        "    - Looking for: [target relationship / direction / count]",
+        "    - [reasoning about the relationships]",
+        "    - [use reversibility and transitivity as needed]",
+        "",
+        ">>> STEP 3: ANSWER",
+        "    - [state conclusion]",
+        "",
+        ">>> FINAL ANSWER: [answer text]",
+        "    \\boxed{LETTER}",
+        "</format>",
+    ))
+
+
+def _build_spatialmap_thinking_phase_prompt(
+    parsed_relations: List[Dict],
+) -> str:
+    """
+    Return the text appended to the partial ``<think>`` trace before a
+    side-stream request.
+
+    STEP 1 is pre-filled from ``parsed_relations`` (each entry is read
+    through its ``'A'``, ``'direction'`` and ``'B'`` keys; ``(none)`` is
+    shown for an empty list).  The text stops right after the STEP 2
+    header so the model's continuation is its list of derived claims.
+    It is phrased in the first person to read like the model's own
+    thinking.
+    """
+    # One bullet per given relation, in input order.
+    given_bullets = "\n".join(
+        f"    - {rel['A']} is to the {rel['direction']} of {rel['B']}"
+        for rel in parsed_relations
+    )
+    step1_body = given_bullets or "    (none)"
+
+    return (
+        "\n\nLet me organize what I have so far. I will list the given "
+        "relationships in STEP 1, then in STEP 2 I will state every "
+        "spatial claim I have derived using FULL object names (no "
+        "abbreviations) in exactly this form:\n"
+        "    - [Full Name A] is to the [direction] of [Full Name B]\n"
+        "For direction I will use the full word: northeast, northwest, "
+        "southeast, southwest, north, south, east, or west.\n\n"
+        ">>> STEP 1: PARSE RELATIONSHIPS (given)\n"
+        f"{step1_body}\n\n"
+        ">>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS (derived)\n"
+        "Based on my analysis so far, the derived relationships are:\n"
+    )
+
+
+def _build_spatialmap_structured_prompt() -> str:
+    """
+    Return the text injected after ``</think>``: an introductory
+    sentence, the ``<format>`` template, and a ``>>> STEP 1`` header.
+
+    The text ends with the STEP 1 header, which is where the model's
+    structured output is expected to continue.
+    """
+    intro = "\nLet me solve this step by step using the structured format:\n"
+    step1_header = ">>> STEP 1: PARSE RELATIONSHIPS\n"
+    # Template and header are separated by a single newline.
+    template = _build_spatialmap_format_block() + "\n"
+    return intro + template + step1_header
+
+
+# =====================================================================
+#  ThinkingPhaseStepVerifierSpatialMapMonitor
+# =====================================================================
+
+
+class ThinkingPhaseStepVerifierSpatialMapMonitor(VerifyMonitor):
+    """
+    Monitor that verifies spatial-map directional claims during and after
+    thinking.
+
+    **No meta-prompt required** — works with a plain user prompt containing
+    just the map description and question.  Structure is injected by this
+    monitor after ``</think>`` (natural or early-stop), exactly like the
+    Maze monitor injects its step format.
+
+    Phase 1 – During ``<think>...</think>``:
+        Every N double-newlines (after warmup), fork a side-stream that
+        injects a structured step prompt, stream tokens, parse directional
+        claims from STEP 2, and verify them against Z3.
+
+    Phase 2a – ``</think>`` detected, structured prompt not yet injected:
+        Inject the structured step-by-step format template so the model
+        fills it in (STEP 1 → STEP 2 → STEP 3 → FINAL ANSWER → ``\\boxed{}``).
+
+    Phase 2b – Structured prompt injected, model is generating:
+        Verify directional claims in STEP 2 as they appear.  Once
+        ``\\boxed{}`` appears, signal completion.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        problem_text: str,
+        llm_server: dict,
+        prompt: str,
+        newline_threshold: int = 15,
+        max_corrections: int = 5,
+        answer_start_token: str = "</think>",
+        async_execution: bool = True,
+        warmup_newlines: int = 0,
+    ):
+        """Store the settings, build the Z3 solver from the problem text, and pre-build both injected prompts."""
+        super().__init__(name)
+        # -- caller-supplied context --
+        self.prompt = prompt
+        self.llm_server = llm_server
+        self.problem_text = problem_text
+        self.answer_start_token = answer_start_token
+        self.async_execution = async_execution
+        # -- verification cadence and budget --
+        self.warmup_newlines = warmup_newlines
+        self.newline_threshold = newline_threshold
+        self.max_corrections = max_corrections
+        # -- mutable state --
+        self.verified_claims: Set[Tuple[str, str, str]] = set()
+        self._think_phase_corrections = 0
+        # Text whose presence after the answer token shows the template was injected.
+        self._structured_marker = ">>> STEP 1: PARSE RELATIONSHIPS"
+        # Solver seeded from the problem statement; its parsed relations pre-fill STEP 1 below.
+        solver = SpatialMapZ3Solver(problem_text)
+        self.z3_solver = solver
+        self._structured_prompt = _build_spatialmap_structured_prompt()
+        self._thinking_phase_prompt = _build_spatialmap_thinking_phase_prompt(
+            solver.parsed_relations,
+        )
+
+    @classmethod
+    def from_prompt(
+        cls,
+        problem_text: str,
+        llm_server: dict,
+        prompt: str,
+        newline_threshold: int = 15,
+        max_corrections: int = 5,
+        warmup_newlines: int = 0,
+        name: str = "spatialmap_thinking_verifier",
+    ) -> "ThinkingPhaseStepVerifierSpatialMapMonitor":
+        """Alternate constructor taking the problem first; other options keep their defaults."""
+        # ``answer_start_token`` and ``async_execution`` are not exposed here.
+        settings = dict(
+            name=name,
+            problem_text=problem_text,
+            llm_server=llm_server,
+            prompt=prompt,
+            newline_threshold=newline_threshold,
+            max_corrections=max_corrections,
+            warmup_newlines=warmup_newlines,
+        )
+        return cls(**settings)
+
+    # ------------------------------------------------------------------
+    #  helpers
+    # ------------------------------------------------------------------
+    def _count_feedback_blocks(self, text: str) -> int:
+        return sum(1 for _ in re.finditer(r'\[VERIFIER FEEDBACK[^\]]*\]', text))  # one per feedback header
+
+    def _is_in_thinking_phase(self, generated_text: str) -> bool:
+        return generated_text.find(self.answer_start_token) < 0  # answer token not emitted yet
+
+    def _structured_prompt_injected(self, generated_text: str) -> bool:
+        """Return True when the STEP 1 marker occurs after the first answer token."""
+        _, token, tail = generated_text.partition(self.answer_start_token)
+        if not token:
+            return False  # answer token absent: still in the thinking phase
+        return self._structured_marker in tail
+
+    def _extract_new_claims(self, text: str) -> List[Dict]:
+        """
+        Return the STEP 2 claims of the latest attempt that are not yet in
+        ``self.verified_claims``.
+
+        Only the text after the last ``[VERIFIER FEEDBACK ...]`` block is
+        scanned (the whole text when no such block exists).  Claims are
+        keyed by their ``(A, direction, B)`` triple.
+        """
+        marker_re = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]', re.DOTALL)
+        # Offset just past the final feedback block; 0 when none is present.
+        block_ends = [m.end() for m in marker_re.finditer(text)]
+        tail_start = block_ends[-1] if block_ends else 0
+        candidates = extract_step2_claims(text[tail_start:])
+
+        already_seen = self.verified_claims
+        return [
+            c
+            for c in candidates
+            if (c['A'], c['direction'], c['B']) not in already_seen
+        ]
+
+    # ------------------------------------------------------------------
+    #  _side_stream_spatialmap – streams tokens to get analysis
+    # ------------------------------------------------------------------
+    async def _side_stream_spatialmap(self, text_so_far: str, max_new_tokens: int = 400) -> str:
+        """
+        Stream a completion for ``self.prompt + text_so_far`` from the LLM
+        server and return the text generated (at most *max_new_tokens* tokens).
+
+        Reading stops early at ``[DONE]``, ``\\boxed`` or ``>>> FINAL ANSWER``.
+        An HTTP error status is logged as a warning and yields ``""``.
+        """
+        logger.info(
+            f"[SpatialMap Side-stream] Starting analysis extraction\n"
+            f"  Relations: {len(self.z3_solver.parsed_relations)}\n"
+            f"  Max new tokens: {max_new_tokens}"
+        )
+        # Work on a copy so the shared server payload is never mutated.
+        payload = deepcopy(self.llm_server["payload"])
+        payload["prompt"] = self.prompt + text_so_far
+        payload["max_tokens"] = max_new_tokens
+        payload.pop("logprobs", None)
+
+        generated = ""
+        async with httpx.AsyncClient(timeout=None) as client:
+            async with client.stream(
+                "POST",
+                self.llm_server["url"],
+                headers=self.llm_server["headers"],
+                json=payload,
+            ) as response:
+                if response.status_code >= 400:
+                    # Report the failure; otherwise an error response would just look like empty output.
+                    logger.warning(f"[SpatialMap Side-stream] HTTP {response.status_code} from LLM server")
+                    return generated
+                async for line in response.aiter_lines():
+                    if not line.startswith("data: "):
+                        continue
+                    data = line[len("data: "):].strip()
+                    if data == "[DONE]":
+                        break
+                    chunk = json.loads(data)["choices"][0]["text"]
+                    generated += chunk
+                    logger.debug(f"[SpatialMap Side-stream] chunk: {chunk!r}")
+                    # Stop as soon as the output reaches FINAL ANSWER or \boxed.
+                    if '\\boxed' in generated or '>>> FINAL ANSWER' in generated:
+                        break
+        logger.info(f"[SpatialMap Side-stream] Generated {len(generated)} chars")
+        return generated
+
+    # ------------------------------------------------------------------
+    #  step_extractor
+    # ------------------------------------------------------------------
+    def step_extractor(self, chunk: str, generated_text: str):
+        """
+        Decide whether ``generated_text`` should be handed to ``verify`` now.
+
+        Returns ``(True, text_to_verify)`` to trigger verification, otherwise
+        ``(False, None)``.  ``chunk`` is not inspected.
+
+        Phase 1 (answer token absent): trigger when the text ends with
+            ``\\n\\n`` and the ``\\n\\n`` count past the warmup is a multiple
+            of ``newline_threshold``.  Never triggers once ``max_corrections``
+            thinking corrections were made, or when the threshold is 0.
+        Phase 2 (answer token present): trigger to inject the structured
+            prompt, on a completed STEP 2 holding unverified claims, and on a
+            ``\\boxed{...}`` answer.
+        """
+        # ===== PHASE 1: still inside <think> =====
+        if self._is_in_thinking_phase(generated_text):
+            if self._think_phase_corrections >= self.max_corrections:
+                return False, None
+            # A zero threshold would make the modulo below raise
+            # ZeroDivisionError, so treat it as "Phase 1 checks disabled".
+            if self.newline_threshold == 0:
+                return False, None
+
+            total_double_newlines = generated_text.count('\n\n')
+            if total_double_newlines < self.warmup_newlines:
+                return False, None
+
+            past_warmup = total_double_newlines - self.warmup_newlines
+            if (generated_text.endswith('\n\n')
+                    and past_warmup % self.newline_threshold == 0):
+                logger.info(
+                    f"[SpatialMap step_extractor] Phase 1 trigger: \\n\\n count={total_double_newlines} "
+                    f"(warmup={self.warmup_newlines}, past_warmup={past_warmup}, "
+                    f"threshold={self.newline_threshold})"
+                )
+                return True, generated_text
+            return False, None
+
+        # ===== PHASE 2: after </think> =====
+        # 2a: structured prompt not yet injected → trigger immediately
+        if not self._structured_prompt_injected(generated_text):
+            logger.info(
+                "[SpatialMap step_extractor] Phase 2a: </think> detected, "
+                "structured prompt not yet injected."
+            )
+            return True, generated_text
+
+        # 2b: structured prompt injected — verify STEP 2 claims / boxed answer
+        think_end_pos = generated_text.find(self.answer_start_token) + len(self.answer_start_token)
+        text_after_think = generated_text[think_end_pos:]
+
+        # Strip out the injected <format>...</format> template so we only
+        # look at actual model output (which starts after the last marker).
+        last_marker_pos = text_after_think.rfind(self._structured_marker)
+        if last_marker_pos >= 0:
+            model_output_start = last_marker_pos + len(self._structured_marker)
+            text_after_think = text_after_think[model_output_start:]
+            text_start_offset = think_end_pos + model_output_start
+        else:
+            text_start_offset = think_end_pos
+
+        # Skip past feedback blocks
+        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
+        last_feedback_end = 0
+        for match in feedback_pattern.finditer(text_after_think):
+            last_feedback_end = match.end()
+        text = text_after_think[last_feedback_end:]
+        text_start = text_start_offset + last_feedback_end
+
+        # Check for STEP 2 section with claims
+        step2_pattern = re.compile(
+            r'>>>\s*STEP\s*2[:\s].*?(?=>>>\s*STEP\s*3|>>>\s*FINAL|\\boxed|$)',
+            re.DOTALL | re.IGNORECASE
+        )
+        step2_match = step2_pattern.search(text)
+
+        if step2_match:
+            # Check if STEP 3 or FINAL has started (STEP 2 is complete)
+            text_after_step2 = text[step2_match.end():]
+            step3_or_final = re.search(
+                r'>>>\s*(STEP\s*3|FINAL)',
+                text_after_step2,
+                re.IGNORECASE
+            )
+            if step3_or_final:
+                new_claims = self._extract_new_claims(text)
+                if new_claims:
+                    end_pos = text_start + step2_match.end()
+                    return True, generated_text[:end_pos]
+
+        # A boxed answer always triggers, whether or not new claims remain.
+        boxed_match = re.search(r'\\boxed\{[^}]+\}', text)
+        if boxed_match:
+            return True, generated_text[:text_start + boxed_match.end()]
+        return False, None
+
+    # ------------------------------------------------------------------
+    #  verify
+    # ------------------------------------------------------------------
+    async def verify(self, step: str, token_index: int, event, event_info):
+        """
+        Case 1 -- still in thinking (no </think>):
+            Fork side-stream, parse claims, verify with Z3.
+        Case 2 -- after </think>:
+            2a: Inject structured prompt.
+            2b: Verify STEP 2 claims and/or final answer.
+        """
+
+        # ==================================================================
+        # CASE 1: Thinking phase – side-stream verification
+        # ==================================================================
+        if self.answer_start_token not in step:
+            total_dn = step.count('\n\n')
+            logger.info(
+                f"[SpatialMap Phase 1] Thinking-phase verification triggered\n"
+                f"  \\n\\n count  : {total_dn}\n"
+                f"  Thinking len : {len(step)} chars"
+            )
+
+            # Build text with injected prompt for analysis extraction
+            text_with_prompt = step + self._thinking_phase_prompt
+
+            # Side-stream: get analysis from the model
+            side_output = await self._side_stream_spatialmap(
+                text_with_prompt, max_new_tokens=600
+            )
+
+            if not side_output or len(side_output.strip()) < 20:
+                logger.info(
+                    "[SpatialMap Phase 1] Insufficient output from side-stream. "
+                    "Letting model continue thinking."
+                )
+                return step, None
+
+            # Parse directional claims directly from the side-stream output.
+            # The prompt pre-fills STEP 1 and ends at ">>> STEP 2:", so the
+            # model's output is already STEP 2 content — no header to search for.
+            claims = parse_directional_claims_from_text(side_output)
+
+            logger.info(
+                f"[SpatialMap Phase 1] Parsed {len(claims)} claims from side-stream.\n"
+                f"  Side-stream output (first 500 chars): {side_output[:500]!r}"
+            )
+
+            if not claims:
+                logger.info(
+                    "[SpatialMap Phase 1] No directional claims found in side-stream. "
+                    "Letting model continue thinking."
+                )
+                return step, None
+
+            # Verify each claim against Z3
+            for claim in claims:
+                claim_key = (claim['A'], claim['direction'], claim['B'])
+                if claim_key in self.verified_claims:
+                    continue
+
+                is_valid, errors = verify_spatialmap_step(
+                    claim=claim,
+                    z3_solver=self.z3_solver,
+                    add_if_valid=True,
+                )
+                self.verified_claims.add(claim_key)
+
+                if not is_valid:
+                    self._think_phase_corrections += 1
+                    error_summary = "; ".join(errors)
+                    logger.info(
+                        f"[SpatialMap Phase 1] INVALID claim: "
+                        f"{claim['A']} is {claim['direction']} of {claim['B']}\n"
+                        f"  Error(s) : {error_summary}\n"
+                        f"  Corrections: {self._think_phase_corrections}/{self.max_corrections}"
+                    )
+                    thinking_feedback = (
+                        f"\n\nWait, I think I made an error in my spatial reasoning. "
+                        f"{error_summary}. "
+                        f"Let me re-examine the relationships more carefully.\n"
+                    )
+                    if not event.is_set():
+                        event_info["generated_text"] = step
+                        event_info["feedback"] = thinking_feedback
+                        event_info["correction_index"] = token_index
+                        event_info["errors"] = errors
+                        event_info["phase"] = "rollback_to_thinking"
+                        event.set()
+                    return step, thinking_feedback
+
+            # All claims valid
+            logger.info(
+                f"[SpatialMap Phase 1] All {len(claims)} claims valid. "
+                f"Letting model continue thinking."
+            )
+            return step, None
+
+        # ==================================================================
+        # CASE 2a: </think> present but structured prompt not yet injected
+        # ==================================================================
+        if not self._structured_prompt_injected(step):
+            logger.info(
+                "[SpatialMap Phase 2a] </think> detected. "
+                "Injecting structured step format."
+            )
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = self._structured_prompt
+                event_info["correction_index"] = token_index
+                event_info["phase"] = "inject_structured_prompt"
+                event.set()
+            return step, self._structured_prompt
+
+        # ==================================================================
+        # CASE 2b: Structured prompt injected — verify output
+        # ==================================================================
+        num_corrections = self._count_feedback_blocks(step)
+        if num_corrections >= self.max_corrections:
+            fb = "\nthe answer is \\boxed{no solution}"
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = fb
+                event_info["correction_index"] = token_index
+                event_info["errors"] = ["Max corrections reached"]
+                event_info["phase"] = "standard_verify"
+                event.set()
+            return step, fb
+
+        think_end_pos = step.find(self.answer_start_token) + len(self.answer_start_token)
+        text_after_think = step[think_end_pos:]
+
+        # Strip the injected template — only look at model output after marker
+        last_marker_pos = text_after_think.rfind(self._structured_marker)
+        if last_marker_pos >= 0:
+            text_after_think = text_after_think[last_marker_pos:]
+
+        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
+        last_feedback_end = 0
+        for match in feedback_pattern.finditer(text_after_think):
+            last_feedback_end = match.end()
+        recent_text = text_after_think[last_feedback_end:]
+
+        # --- Verify STEP 2 claims ---
+        new_claims = self._extract_new_claims(recent_text)
+
+        for claim in new_claims:
+            claim_key = (claim['A'], claim['direction'], claim['B'])
+
+            is_valid, errors = verify_spatialmap_step(
+                claim=claim,
+                z3_solver=self.z3_solver,
+                add_if_valid=True,
+            )
+            self.verified_claims.add(claim_key)
+
+            if not is_valid:
+                feedback = format_spatialmap_feedback(errors, claim)
+                if not event.is_set():
+                    event_info["generated_text"] = step
+                    event_info["feedback"] = feedback
+                    event_info["correction_index"] = token_index
+                    event_info["errors"] = errors
+                    event_info["failed_step"] = claim
+                    event_info["phase"] = "standard_verify"
+                    event.set()
+                return step, feedback
+
+        # --- Check for boxed answer ---
+        boxed_match = re.search(r'\\boxed\{[^}]+\}', recent_text)
+        if boxed_match:
+            logger.info(
+                f"[SpatialMap Phase 2b] Boxed answer found. Stopping."
+            )
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = ""
+                event_info["correction_index"] = token_index
+                event_info["phase"] = "final_answer_correct"
+                event.set()
+            return step, None
+
+        # All claims valid, no boxed yet
+        return step, None
+
+    # ------------------------------------------------------------------
+    #  fix
+    # ------------------------------------------------------------------
+    async def fix(self, generated_text: str, event_info: dict, fix_method=None):
+        """Build the text to resume from, based on ``event_info["phase"]``."""
+        phase = event_info.get("phase", "standard_verify")
+
+        if phase == "rollback_to_thinking":
+            # Trailing whitespace is trimmed before the feedback is appended.
+            resumed = event_info["generated_text"].rstrip() + event_info["feedback"]
+            logger.info(
+                f"[SpatialMap fix] Phase: rollback_to_thinking\n"
+                f"  -> Appended error feedback into <think> trace.\n"
+                f"  -> Think-phase corrections: {self._think_phase_corrections}/{self.max_corrections}"
+            )
+            return resumed
+        elif phase == "inject_structured_prompt":
+            logger.info(
+                "[SpatialMap fix] Phase: inject_structured_prompt\n"
+                "  -> Appending structured step format after </think>."
+            )
+        elif phase == "final_answer_correct":
+            logger.info(
+                "[SpatialMap fix] Phase: final_answer_correct\n"
+                "  -> Stopping generation."
+            )
+            # Nothing is appended: the text is returned as generated.
+            return event_info["generated_text"]
+        else:
+            # standard_verify (also the default for a missing/unknown phase)
+            reported = event_info.get("errors", [])
+            summary = "; ".join(reported) if reported else "unknown"
+            logger.info(
+                f"[SpatialMap fix] Phase: standard_verify\n"
+                f"  -> Error: {summary}\n"
+                f"  -> Appending [VERIFIER FEEDBACK] so model retries."
+            )
+
+        # inject_structured_prompt and standard_verify both append the feedback.
+        return event_info["generated_text"] + event_info["feedback"]
diff --git a/interwhen/utils/spatialmap_verifier.py b/interwhen/utils/spatialmap_verifier.py
index 2ea94e42..7ea52da1 100644
--- a/interwhen/utils/spatialmap_verifier.py
+++ b/interwhen/utils/spatialmap_verifier.py
@@ -185,15 +185,32 @@ def parse_directional_claims_from_text(text: str) -> List[Dict]:
     - "X is to the northwest of Y"
     - "X is NORTHWEST of Y"
     - "X is northwest of Y" (affirmative claims)
+    - "X is NW of Y" (abbreviated directions)
+    - "[X] is to the northwest of [Y]" (bracket-wrapped names)
     
     Returns list of IR dicts: [{"A": ..., "direction": ..., "B": ...}, ...]
     """
+    # Expand abbreviated directions before parsing
+    abbrev_map = {
+        'NW': 'northwest', 'NE': 'northeast',
+        'SW': 'southwest', 'SE': 'southeast',
+    }
+    expanded_text = text
+    for abbr, full in abbrev_map.items():
+        # Replace standalone abbreviations like "is NE of" → "is northeast of"
+        expanded_text = re.sub(
+            rf'\b{abbr}\b(?=\s+of\b)', full, expanded_text
+        )
+
+    # Strip square brackets around entity names: [Foo Bar] → Foo Bar
+    expanded_text = re.sub(r'\[([A-Z][A-Za-z\'\s]*?)\]', r'\1', expanded_text)
+
     claims = []
     
     # Pattern: "X is (to the) DIRECTION of Y"
     pattern = r"([A-Z][A-Za-z'][A-Za-z'\s]*?)\s+is\s+(?:to\s+the\s+)?(northwest|northeast|southwest|southeast|north|south|east|west)\s+of\s+([A-Z][A-Za-z'][A-Za-z'\s]*?)(?:\.|,|\s*[→✓✗]|\s*$|\s+(?:and|so|which|therefore|thus|but|\())"
     
-    matches = re.finditer(pattern, text, re.IGNORECASE)
+    matches = re.finditer(pattern, expanded_text, re.IGNORECASE)
     
     for match in matches:
         entity_a = match.group(1).strip()
@@ -209,7 +226,7 @@ def parse_directional_claims_from_text(text: str) -> List[Dict]:
                       'which', 'what', 'where', 'when', 'also', 'not', 'the', 'a', 'an'}
         if entity_a.lower() in skip_words or entity_b.lower() in skip_words:
             continue
-        if len(entity_a) < 3 or len(entity_b) < 3:
+        if len(entity_a) < 2 or len(entity_b) < 2:
             continue
         if not entity_a[0].isupper():
             continue

From 143a70bc0e78c18eba9dd128578a291d9744cc9f Mon Sep 17 00:00:00 2001
From: root
 <root@hp-reasoning-t-vishakbhat-interwhen-8gpus-trainer-0.hp-reasoning-t-vishakbhat-interwhen-8gpus.bonete11.svc.cluster.local>
Date: Fri, 20 Feb 2026 13:56:30 +0000
Subject: [PATCH 06/17] sm

---
 interwhen/monitors/thinkingPhaseVerifier.py | 393 +++++++++++++++++++
 interwhen/utils/spatialmap_verifier.py      | 409 ++++++++++++++++++++
 2 files changed, 802 insertions(+)

diff --git a/interwhen/monitors/thinkingPhaseVerifier.py b/interwhen/monitors/thinkingPhaseVerifier.py
index cf8f656f..2f01e66b 100644
--- a/interwhen/monitors/thinkingPhaseVerifier.py
+++ b/interwhen/monitors/thinkingPhaseVerifier.py
@@ -84,6 +84,11 @@
 from ..utils.spatialmap_verifier import (
     SpatialMapZ3Solver, extract_step2_claims,
     parse_directional_claims_from_text,
+    parse_counting_question, parse_model_count_from_answer,
+    parse_direction_question, parse_object_question,
+    parse_model_boxed_answer,
+    get_possible_directions, get_consistent_object_options,
+    get_possible_count_range,
     verify_spatialmap_step, format_spatialmap_feedback,
 )
 
@@ -1776,6 +1781,62 @@ def __init__(
         self._think_phase_corrections = 0
         self.verified_claims: Set[Tuple[str, str, str]] = set()
 
+        # ---- counting-question verification ----
+        self._counting_question = parse_counting_question(problem_text)
+        self._counting_options: Dict[str, str] = {}
+        # Strip trailing instruction paragraph for clean option parsing
+        _opts_text = re.split(r'\nFirst,', problem_text, maxsplit=1)[0]
+        if self._counting_question:
+            # Parse MCQ options from problem text (e.g., "A. 5\nB. 3\nC. 0\nD. 1")
+            raw_opts = re.findall(
+                r'([A-D])\.\s*(.+?)\s*(?=[A-D]\.|$)',
+                _opts_text, flags=re.DOTALL,
+            )
+            self._counting_options = {
+                k: v.strip().rstrip(".") for k, v in raw_opts
+            }
+            logger.info(
+                f"[SpatialMap] Counting question detected: "
+                f"direction={self._counting_question['direction']}, "
+                f"reference={self._counting_question['reference']}, "
+                f"options={self._counting_options}"
+            )
+        self._count_feedback_given = False  # only give count feedback once
+
+        # ---- direction-question verification ----
+        self._direction_question = parse_direction_question(problem_text)
+        if self._direction_question:
+            logger.info(
+                f"[SpatialMap] Direction question detected: "
+                f"entity_a={self._direction_question['entity_a']}, "
+                f"entity_b={self._direction_question['entity_b']}"
+            )
+
+        # ---- object-question verification ----
+        self._object_question = parse_object_question(problem_text)
+        if self._object_question:
+            logger.info(
+                f"[SpatialMap] Object question detected: "
+                f"direction={self._object_question['direction']}, "
+                f"reference={self._object_question['reference']}"
+            )
+
+        # ---- Generic MCQ options (for direction & object Qs too) ----
+        if not self._counting_options:
+            raw_opts = re.findall(
+                r'([A-D])\.\s*(.+?)\s*(?=[A-D]\.|$)',
+                _opts_text, flags=re.DOTALL,
+            )
+            self._mcq_options: Dict[str, str] = {
+                k: v.strip().rstrip(".") for k, v in raw_opts
+            }
+        else:
+            self._mcq_options = dict(self._counting_options)
+
+        self._direction_feedback_given = False
+        self._object_feedback_given = False
+        self._diag_count_feedback_given = False
+
     @classmethod
     def from_prompt(
         cls,
@@ -2158,6 +2219,338 @@ async def verify(self, step: str, token_index: int, event, event_info):
         # --- Check for boxed answer ---
         boxed_match = re.search(r'\\boxed\{[^}]+\}', recent_text)
         if boxed_match:
+
+            # ==========================================================
+            # Direction-question verification
+            # ==========================================================
+            if (
+                self._direction_question
+                and num_corrections < self.max_corrections
+                and not self._direction_feedback_given
+            ):
+                model_dir_text = parse_model_boxed_answer(
+                    recent_text, self._mcq_options
+                )
+                if model_dir_text:
+                    possible = get_possible_directions(
+                        self.z3_solver,
+                        self._direction_question["entity_a"],
+                        self._direction_question["entity_b"],
+                    )
+                    logger.info(
+                        f"[SpatialMap Phase 2b] Direction check: "
+                        f"model={model_dir_text}, possible={possible}"
+                    )
+                    if model_dir_text not in possible:
+                        self._direction_feedback_given = True
+                        # Find which MCQ options are consistent
+                        valid_options = [
+                            letter for letter, val in self._mcq_options.items()
+                            if val.strip().lower().rstrip(".") in possible
+                        ]
+                        if len(valid_options) == 1:
+                            # Force correct answer
+                            feedback = (
+                                f"\n\n[VERIFIER FEEDBACK: Direction error!\n"
+                                f"  '{model_dir_text.title()}' is "
+                                f"impossible for "
+                                f"{self._direction_question['entity_a']} "
+                                f"relative to "
+                                f"{self._direction_question['entity_b']} "
+                                f"based on the given constraints.\n"
+                                f"  The only consistent direction is "
+                                f"'{possible[0].title()}'.\n"
+                                f"  Please select option "
+                                f"{valid_options[0]}.]\n\n"
+                                f">>> STEP 3: ANSWER\n"
+                            )
+                        else:
+                            possible_str = ", ".join(
+                                d.title() for d in possible
+                            )
+                            feedback = (
+                                f"\n\n[VERIFIER FEEDBACK: Direction error!\n"
+                                f"  '{model_dir_text.title()}' is "
+                                f"impossible for "
+                                f"{self._direction_question['entity_a']} "
+                                f"relative to "
+                                f"{self._direction_question['entity_b']} "
+                                f"based on the given constraints.\n"
+                                f"  The possible directions are: "
+                                f"{possible_str}.\n"
+                                f"  Please reconsider and choose the "
+                                f"correct option.]\n\n"
+                                f">>> STEP 3: ANSWER\n"
+                            )
+                        if not event.is_set():
+                            event_info["generated_text"] = step
+                            event_info["feedback"] = feedback
+                            event_info["correction_index"] = token_index
+                            event_info["errors"] = [
+                                f"Direction '{model_dir_text}' impossible; "
+                                f"possible: {possible}"
+                            ]
+                            event_info["phase"] = "standard_verify"
+                            event.set()
+                        return step, feedback
+
+            # ==========================================================
+            # Object-question verification
+            # ==========================================================
+            if (
+                self._object_question
+                and num_corrections < self.max_corrections
+                and not self._object_feedback_given
+            ):
+                model_obj_text = parse_model_boxed_answer(
+                    recent_text, self._mcq_options
+                )
+                boxed_raw = re.findall(
+                    r'\\boxed\{([^}]*)\}', recent_text
+                )
+                model_letter = (
+                    boxed_raw[-1].strip().upper() if boxed_raw else None
+                )
+
+                if model_letter:
+                    consistent = get_consistent_object_options(
+                        self.z3_solver,
+                        self._object_question["direction"],
+                        self._object_question["reference"],
+                        self._mcq_options,
+                    )
+                    logger.info(
+                        f"[SpatialMap Phase 2b] Object check: "
+                        f"model={model_letter}, "
+                        f"consistent_options={consistent}"
+                    )
+                    if model_letter not in consistent:
+                        self._object_feedback_given = True
+                        odir = self._object_question["direction"]
+                        oref = self._object_question["reference"]
+                        if len(consistent) == 1:
+                            correct_name = self._mcq_options.get(
+                                consistent[0], consistent[0]
+                            )
+                            feedback = (
+                                f"\n\n[VERIFIER FEEDBACK: Object error!\n"
+                                f"  '{model_obj_text}' cannot be "
+                                f"{odir} of {oref} based on the "
+                                f"given constraints.\n"
+                                f"  The only consistent option is "
+                                f"{consistent[0]}. {correct_name}.\n"
+                                f"  Please select option "
+                                f"{consistent[0]}.]\n\n"
+                                f">>> STEP 3: ANSWER\n"
+                            )
+                        else:
+                            valid_names = [
+                                f"{l}. {self._mcq_options.get(l, l)}"
+                                for l in consistent
+                            ]
+                            feedback = (
+                                f"\n\n[VERIFIER FEEDBACK: Object error!\n"
+                                f"  '{model_obj_text}' cannot be "
+                                f"{odir} of {oref} based on the "
+                                f"given constraints.\n"
+                                f"  The consistent options are: "
+                                f"{', '.join(valid_names)}.\n"
+                                f"  Please reconsider and choose the "
+                                f"correct option.]\n\n"
+                                f">>> STEP 3: ANSWER\n"
+                            )
+                        if not event.is_set():
+                            event_info["generated_text"] = step
+                            event_info["feedback"] = feedback
+                            event_info["correction_index"] = token_index
+                            event_info["errors"] = [
+                                f"Object '{model_obj_text}' impossible "
+                                f"in {odir} of {oref}; "
+                                f"consistent: {consistent}"
+                            ]
+                            event_info["phase"] = "standard_verify"
+                            event.set()
+                        return step, feedback
+
+            # ==========================================================
+            # Counting-question verification (cardinal + diagonal)
+            # ==========================================================
+            if (
+                self._counting_question
+                and num_corrections < self.max_corrections
+            ):
+                direction = self._counting_question["direction"]
+                reference = self._counting_question["reference"]
+                is_cardinal = direction in (
+                    "north", "south", "east", "west"
+                )
+
+                if is_cardinal:
+                    # --- Cardinal: GT is always 0 ---
+                    model_count = parse_model_count_from_answer(
+                        recent_text, self._counting_options
+                    )
+                    z3_count = 0
+
+                    logger.info(
+                        f"[SpatialMap Phase 2b] Cardinal count check: "
+                        f"model={model_count}, expected={z3_count}, "
+                        f"direction={direction}, reference={reference}"
+                    )
+
+                    if (
+                        model_count is not None
+                        and model_count != z3_count
+                    ):
+                        if not self._count_feedback_given:
+                            self._count_feedback_given = True
+
+                            if direction in ("north", "south"):
+                                diag_examples = "northeast or northwest"
+                            elif direction == "west":
+                                diag_examples = "northwest or southwest"
+                            else:  # east
+                                diag_examples = "northeast or southeast"
+
+                            feedback = (
+                                f"\n\n[VERIFIER FEEDBACK: Count mismatch!\n"
+                                f"  You answered {model_count} objects "
+                                f"'{direction}' of {reference}, but the "
+                                f"correct count is {z3_count}.\n"
+                                f"  IMPORTANT: '{direction.title()}' means "
+                                f"STRICTLY and EXACTLY {direction} — it "
+                                f"does NOT include diagonal directions "
+                                f"like {diag_examples}.\n"
+                                f"  An object that is Northwest of "
+                                f"{reference} is NOT North of {reference}"
+                                f" and NOT West of {reference}.\n"
+                                f"  Since all given relationships in this "
+                                f"problem are diagonal (NE/NW/SE/SW), no "
+                                f"object can be strictly "
+                                f"'{direction.title()}' of {reference}.\n"
+                                f"  The correct count is {z3_count}. "
+                                f"Please select the option for 0.]\n\n"
+                                f">>> STEP 3: ANSWER\n"
+                            )
+                        else:
+                            correct_option = None
+                            for opt, val in self._counting_options.items():
+                                if val == "0":
+                                    correct_option = opt
+                                    break
+                            if correct_option:
+                                feedback = (
+                                    f"\nThe correct answer is 0. "
+                                    f"\\boxed{{{correct_option}}}"
+                                )
+                            else:
+                                feedback = (
+                                    f"\nThe correct answer is 0. "
+                                    f"\\boxed{{0}}"
+                                )
+
+                        logger.info(
+                            f"[SpatialMap Phase 2b] Cardinal count "
+                            f"mismatch: model={model_count}, "
+                            f"expected=0. Injecting feedback "
+                            f"(attempt={'1st' if not self._count_feedback_given else '2nd'})."
+                        )
+                        if not event.is_set():
+                            event_info["generated_text"] = step
+                            event_info["feedback"] = feedback
+                            event_info["correction_index"] = token_index
+                            event_info["errors"] = [
+                                f"Cardinal count mismatch: expected 0, "
+                                f"got {model_count}"
+                            ]
+                            event_info["phase"] = "standard_verify"
+                            event.set()
+                        return step, feedback
+
+                else:
+                    # --- Diagonal: use Z3 range check ---
+                    if not self._diag_count_feedback_given:
+                        model_count = parse_model_count_from_answer(
+                            recent_text, self._counting_options
+                        )
+                        count_range = get_possible_count_range(
+                            self.z3_solver, reference, direction
+                        )
+
+                        if (
+                            model_count is not None
+                            and count_range is not None
+                        ):
+                            min_c, max_c = count_range
+                            logger.info(
+                                f"[SpatialMap Phase 2b] Diagonal count "
+                                f"check: model={model_count}, "
+                                f"range=[{min_c}, {max_c}], "
+                                f"direction={direction}, "
+                                f"reference={reference}"
+                            )
+
+                            if not (min_c <= model_count <= max_c):
+                                self._diag_count_feedback_given = True
+                                # Find valid MCQ options
+                                valid_opts = []
+                                for opt, val in (
+                                    self._counting_options.items()
+                                ):
+                                    try:
+                                        v = int(val)
+                                        if min_c <= v <= max_c:
+                                            valid_opts.append(
+                                                (opt, v)
+                                            )
+                                    except (ValueError, TypeError):
+                                        pass
+
+                                if len(valid_opts) == 1:
+                                    feedback = (
+                                        f"\n\n[VERIFIER FEEDBACK: "
+                                        f"Count error!\n"
+                                        f"  {model_count} objects "
+                                        f"'{direction}' of {reference}"
+                                        f" is impossible.\n"
+                                        f"  The valid count is "
+                                        f"{valid_opts[0][1]}.\n"
+                                        f"  Please select option "
+                                        f"{valid_opts[0][0]}.]\n\n"
+                                        f">>> STEP 3: ANSWER\n"
+                                    )
+                                else:
+                                    feedback = (
+                                        f"\n\n[VERIFIER FEEDBACK: "
+                                        f"Count error!\n"
+                                        f"  {model_count} objects "
+                                        f"'{direction}' of {reference}"
+                                        f" is impossible.\n"
+                                        f"  The possible count range "
+                                        f"is [{min_c}, {max_c}].\n"
+                                        f"  Please reconsider and "
+                                        f"choose the correct "
+                                        f"option.]\n\n"
+                                        f">>> STEP 3: ANSWER\n"
+                                    )
+
+                                if not event.is_set():
+                                    event_info["generated_text"] = step
+                                    event_info["feedback"] = feedback
+                                    event_info["correction_index"] = (
+                                        token_index
+                                    )
+                                    event_info["errors"] = [
+                                        f"Diagonal count "
+                                        f"{model_count} outside "
+                                        f"range [{min_c}, {max_c}]"
+                                    ]
+                                    event_info["phase"] = (
+                                        "standard_verify"
+                                    )
+                                    event.set()
+                                return step, feedback
+
             logger.info(
                 f"[SpatialMap Phase 2b] Boxed answer found. Stopping."
             )
diff --git a/interwhen/utils/spatialmap_verifier.py b/interwhen/utils/spatialmap_verifier.py
index 7ea52da1..a89cfa06 100644
--- a/interwhen/utils/spatialmap_verifier.py
+++ b/interwhen/utils/spatialmap_verifier.py
@@ -176,6 +176,145 @@ def check_with_new_constraint(self, ir: Dict) -> bool:
     def is_satisfiable(self) -> bool:
         return self.solver.check() == sat
 
+    def count_objects_in_direction(
+        self, reference: str, direction: str
+    ) -> Optional[int]:
+        """
+        Count the entities that are provably in *direction* of *reference*.
+
+        Cardinal directions ("north", "south", "east", "west") are strict: the
+        entity would have to share one coordinate with *reference*.  Every
+        constraint in the SpatialMap dataset is diagonal (NE/NW/SE/SW), so no
+        two objects can share an x- or y-coordinate and the cardinal count is
+        reported as **0** without consulting the solver.
+
+        Diagonal directions are decided with Z3:
+        - "northeast" → higher x AND higher y
+        - "northwest" → lower x  AND higher y
+        - "southeast" → higher x AND lower y
+        - "southwest" → lower x  AND lower y
+        An entity is counted only when the current constraints force it into
+        that quadrant (asserting the negation is unsatisfiable).
+
+        Args:
+            reference: Name of the reference entity.  Without an exact match,
+                the first known entity whose name contains it
+                (case-insensitively) is used instead.
+            direction: Direction name; case and surrounding whitespace are
+                ignored.  Unrecognised directions yield a count of 0.
+
+        Returns:
+            The count, or ``None`` if the reference entity is not found.
+        """
+        direction = direction.lower().strip()
+
+        # Resolve the reference entity's variable names
+        if f"{reference}_x" not in self.entities:
+            # Fuzzy match against base names (key minus its "_x" suffix);
+            # dataset names may differ in case or surrounding words.
+            wanted = reference.lower()
+            for key in self.entities:
+                if key.endswith("_x") and wanted in key[:-2].lower():
+                    reference = key[:-2]
+                    break
+            else:
+                return None
+
+        ref_x = self.entities[f"{reference}_x"]
+        ref_y = self.entities[f"{reference}_y"]
+
+        # Since all given constraints are strictly diagonal, no pair of
+        # objects can share an x- or y-coordinate.  Cardinal directions
+        # require an exact match on one axis, which is impossible.
+        if direction in ("north", "south", "east", "west"):
+            return 0
+
+        # Diagonal direction -> (entity is east of ref, entity is north of ref)
+        quadrants = {
+            "northeast": (True, True),
+            "northwest": (False, True),
+            "southeast": (True, False),
+            "southwest": (False, False),
+        }
+        if direction not in quadrants:
+            return 0  # unrecognised direction: no entity can match
+        east, north = quadrants[direction]
+
+        # Collect all other entity names (unique base names)
+        others = {
+            key[:-2] for key in self.entities
+            if key.endswith("_x") and key[:-2] != reference
+        }
+
+        from z3 import Not, unsat  # imported once, not per loop iteration
+
+        count = 0
+        for ename in others:
+            e_x = self.entities[f"{ename}_x"]
+            e_y = self.entities[f"{ename}_y"]
+            constraint = And(
+                e_x > ref_x if east else e_x < ref_x,
+                e_y > ref_y if north else e_y < ref_y,
+            )
+
+            # The entity MUST be in that direction iff the negation is
+            # unsatisfiable; an inconclusive ``unknown`` proves nothing.
+            self.solver.push()
+            try:
+                self.solver.add(Not(constraint))
+                if self.solver.check() == unsat:
+                    count += 1
+            finally:
+                self.solver.pop()  # restore the solver even if Z3 raises
+
+        return count
+
+
+def parse_counting_question(problem_text: str) -> Optional[Dict]:
+    """
+    Detect a counting question of the form "How many objects are in the
+    <direction> of <reference>?" (matched case-insensitively).
+
+    Returns ``{"direction": ..., "reference": ...}`` with the direction
+    lower-cased and the reference trimmed of whitespace and trailing
+    periods, or ``None`` when the text holds no such question.
+    """
+    found = re.search(
+        r'How many objects are in the (\w+) of ([^?]+?)\?',
+        problem_text, re.IGNORECASE,
+    )
+    if found is None:
+        return None
+    raw_direction, raw_reference = found.groups()
+    direction = raw_direction.strip().lower()
+    return {"direction": direction, "reference": raw_reference.strip().rstrip(".")}
+
+
+def parse_model_count_from_answer(text_after_think: str, options: Optional[dict] = None) -> Optional[int]:
+    """
+    Extract the numeric count the model chose from its ``\\boxed{}`` answer.
+
+    The last ``\\boxed{...}`` in the text is used.  A letter (matched
+    case-insensitively, trailing period ignored) is mapped through
+    *options* to its numeric value; otherwise the boxed content itself is
+    parsed as an integer.  Returns ``None`` when nothing numeric results.
+    """
+    boxed = re.findall(r'\\boxed\{([^}]*)\}', text_after_think)
+    if not boxed:
+        return None
+    answer = boxed[-1].strip()
+
+    # Resolve letter → value; models also emit "a" or "A." for option A.
+    letter = answer.rstrip(".").strip().upper()
+    chosen = answer
+    if options:
+        chosen = options.get(answer, options.get(letter, answer))
+
+    try:
+        return int(chosen)
+    except (ValueError, TypeError):
+        return None
+
 
 def parse_directional_claims_from_text(text: str) -> List[Dict]:
     """
@@ -311,10 +450,280 @@ def format_spatialmap_feedback(errors: List[str], claim: Optional[Dict] = None)
     return feedback
 
 
+# ---------------------------------------------------------------------------
+#  Direction-question helpers
+# ---------------------------------------------------------------------------
+
+def parse_direction_question(problem_text: str) -> Optional[Dict]:
+    """
+    If the problem asks a *direction* question
+    ("In which direction is X relative to Y?"),
+    return ``{"entity_a": X, "entity_b": Y}``.
+
+    Returns ``None`` for non-direction questions.
+    """
+    m = re.search(
+        r'In which direction is (.+?) relative to (.+?)\?',
+        problem_text,
+        re.IGNORECASE,
+    )
+    if not m:
+        return None
+    return {
+        "entity_a": m.group(1).strip(),
+        "entity_b": m.group(2).strip(),
+    }
+
+
+def parse_object_question(problem_text: str) -> Optional[Dict]:
+    """
+    If the problem asks an *object* question
+    ("Which object is in the [direction] of [entity]?"),
+    return ``{"direction": ..., "reference": ...}``.
+
+    Returns ``None`` for non-object questions.
+    """
+    m = re.search(
+        r'Which object is (?:located )?(?:to the |in the )'
+        r'(northeast|northwest|southeast|southwest|north|south|east|west)'
+        r' of (.+?)\?',
+        problem_text,
+        re.IGNORECASE,
+    )
+    if not m:
+        return None
+    return {
+        "direction": m.group(1).strip().lower(),
+        "reference": m.group(2).strip().rstrip("."),
+    }
+
+
+def parse_model_boxed_answer(
+    text_after_think: str, options: Dict[str, str]
+) -> Optional[str]:
+    """
+    Extract the text value the model chose from its ``\\boxed{}`` answer.
+    Maps letter → option text using *options* dict.
+    Returns the raw option text (lowercase stripped) or None.
+    """
+    boxed = re.findall(r'\\boxed\{([^}]*)\}', text_after_think)
+    if not boxed:
+        return None
+    answer = boxed[-1].strip().upper()
+    if answer in options:
+        return options[answer].strip().lower().rstrip(".")
+    # Try the raw value
+    return answer.lower()
+
+
+def get_possible_directions(
+    solver: SpatialMapZ3Solver,
+    entity_a: str,
+    entity_b: str,
+) -> List[str]:
+    """
+    Return the list of diagonal directions (NE/NW/SE/SW) that are
+    *satisfiable* for entity_a relative to entity_b under the current
+    constraints.
+
+    ``entity_a`` and ``entity_b`` are matched fuzzily against solver
+    entity names.
+    """
+    from z3 import And as Z3And, sat as z3sat
+
+    def _find(name):
+        nl = name.lower()
+        for k in solver.entities:
+            if k.endswith('_x') and k[:-2].lower() == nl:
+                return k[:-2]
+        for k in solver.entities:
+            if k.endswith('_x') and (nl in k[:-2].lower() or k[:-2].lower() in nl):
+                return k[:-2]
+        return None
+
+    ba = _find(entity_a)
+    bb = _find(entity_b)
+    if not ba or not bb:
+        return ['northeast', 'northwest', 'southeast', 'southwest']
+
+    ax = solver.entities[f'{ba}_x']
+    ay = solver.entities[f'{ba}_y']
+    bx = solver.entities[f'{bb}_x']
+    by = solver.entities[f'{bb}_y']
+
+    dir_constraints = {
+        'northeast': Z3And(ax > bx, ay > by),
+        'northwest': Z3And(ax < bx, ay > by),
+        'southeast': Z3And(ax > bx, ay < by),
+        'southwest': Z3And(ax < bx, ay < by),
+    }
+
+    possible = []
+    for dname, dc in dir_constraints.items():
+        solver.solver.push()
+        solver.solver.add(dc)
+        if solver.solver.check() == z3sat:
+            possible.append(dname)
+        solver.solver.pop()
+
+    return possible if possible else ['northeast', 'northwest', 'southeast', 'southwest']
+
+
+def get_consistent_object_options(
+    solver: SpatialMapZ3Solver,
+    direction: str,
+    reference: str,
+    options: Dict[str, str],
+) -> List[str]:
+    """
+    For an *object* question, return the list of MCQ letters whose entity
+    *could* be in ``direction`` of ``reference`` (Z3-satisfiable).
+
+    Letters whose entities cannot be found in the solver are kept as
+    "possible" (benefit of the doubt).
+    """
+    from z3 import And as Z3And, sat as z3sat
+
+    def _find(name):
+        nl = name.lower()
+        for k in solver.entities:
+            if k.endswith('_x') and k[:-2].lower() == nl:
+                return k[:-2]
+        for k in solver.entities:
+            if k.endswith('_x') and (nl in k[:-2].lower() or k[:-2].lower() in nl):
+                return k[:-2]
+        return None
+
+    ref_base = _find(reference)
+    if not ref_base:
+        return list(options.keys())  # can't check, keep all
+
+    rx = solver.entities[f'{ref_base}_x']
+    ry = solver.entities[f'{ref_base}_y']
+
+    dfunc = {
+        'northeast': lambda ox, oy: Z3And(ox > rx, oy > ry),
+        'northwest': lambda ox, oy: Z3And(ox < rx, oy > ry),
+        'southeast': lambda ox, oy: Z3And(ox > rx, oy < ry),
+        'southwest': lambda ox, oy: Z3And(ox < rx, oy < ry),
+    }.get(direction.lower())
+    if not dfunc:
+        return list(options.keys())
+
+    consistent = []
+    for letter, opt_name in options.items():
+        opt_base = _find(opt_name.strip().rstrip('.'))
+        if not opt_base:
+            consistent.append(letter)  # can't verify, assume possible
+            continue
+        ox = solver.entities[f'{opt_base}_x']
+        oy = solver.entities[f'{opt_base}_y']
+        solver.solver.push()
+        solver.solver.add(dfunc(ox, oy))
+        if solver.solver.check() == z3sat:
+            consistent.append(letter)
+        solver.solver.pop()
+
+    return consistent
+
+
+def get_possible_count_range(
+    solver: SpatialMapZ3Solver,
+    reference: str,
+    direction: str,
+) -> Optional[Tuple[int, int]]:
+    """
+    Compute the *[min, max]* range of how many entities could be in
+    ``direction`` of ``reference`` across all satisfying assignments.
+
+    Uses Z3 must-be / can-be checks per entity:
+    - *must_be*:  negation is UNSAT → entity is ALWAYS in that direction
+    - *can_be*:   adding constraint is SAT → entity COULD be there
+
+    min = count(must_be), max = count(must_be) + count(maybe)
+
+    Returns ``None`` if the reference entity cannot be found.
+    """
+    from z3 import And as Z3And, Not as Z3Not, sat as z3sat
+
+    direction = direction.lower().strip()
+    if direction in ('north', 'south', 'east', 'west'):
+        return (0, 0)  # cardinal → always 0 with diagonal-only constraints
+
+    def _find(name):
+        nl = name.lower()
+        for k in solver.entities:
+            if k.endswith('_x') and k[:-2].lower() == nl:
+                return k[:-2]
+        for k in solver.entities:
+            if k.endswith('_x') and (nl in k[:-2].lower() or k[:-2].lower() in nl):
+                return k[:-2]
+        return None
+
+    ref_base = _find(reference)
+    if not ref_base:
+        return None
+
+    rx = solver.entities[f'{ref_base}_x']
+    ry = solver.entities[f'{ref_base}_y']
+
+    others = [
+        k[:-2] for k in solver.entities
+        if k.endswith('_x') and k[:-2] != ref_base
+    ]
+
+    dfunc = {
+        'northeast': lambda ox, oy: Z3And(ox > rx, oy > ry),
+        'northwest': lambda ox, oy: Z3And(ox < rx, oy > ry),
+        'southeast': lambda ox, oy: Z3And(ox > rx, oy < ry),
+        'southwest': lambda ox, oy: Z3And(ox < rx, oy < ry),
+    }.get(direction)
+    if not dfunc:
+        return None
+
+    must_count = 0
+    maybe_count = 0
+
+    for ename in others:
+        ex = solver.entities[f'{ename}_x']
+        ey = solver.entities[f'{ename}_y']
+        c = dfunc(ex, ey)
+
+        # Can it be in that direction?
+        solver.solver.push()
+        solver.solver.add(c)
+        can_be = solver.solver.check() == z3sat
+        solver.solver.pop()
+
+        if not can_be:
+            continue
+
+        # Must it be?
+        solver.solver.push()
+        solver.solver.add(Z3Not(c))
+        must_be = solver.solver.check() != z3sat
+        solver.solver.pop()
+
+        if must_be:
+            must_count += 1
+        else:
+            maybe_count += 1
+
+    return (must_count, must_count + maybe_count)
+
+
 # Export
 __all__ = [
     'SpatialMapZ3Solver',
     'parse_directional_claims_from_text',
+    'parse_counting_question',
+    'parse_model_count_from_answer',
+    'parse_direction_question',
+    'parse_object_question',
+    'parse_model_boxed_answer',
+    'get_possible_directions',
+    'get_consistent_object_options',
+    'get_possible_count_range',
     'extract_step2_claims',
     'verify_spatialmap_step',
     'format_spatialmap_feedback',

From e29d1070a5833a98da5538559721df2421de22e1 Mon Sep 17 00:00:00 2001
From: root
 <root@hp-reasoning-t-vishakbhat-interwhen-8gpus-trainer-0.hp-reasoning-t-vishakbhat-interwhen-8gpus.bonete11.svc.cluster.local>
Date: Thu, 26 Feb 2026 11:42:52 +0000
Subject: [PATCH 07/17] added step verifier for all

---
 examples/EarlyStopping/maze_example.py        |   3 +
 examples/EarlyStopping/spatialmap_example.py  |   3 +
 .../game24_stepverifier.py                    |   8 +-
 examples/TTSwithVerification/game24meta.py    | 472 ++++++++++++++++++
 .../TTSwithVerification/maze_stepverifier.py  |   8 +-
 examples/TTSwithVerification/mazemeta.py      | 431 ++++++++++++++++
 .../spatialmap_stepverifier.py                |   8 +-
 interwhen/monitors/thinkingPhaseVerifier.py   |  55 +-
 interwhen/utils/maze_verifier.py              |  28 +-
 interwhen/utils/spatialmap_verifier.py        |  21 +-
 10 files changed, 1002 insertions(+), 35 deletions(-)
 create mode 100644 examples/TTSwithVerification/game24meta.py
 create mode 100644 examples/TTSwithVerification/mazemeta.py

diff --git a/examples/EarlyStopping/maze_example.py b/examples/EarlyStopping/maze_example.py
index 74c04653..175624dd 100644
--- a/examples/EarlyStopping/maze_example.py
+++ b/examples/EarlyStopping/maze_example.py
@@ -102,6 +102,9 @@ def build_prompt_from_example(example): #(original prompt config)
 
 
 def extract_solution(text):
+    # Strip injected <format>...</format> template blocks so we don't
+    # accidentally match the placeholder \boxed{LETTER} from the template.
+    text = re.sub(r'<format>.*?</format>', '', text, flags=re.DOTALL)
     matches = re.findall(r"\\boxed\{([^}]*)\}", text)
     if not matches:
         return None
diff --git a/examples/EarlyStopping/spatialmap_example.py b/examples/EarlyStopping/spatialmap_example.py
index c3925c30..3a0604c2 100644
--- a/examples/EarlyStopping/spatialmap_example.py
+++ b/examples/EarlyStopping/spatialmap_example.py
@@ -100,6 +100,9 @@ def build_prompt_from_example(example):
     return pre_prompt , description
 
 def extract_solution(text):
+    # Strip injected <format>...</format> template blocks so we don't
+    # accidentally match the placeholder \boxed{LETTER} from the template.
+    text = re.sub(r'<format>.*?</format>', '', text, flags=re.DOTALL)
     matches = re.findall(r"\\boxed\{([^}]*)\}", text)
     if not matches:
         return None
diff --git a/examples/TTSwithVerification/game24_stepverifier.py b/examples/TTSwithVerification/game24_stepverifier.py
index 94653f5c..383f15f7 100644
--- a/examples/TTSwithVerification/game24_stepverifier.py
+++ b/examples/TTSwithVerification/game24_stepverifier.py
@@ -14,7 +14,7 @@
 
 # ============== MODEL CONFIGURATION ==============
 # Change these model names to scale experiments easily
-MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
+MAIN_MODEL = "Qwen/QwQ-32B"
 EARLYSTOP_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 # =================================================
 
@@ -147,6 +147,10 @@ def extract_solution(text):
     for latex, op in replacements.items():
         expr = expr.replace(latex, op)
 
+    # 2b. Replace Unicode math operators (QwQ frequently uses these)
+    expr = expr.replace('\u00d7', '*').replace('\u00f7', '/').replace('\u2212', '-')
+    expr = expr.replace('\u2013', '-').replace('\u2014', '-')  # en-dash, em-dash
+
     # 3. Cleanup (remove LaTeX spacing)
     expr = expr.replace(r"\,", "").replace(r"\ ", "")
 
@@ -249,7 +253,7 @@ def evaluate_game24_answer(answer, nums):
 
     dataset = load_game24_dataset()
 
-    llm_server = init_llm_server(main_model, max_tokens=32768)
+    llm_server = init_llm_server(main_model, max_tokens=22000)
 
     # Load tokenizer for accurate token counting
     logger.info(f"Loading tokenizer for {main_model}...")
diff --git a/examples/TTSwithVerification/game24meta.py b/examples/TTSwithVerification/game24meta.py
new file mode 100644
index 00000000..9b799be4
--- /dev/null
+++ b/examples/TTSwithVerification/game24meta.py
@@ -0,0 +1,472 @@
+import argparse
+import asyncio
+import csv
+import json
+import logging
+import os
+import re
+import numpy as np
+
+from datasets import load_dataset
+from openai import OpenAI
+from transformers import AutoTokenizer
+
+from interwhen import stream_completion
+from interwhen.monitors import KstableAnswerGame24Monitor, StepVerifierGame24Monitor
+
+# ============== MODEL CONFIGURATION ==============
+# Change these model names to scale experiments easily
+MAIN_MODEL = "Qwen/QwQ-32B"
+EARLYSTOP_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+# =================================================
+
+def get_model_short_name(model_name: str) -> str:
+    """Extract a short, filesystem-safe name from the model path."""
+    short_name = model_name.split("/")[-1]
+    short_name = short_name.replace(" ", "_").replace(":", "-")
+    return short_name
+
+def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/Gameof24_results/metaPrompt"):
+    """Create (if missing) and return the per-model output dirs; default base matches the log/CSV helpers."""
+    model_short_name = get_model_short_name(main_model)
+    output_base = os.path.join(base_dir, model_short_name)
+    
+    dirs = {
+        "base": output_base,
+        "reasoning": os.path.join(output_base, "Reasoning_output"),
+        "csv_saved": os.path.join(output_base, "csv_saved"),
+    }
+    
+    # Create all directories
+    for dir_path in dirs.values():
+        os.makedirs(dir_path, exist_ok=True)
+    
+    return dirs
+
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS/Gameof24_results/metaPrompt") -> str:
+    """Generate log filename based on model name."""
+    model_short_name = get_model_short_name(main_model)
+    output_base = os.path.join(base_dir, model_short_name)
+    os.makedirs(output_base, exist_ok=True)
+    return os.path.join(output_base, f"EAT_{num_examples}examples.log")
+
+def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS/Gameof24_results/metaPrompt") -> str:
+    """Generate token CSV filename based on model name."""
+    model_short_name = get_model_short_name(main_model)
+    output_base = os.path.join(base_dir, model_short_name)
+    os.makedirs(output_base, exist_ok=True)
+    return os.path.join(output_base, f"EAT_{num_examples}examples.csv")
+
+def save_prompt(idx, prompt_with_answer, reason_dir):
+    filename = os.path.join(reason_dir, f"reason_{idx}.txt")
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write(prompt_with_answer)
+
+logger = logging.getLogger(__name__)
+
+
+def load_game24_dataset():
+    ds = load_dataset("nlile/24-game", split="train")
+    return ds
+
+def init_llm_server(modelname, max_tokens=200, port=8000):
+    url = f"http://localhost:{port}/v1/completions"
+    payload = {
+        "model": modelname,
+        "max_tokens": max_tokens,
+        "top_k": 20,
+        "top_p": 0.95,
+        "min_p": 0.0,
+        "temperature": 0.6,
+        "stream": True,
+        "logprobs": 20,
+        "use_beam_search": False,
+        "prompt_cache": True,
+        "seed" : 42
+    }
+    headers = {"Content-Type": "application/json"}
+    return {"url": url, "payload": payload, "headers": headers}
+
+
+def build_meta_prompt_from_example(nums):
+    """Build the system and user prompts for Game of 24 with step verification format."""
+    a, b, c, d = nums
+    
+    system_prompt = r"""You are solving the Game of 24.
+
+GAME RULES:
+- You are given four numbers
+- Use ALL four numbers exactly once
+- Use ONLY the operations: +, -, *, /
+- The final expression must evaluate to exactly 24
+
+OUTPUT FORMAT:
+You must follow this EXACT structured format for your solution:
+
+>Step1
+available numbers: [a, b, c, d]
+suggested operation: a * b = result1
+remaining numbers: [result1, c, d]
+
+>Step2
+available numbers: [result1, c, d]
+suggested operation: result1 + c = result2
+remaining numbers: [result2, d]
+
+>Step3
+available numbers: [result2, d]
+suggested operation: result2 - d = 24
+remaining numbers: [24]
+
+> Final expression: \boxed{expression using original numbers}
+
+IMPORTANT RULES:
+1. Each step MUST show the available numbers at the start
+2. Each step MUST show the suggested operation with its result
+3. Each step MUST show the remaining numbers after the operation
+4. Continue until you reach exactly 24
+5. The final expression inside \boxed{} must use the ORIGINAL numbers
+6. If you receive VERIFIER FEEDBACK, immediately provide a corrected step - do NOT restart your thinking
+
+═══════════════════════════════════════════════════════════════════════════════
+EXAMPLE 1: Numbers [2, 3, 4, 5]
+═══════════════════════════════════════════════════════════════════════════════
+
+### Final Answer
+
+>Step1
+available numbers: [2, 3, 4, 5]
+suggested operation: 5 + 3 = 8
+remaining numbers: [8, 2, 4]
+
+>Step2
+available numbers: [8, 2, 4]
+suggested operation: 8 - 2 = 6
+remaining numbers: [6, 4]
+
+>Step3
+available numbers: [6, 4]
+suggested operation: 6 * 4 = 24
+remaining numbers: [24]
+
+> Final expression: \boxed{(5 + 3 - 2) * 4}
+
+═══════════════════════════════════════════════════════════════════════════════
+EXAMPLE 2: Numbers [1, 5, 5, 5]
+═══════════════════════════════════════════════════════════════════════════════
+
+### Final Answer
+
+>Step1
+available numbers: [1, 5, 5, 5]
+suggested operation: 1 / 5 = 0.2
+remaining numbers: [0.2, 5, 5]
+
+>Step2
+available numbers: [0.2, 5, 5]
+suggested operation: 5 - 0.2 = 4.8
+remaining numbers: [4.8, 5]
+
+>Step3
+available numbers: [4.8, 5]
+suggested operation: 4.8 * 5 = 24
+remaining numbers: [24]
+
+> Final expression: \boxed{(5 - 1/5) * 5}
+
+═══════════════════════════════════════════════════════════════════════════════
+EXAMPLE 3: Handling Verifier Feedback - Numbers [1, 2, 6, 8]
+═══════════════════════════════════════════════════════════════════════════════
+
+### Final Answer
+
+>Step1
+available numbers: [1, 2, 6, 8]
+suggested operation: 8 / 2 = 4
+remaining numbers: [4, 1, 6]
+
+>Step2
+available numbers: [4, 1, 6]
+suggested operation: 4 - 1 = 3
+remaining numbers: [3, 6]
+
+[VERIFIER FEEDBACK for Step 2:
+  ✗ Cannot reach 24 from remaining numbers [3, 6]. This path is a dead end.
+The previous steps are correct. Please provide a corrected Step 2 and continue.]
+
+>Step2
+available numbers: [4, 1, 6]
+suggested operation: 6 - 1 = 5
+remaining numbers: [5, 4]
+
+[VERIFIER FEEDBACK for Step 2:
+  ✗ Cannot reach 24 from remaining numbers [4, 5]. This path is a dead end.
+The previous steps are correct. Please provide a corrected Step 2 and continue.]
+
+>Step2
+available numbers: [4, 1, 6]
+suggested operation: 6 * 1 = 6
+remaining numbers: [6, 4]
+
+>Step3
+available numbers: [6, 4]
+suggested operation: 6 * 4 = 24
+remaining numbers: [24]
+
+> Final expression: \boxed{(8 / 2) * 6 * 1}
+
+═══════════════════════════════════════════════════════════════════════════════
+
+Now solve the following Game of 24 problem using the EXACT same format."""
+
+    user_prompt = f"""
+Numbers: {a}, {b}, {c}, {d}
+
+Find an arithmetic expression using these four numbers exactly once each with +, -, *, / that equals 24.
+
+Use the structured step-by-step format shown in the examples above."""
+
+    # Combine into a single prompt
+    full_prompt = f"{system_prompt}\n\n{user_prompt}"
+    
+    return full_prompt
+
+
+def count_tokens(text: str, tokenizer) -> int:
+    """Count the total number of tokens in the generated text using the tokenizer."""
+    tokens = tokenizer.encode(text, add_special_tokens=False)
+    return len(tokens)
+
+
+def extract_solution(text):
+    r"""Return the last \boxed{...} payload rewritten as plain Python arithmetic, or None if no \boxed{ exists."""
+    # Use a more robust extraction that handles nested braces in \boxed{}
+    # Find \boxed{ and then match braces properly
+    boxed_pattern = r"\\boxed\{"
+    matches = list(re.finditer(boxed_pattern, text))
+    if not matches:
+        return None
+    
+    # Get the last \boxed{} content by matching braces
+    last_match = matches[-1]
+    start = last_match.end()  # Position right after \boxed{
+    brace_count = 1
+    end = start
+    while end < len(text) and brace_count > 0:
+        if text[end] == '{':
+            brace_count += 1
+        elif text[end] == '}':
+            brace_count -= 1
+        end += 1
+    
+    # Drop the closing brace only if one was found; an unclosed \boxed{ (truncated output) keeps its last char.
+    expr = text[start:end - 1 if brace_count == 0 else end].strip()
+
+    # 1. Convert \frac{a}{b} to (a/b)
+    frac_pattern = r"\\frac\{([^{}]+)\}\{([^{}]+)\}"
+    while re.search(frac_pattern, expr):
+        expr = re.sub(frac_pattern, r"(\1/\2)", expr)
+
+    # 2. Replace LaTeX operators
+    replacements = {
+        r"\times": "*",
+        r"\cdot": "*",
+        r"\div": "/",
+    }
+    for latex, op in replacements.items():
+        expr = expr.replace(latex, op)
+
+    # 2b. Replace Unicode math operators (QwQ frequently uses these)
+    expr = expr.replace('\u00d7', '*').replace('\u00f7', '/').replace('\u2212', '-')
+    expr = expr.replace('\u2013', '-').replace('\u2014', '-')  # en-dash, em-dash
+
+    # 3. Cleanup (remove LaTeX spacing)
+    expr = expr.replace(r"\,", "").replace(r"\ ", "")
+
+    # 4. Handle implicit multiplication (e.g., "(11+1)(1+1)" -> "(11+1)*(1+1)")
+    # Insert * between: )( , )number, number(, )(
+    expr = re.sub(r'\)\s*\(', ')*(', expr)  # )( -> )*(
+    expr = re.sub(r'\)\s*(\d)', r')*\1', expr)  # )number -> )*number
+    expr = re.sub(r'(\d)\s*\(', r'\1*(', expr)  # number( -> number*(
+
+    return expr
+
+def extract_numbers_from_expr(expr):
+    """Extract all numbers (including decimals) from an expression."""
+    # Match integers and decimals
+    numbers = re.findall(r'\d+\.?\d*', expr)
+    return [int(float(n)) if float(n).is_integer() else float(n) for n in numbers]
+
+def validate_numbers_used(expr, expected_nums):
+    """Check if the expression uses exactly the given numbers (each exactly once)."""
+    used_nums = extract_numbers_from_expr(expr)
+    # Sort both lists to compare
+    return sorted(used_nums) == sorted(expected_nums)
+
+def evaluate_expression(expr, expected_nums=None):
+    """Return True iff *expr* uses only + - * / on numbers (exactly *expected_nums*, if given) and equals 24."""
+    try:
+        if not re.fullmatch(r"[\d\s.()+\-*/]+", expr) or "**" in expr or "//" in expr:  # untrusted model output
+            return False
+        if expected_nums is not None and not validate_numbers_used(expr, expected_nums):
+            return False
+        value = eval(expr, {"__builtins__": None}, {})  # only whitelisted arithmetic characters reach eval
+        return abs(value - 24) < 1e-6
+    except Exception:
+        return False
+
+def evaluate_game24_answer(answer, nums):
+    """
+    Evaluate a Game24 answer and return (is_correct, expr, error_message).
+    
+    Args:
+        answer: Raw model output
+        nums: Expected numbers to use
+        
+    Returns:
+        Tuple of (is_correct, extracted_expression, error_message)
+    """
+    expr = extract_solution(answer)
+    
+    if not expr:
+        return False, None, "No expression found"
+    
+    if evaluate_expression(expr, expected_nums=nums):
+        return True, expr, "Correct solution (evaluates to 24 using exactly the given numbers)"
+    else:
+        used_nums = extract_numbers_from_expr(expr)
+        if sorted(used_nums) != sorted(nums):
+            return False, expr, f"Incorrect: Expression uses {used_nums}, expected {nums}"
+        else:
+            return False, expr, "Expression does not evaluate to 24"
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description="Game of 24 step-by-step solver with monitors")
+    parser.add_argument("--thinking", "-t", action="store_true", help="Enable chain-of-thought output")
+    parser.add_argument("--monitor", "-m", default = False, action="store_true", help="Enable step-by-step monitor")
+    parser.add_argument("--num_examples", "-n", type=int, default=1, help="Number of examples to run")
+    parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logs")
+    parser.add_argument("--main_model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
+    parser.add_argument("--earlystop_model", type=str, default=EARLYSTOP_MODEL, help="Model to use for early stopping")
+    args = parser.parse_args()
+
+    # Use models from args (allows command-line override)
+    main_model = args.main_model
+    earlystop_model = args.earlystop_model
+
+    # Setup output directories based on model name
+    output_dirs = get_output_dirs(main_model)
+    logfile = get_log_filename(main_model, args.num_examples)
+    token_filename = get_token_filename(main_model, args.num_examples)
+    reason_dir = output_dirs["reasoning"]
+
+    log_level = logging.DEBUG if args.debug else logging.INFO
+
+    logging.basicConfig(
+        level=log_level,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        handlers=[
+            logging.FileHandler(logfile, mode="w"),  
+            logging.StreamHandler()
+        ],
+        force=True,
+    )
+
+    logger.info(f"Main model: {main_model}")
+    logger.info(f"Early stop model: {earlystop_model}")
+    logger.info(f"Output directory: {output_dirs['base']}")
+
+    dataset = load_game24_dataset()
+
+    llm_server = init_llm_server(main_model, max_tokens=22000)
+
+    # Load tokenizer for accurate token counting
+    logger.info(f"Loading tokenizer for {main_model}...")
+    tokenizer = AutoTokenizer.from_pretrained(main_model, trust_remote_code=True)
+    logger.info("Tokenizer loaded successfully.")
+
+    num_correct = 0
+    N = args.num_examples
+    total_reasoning_tokens = 0
+    reasoning_token_counts = []
+
+    # total = len(dataset)
+    indices = np.linspace(0, len(dataset)-1, N, dtype=int)
+
+    for idx in indices: #for idx in indices:
+        example = dataset[idx]
+        nums = example["numbers"]
+
+        prompt = build_meta_prompt_from_example(nums)
+
+        if args.monitor:
+            # Use StepVerifierGame24Monitor to detect when equation stabilizes k times
+            monitors=(StepVerifierGame24Monitor(
+                name="game24_kstable",
+                answer_start_token = "</think>",
+                original_numbers=nums,  # Validate equations use exactly these numbers
+            ),)
+        else:
+            monitors = ()
+
+        logger.info(f"---- length of monitors {len(monitors)} ----")
+        logger.info(f"---- Example {idx+1} ----")
+        logger.info(f"Numbers: {nums}")
+
+        answer = asyncio.run(stream_completion(
+            f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n",
+            llm_server=llm_server,
+            monitors=monitors,
+            add_delay=False,
+            termination_requires_validation=False,
+            async_execution=True
+        ))
+
+        save_prompt(idx, answer, reason_dir)
+        logger.info(f"Raw final output:\n{answer}")
+
+        reasoning_tokens = count_tokens(answer, tokenizer)
+        reasoning_token_counts.append(reasoning_tokens)
+        total_reasoning_tokens += reasoning_tokens
+        logger.info(f"Generated tokens in this example: {reasoning_tokens}")
+
+        is_correct, expr, message = evaluate_game24_answer(answer, nums)
+        
+        if expr:
+            logger.info(f"Extracted expression: {expr}")
+        logger.info(message)
+        
+        if is_correct:
+            num_correct += 1
+
+    # Calculate final statistics
+    avg_reasoning_tokens = total_reasoning_tokens / N if N > 0 else 0
+    accuracy = num_correct / N if N > 0 else 0
+    
+    print(f"\nFinal Accuracy: {num_correct}/{N} ({accuracy:.2%})")
+    print(f"Average Reasoning Tokens: {avg_reasoning_tokens:.2f}")
+    print(f"Total Reasoning Tokens: {total_reasoning_tokens}")
+    
+    # Save results to a text file
+    results_file = logfile.replace('.log', '_results.txt')
+    with open(results_file, 'w') as f:
+        f.write(f"Game of 24 Evaluation Results\n")
+        f.write(f"{'='*50}\n\n")
+        f.write(f"Model: {main_model}\n")
+        f.write(f"Number of Examples: {N}\n")
+        f.write(f"Monitor Enabled: {args.monitor}\n\n")
+        f.write(f"Results:\n")
+        f.write(f"---------\n")
+        f.write(f"Correct: {num_correct}/{N}\n")
+        f.write(f"Accuracy: {accuracy:.2%}\n\n")
+        f.write(f"Reasoning Token Statistics:\n")
+        f.write(f"---------------------------\n")
+        f.write(f"Total Reasoning Tokens: {total_reasoning_tokens}\n")
+        f.write(f"Average Reasoning Tokens: {avg_reasoning_tokens:.2f}\n")
+        if reasoning_token_counts:
+            f.write(f"Min Reasoning Tokens: {min(reasoning_token_counts)}\n")
+            f.write(f"Max Reasoning Tokens: {max(reasoning_token_counts)}\n")
+            f.write(f"Std Dev: {np.std(reasoning_token_counts):.2f}\n")
+    
+    logger.info(f"Results saved to {results_file}")
+    print(f"Results saved to {results_file}")
\ No newline at end of file
diff --git a/examples/TTSwithVerification/maze_stepverifier.py b/examples/TTSwithVerification/maze_stepverifier.py
index 5adb4202..4c2ca456 100644
--- a/examples/TTSwithVerification/maze_stepverifier.py
+++ b/examples/TTSwithVerification/maze_stepverifier.py
@@ -28,7 +28,7 @@
 logger = logging.getLogger(__name__)
 
 # ============== MODEL CONFIGURATION ==============
-MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
+MAIN_MODEL = "Qwen/QwQ-32B"
 # =================================================
 
 
@@ -75,6 +75,10 @@ def extract_solution(text: str) -> str:
     else:
         answer_section = text
     
+    # Strip injected <format>...</format> template blocks so we don't
+    # accidentally match the placeholder \boxed{LETTER} from the template.
+    answer_section = re.sub(r'<format>.*?</format>', '', answer_section, flags=re.DOTALL)
+    
     matches = re.findall(r'\\boxed\{([^}]*)\}', answer_section)
     if matches:
         return matches[-1].strip()
@@ -108,7 +112,7 @@ def count_tokens(text: str, tokenizer) -> int:
 #         return "relative_position"
 
 
-def init_llm_server(model_name, max_tokens=32768, port=8000):
+def init_llm_server(model_name, max_tokens=22000, port=8000):
     """Initialize LLM server configuration."""
     url = f"http://localhost:{port}/v1/completions"
     payload = {
diff --git a/examples/TTSwithVerification/mazemeta.py b/examples/TTSwithVerification/mazemeta.py
new file mode 100644
index 00000000..403115d7
--- /dev/null
+++ b/examples/TTSwithVerification/mazemeta.py
@@ -0,0 +1,431 @@
+"""
+Maze experiment with step-by-step verification using StepVerifierMazeMonitor.
+
+Uses the new monitor-based architecture that integrates with stream_completion.
+"""
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+import re
+import numpy as np
+from pathlib import Path
+
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+from interwhen import stream_completion
+from interwhen.monitors import StepVerifierMazeMonitor
+from interwhen.utils.maze_verifier import parse_maze_from_prompt
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+logger = logging.getLogger(__name__)
+
+# ============== MODEL CONFIGURATION ==============
+MAIN_MODEL = "Qwen/QwQ-32B"
+# =================================================
+
+
+def get_model_short_name(model_name: str) -> str:
+    """Extract a short, filesystem-safe name from the model path."""
+    short_name = model_name.split("/")[-1]
+    short_name = short_name.replace(" ", "_").replace(":", "-")
+    return short_name
+
+
+def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/MazeResults/metaPrompt"):
+    """Create and return output directory paths based on model name."""
+    model_short_name = get_model_short_name(main_model)
+    output_base = os.path.join(base_dir, model_short_name)
+    
+    dirs = {
+        "base": output_base,
+        "reasoning": os.path.join(output_base, "Reasoning_output"),
+    }
+    
+    for dir_path in dirs.values():
+        os.makedirs(dir_path, exist_ok=True)
+    
+    return dirs
+
+
+def build_meta_prompt_from_example(example):
+    """Build prompt for maze example."""
+    system_prompt = """You are a maze-solving AI. Given a maze in ASCII format, analyze it step by step.
+
+COORDINATE SYSTEM:
+- Rows are numbered from top (row 0) to bottom
+- Columns are numbered from left (col 0) to right
+- Movement: UP (row decreases), DOWN (row increases), LEFT (col decreases), RIGHT (col increases)
+
+TURN DEFINITIONS:
+- RIGHT_TURN = 90° clockwise change (e.g., DOWN→LEFT, LEFT→UP, UP→RIGHT, RIGHT→DOWN)
+- LEFT_TURN = 90° counterclockwise change (e.g., DOWN→RIGHT, RIGHT→UP, UP→LEFT, LEFT→DOWN)
+
+RELATIVE POSITION DEFINITIONS:
+- "directly to the left" = same row, E has smaller column than S
+- "directly to the right" = same row, E has larger column than S
+- "directly above" = same column, E has smaller row than S
+- "directly below" = same column, E has larger row than S
+- "top left" = E has smaller row AND smaller column than S
+- "top right" = E has smaller row AND larger column than S
+- "bottom left" = E has larger row AND smaller column than S
+- "bottom right" = E has larger row AND larger column than S
+
+IMPORTANT: Follow the EXACT output format below. Do NOT use <think> tags.
+
+EXAMPLE 1: Counting Right Turns
+Question: How many right turns are there in the path from S to E?
+
+>>> LOCATE START AND EXIT:
+    S position: (3,5)
+    E position: (1,1)
+
+>>> STEP 1: Move DOWN from (3,5) to (4,5)
+    Current position: (4,5)
+    Previous direction: —
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 2: Move DOWN from (4,5) to (5,5)
+    Current position: (5,5)
+    Previous direction: DOWN
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0
+
+>>> STEP 3: Move LEFT from (5,5) to (5,4)
+    Current position: (5,4)
+    Previous direction: DOWN
+    Current direction: LEFT
+    Turn type: RIGHT_TURN
+    Running count: Right=1, Left=0
+
+>>> FINAL ANSWER: Right turns = 2
+    \\boxed{C}
+
+EXAMPLE 2: Counting Total Turns
+Question: How many total turns are there in the path from S to E?
+
+>>> LOCATE START AND EXIT:
+    S position: (3,5)
+    E position: (1,1)
+
+>>> STEP 1: Move DOWN from (3,5) to (4,5)
+    Current position: (4,5)
+    Previous direction: —
+    Current direction: DOWN
+    Turn type: STRAIGHT
+    Running count: Right=0, Left=0, Total=0
+
+[... continue for all steps ...]
+
+>>> FINAL ANSWER: Total turns = 2
+    \\boxed{C}
+
+EXAMPLE 3: Relative Position
+Question: Is the exit (E) to the top left of the starting point (S)?
+
+>>> LOCATE START AND EXIT:
+    S position: (3,5)
+    E position: (1,1)
+
+>>> COMPARE POSITIONS:
+    Row comparison: E row (1) < S row (3) → E is ABOVE S ✓
+    Col comparison: E col (1) < S col (5) → E is LEFT of S ✓
+
+>>> ANALYSIS:
+    E is above S (smaller row): YES
+    E is left of S (smaller col): YES
+    Therefore E is at TOP LEFT of S.
+
+>>> ANSWER: YES, E is to the top left of S.
+    \\boxed{A}
+
+════════════════════════════════════════════════════════════════════════════════
+Now solve the following maze using the EXACT same format. First locate S and E, then trace the path step by step."""
+
+    # Get the maze description (trimmed to remove trailing instructions)
+    description = str(example.get("prompt", ""))
+    description_trimmed = description[:-143] if len(description) > 143 else description
+    
+    return system_prompt, description_trimmed
+
+
+def extract_solution(text: str) -> str:
+    """Extract the boxed answer from the response (after </think>)."""
+    if "</think>" in text:
+        answer_section = text.split("</think>")[-1]
+    else:
+        answer_section = text
+    
+    matches = re.findall(r'\\boxed\{([^}]*)\}', answer_section)
+    if matches:
+        return matches[-1].strip()
+    
+    match = re.search(r'(?:answer|Answer)[:\s]+([A-D])', answer_section)
+    if match:
+        return match.group(1).strip()
+    
+    return None
+
+
+def count_tokens(text: str, tokenizer) -> int:
+    """Count the total number of tokens in the generated text using the tokenizer."""
+    tokens = tokenizer.encode(text, add_special_tokens=False)
+    return len(tokens)
+
+
+def get_question_type_from_index(idx: int) -> str:
+    """Determine question type based on index range.
+    
+    Dataset structure:
+    - 3000-3499: right turns
+    - 3500-3999: total turns
+    - 4000-4500: relative position
+    """
+    if idx < 3500:
+        return "right_turns"
+    elif idx < 4000:
+        return "total_turns"
+    else:
+        return "relative_position"
+
+
+def init_llm_server(model_name, max_tokens=22000, port=8000):
+    """Initialize LLM server configuration."""
+    url = f"http://localhost:{port}/v1/completions"
+    payload = {
+        "model": model_name,
+        "max_tokens": max_tokens,
+        "top_k": 20,
+        "top_p": 0.95,
+        "min_p": 0.0,
+        "temperature": 0.6,
+        "stream": True,
+        "logprobs": 20,
+        "use_beam_search": False,
+        "prompt_cache": True,
+        "seed": 42
+    }
+    headers = {"Content-Type": "application/json"}
+    return {"url": url, "payload": payload, "headers": headers}
+
+
+def save_output(idx: int, output: str, output_dir: str):
+    """Save output to file."""
+    os.makedirs(output_dir, exist_ok=True)
+    filepath = os.path.join(output_dir, f"output_{idx}.txt")
+    with open(filepath, 'w') as f:
+        f.write(output)
+    logger.info(f"Saved output to {filepath}")
+
+def evaluate_maze_answer(answer, options, ground_truth):
+    """
+    Evaluate a Maze MCQ answer and return (is_correct, extracted_answer, message).
+    
+    Args:
+        answer: Raw model output
+        options: Dictionary mapping option letters (A/B/C/D) to their values
+        ground_truth: The correct answer value
+        
+    Returns:
+        Tuple of (is_correct, extracted_answer, message)
+    """
+    sol = extract_solution(answer)
+    gt_sol = str(ground_truth).strip()
+    
+    if not sol:
+        return False, None, "No expression found"
+    
+    sol = sol.strip()
+    
+    # Case 1: LLM returned option letter (A/B/C/D)
+    if sol in options:
+        if options[sol] == gt_sol:
+            return True, sol, f"Correct: option {sol} -> {options[sol]}"
+        else:
+            return False, sol, f"Incorrect: expected '{gt_sol}', got '{options[sol]}' (option {sol})"
+    
+    # Case 2: LLM returned the actual answer text
+    # First check if sol matches ground truth directly
+    if sol.lower() == gt_sol.lower():
+        return True, sol, f"Correct: answer text matches ground truth: {sol}"
+    
+    # Check if sol matches any option value
+    for opt_letter, opt_value in options.items():
+        if sol.lower() == opt_value.lower():
+            if opt_value == gt_sol:
+                return True, sol, f"Correct: answer text {sol} (option {opt_letter})"
+            else:
+                return False, sol, f"Incorrect: expected '{gt_sol}', got '{opt_value}' (option {opt_letter})"
+    
+    return False, sol, f"Solution '{sol}' not found in options or ground truth"
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run maze experiments with StepVerifierMazeMonitor")
+    parser.add_argument("--model", type=str, default=MAIN_MODEL,
+                        help="Model name for generation")
+    parser.add_argument("--indices", type=str, default=None,
+                        help="Comma-separated indices to run (e.g., '3000,3500,4000')")
+    parser.add_argument("--start", type=int, default=3000, help="Start index")
+    parser.add_argument("--end", type=int, default=3010, help="End index")
+    parser.add_argument("--num_examples", "-n", type=int, default=None,
+                        help="Number of examples to run (overrides start/end)")
+    parser.add_argument("--max_corrections", type=int, default=5,
+                        help="Maximum number of correction attempts per example")
+    parser.add_argument("--port", type=int, default=8000, help="vLLM server port")
+    parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logging")
+    args = parser.parse_args()
+    
+    if args.debug:
+        logging.getLogger().setLevel(logging.DEBUG)
+    
+    # Load dataset
+    dataset = load_dataset("microsoft/VISION_LANGUAGE", 'maze', split='val')
+    
+    # Setup LLM server
+    llm_server = init_llm_server(args.model, port=args.port)
+    
+    # Load tokenizer for accurate token counting
+    logger.info(f"Loading tokenizer for {args.model}...")
+    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+    logger.info("Tokenizer loaded successfully.")
+    
+    # Setup output directory
+    output_dirs = get_output_dirs(args.model)
+    reason_dir = output_dirs["reasoning"]
+    
+    # Determine indices
+    if args.indices:
+        indices = [int(x.strip()) for x in args.indices.split(",")]
+    elif args.num_examples:
+        # Use 4499 as endpoint (4500 is out of bounds since dataset size is 4500)
+        indices = np.linspace(3000, 4499, args.num_examples, dtype=int)
+    else:
+        indices = range(args.start, args.end)
+    
+    # Stats tracking
+    results = []
+    total_correct = 0
+    total_examples = 0
+    total_reasoning_tokens = 0
+    
+    for idx in indices:
+        example = dataset[idx]
+        system_prompt, user_prompt = build_meta_prompt_from_example(example)
+        if str(example.get("ground_truth", "")).strip() == "Q4":
+            target_options = ["A", "B"]
+        else:
+            target_options = ["A", "B", "C", "D"] 
+        keys = "|".join(map(re.escape, target_options))
+        pattern = rf'\b({keys})\.\s*([A-Za-z0-9]+)\b'
+        options = dict(re.findall(pattern, user_prompt))
+        
+        # Build full prompt
+        full_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n<think>\n"
+        
+        # Parse maze from prompt
+        grid, start_pos, exit_pos = parse_maze_from_prompt(user_prompt)
+        
+        if not grid or not start_pos or not exit_pos:
+            logger.error(f"Could not parse maze for example {idx}")
+            continue
+        
+        # Detect question type from prompt (auto-detection)
+        # Falls back to index-based if no turn keywords found
+        question_type = StepVerifierMazeMonitor.detect_question_type(user_prompt)
+        
+        logger.info(f"\n{'='*60}")
+        logger.info(f"Example {idx} ({question_type})")
+        logger.info(f"Maze: S={start_pos}, E={exit_pos}, grid={len(grid)}x{len(grid[0]) if grid else 0}")
+        logger.info(f"{'='*60}")
+        
+        # Create the monitor
+        monitor = StepVerifierMazeMonitor(
+            name="maze_step_verifier",
+            answer_start_token="</think>",
+            grid=grid,
+            start_pos=start_pos,
+            exit_pos=exit_pos,
+            max_corrections=args.max_corrections,
+            question_type=question_type,
+        )
+        
+        # Run with stream_completion
+        try:
+            answer = asyncio.run(stream_completion(
+                full_prompt,
+                llm_server=llm_server,
+                monitors=(),
+                add_delay=False,
+                termination_requires_validation=False,
+                async_execution=True
+            ))
+        except Exception as e:
+            logger.error(f"Error running example {idx}: {e}")
+            import traceback
+            traceback.print_exc()
+            continue
+        
+        # Count generated tokens
+        reasoning_tokens = count_tokens(answer, tokenizer)
+        total_reasoning_tokens += reasoning_tokens
+        
+        gt_sol = str(example.get("ground_truth", "")).strip()
+        is_correct, extracted_answer, message = evaluate_maze_answer(answer, options, gt_sol)
+        
+        if extracted_answer:
+            logger.info(f"Extracted answer: {extracted_answer}")
+        logger.info(message)
+        
+        if is_correct:
+            total_correct += 1
+        
+        total_examples += 1
+        # Log result
+        result = {
+            'idx': int(idx),  # Convert numpy int64 to Python int
+            'question_type': question_type,
+            'correct': is_correct,
+            'sol': extracted_answer,
+            'gt': gt_sol,
+            'reasoning_tokens': reasoning_tokens,
+        }
+        results.append(result)
+        
+        logger.info(f"Result: sol={extracted_answer}, gt={gt_sol}, correct={is_correct}")
+        logger.info(f"Reasoning tokens: {reasoning_tokens}")
+    
+    # Compute final metrics
+    accuracy = total_correct / total_examples if total_examples > 0 else 0
+    avg_reasoning_tokens = total_reasoning_tokens / total_examples if total_examples > 0 else 0
+    
+    logger.info(f"\n{'='*60}")
+    logger.info(f"FINAL RESULTS")
+    logger.info(f"{'='*60}")
+    logger.info(f"Total examples: {total_examples}")
+    logger.info(f"Correct: {total_correct}")
+    logger.info(f"Accuracy: {accuracy:.4f} ({total_correct}/{total_examples})")
+    logger.info(f"Total reasoning tokens: {total_reasoning_tokens}")
+    logger.info(f"Avg reasoning tokens: {avg_reasoning_tokens:.1f}")
+    
+    # Save summary
+    summary = {
+        'model': args.model,
+        'total_examples': total_examples,
+        'correct': total_correct,
+        'accuracy': accuracy,
+        'total_reasoning_tokens': total_reasoning_tokens,
+        'avg_reasoning_tokens': avg_reasoning_tokens,
+        'max_corrections': args.max_corrections,
+        'results': results,
+    }
+    
+    summary_path = os.path.join(output_dirs["base"], "summary.json")
+    with open(summary_path, 'w') as f:
+        json.dump(summary, f, indent=2)
+    logger.info(f"\nSaved summary to {summary_path}")
\ No newline at end of file
diff --git a/examples/TTSwithVerification/spatialmap_stepverifier.py b/examples/TTSwithVerification/spatialmap_stepverifier.py
index 8d0f454f..0e78e4d9 100644
--- a/examples/TTSwithVerification/spatialmap_stepverifier.py
+++ b/examples/TTSwithVerification/spatialmap_stepverifier.py
@@ -27,7 +27,7 @@
 logger = logging.getLogger(__name__)
 
 # ============== MODEL CONFIGURATION ==============
-MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
+MAIN_MODEL = "Qwen/QwQ-32B"
 # =================================================
 
 
@@ -90,6 +90,10 @@ def extract_solution(text: str) -> str:
     else:
         answer_section = text
     
+    # Strip injected <format>...</format> template blocks so we don't
+    # accidentally match the placeholder \boxed{LETTER} from the template.
+    answer_section = re.sub(r'<format>.*?</format>', '', answer_section, flags=re.DOTALL)
+    
     matches = re.findall(r'\\boxed\{([^}]*)\}', answer_section)
     if matches:
         return matches[-1].strip()
@@ -107,7 +111,7 @@ def count_tokens(text: str, tokenizer) -> int:
     return len(tokens)
 
 
-def init_llm_server(model_name, max_tokens=32768, port=8000):
+def init_llm_server(model_name, max_tokens=20000, port=8000):
     """Initialize LLM server configuration."""
     url = f"http://localhost:{port}/v1/completions"
     payload = {
diff --git a/interwhen/monitors/thinkingPhaseVerifier.py b/interwhen/monitors/thinkingPhaseVerifier.py
index 2f01e66b..95a8535f 100644
--- a/interwhen/monitors/thinkingPhaseVerifier.py
+++ b/interwhen/monitors/thinkingPhaseVerifier.py
@@ -284,6 +284,9 @@ def _extract_braced_expression(text: str) -> Optional[str]:
         # Basic cleanup: remove LaTeX
         expr = expr.replace(r'\times', '*').replace(r'\cdot', '*').replace(r'\div', '/')
         expr = expr.replace(r'\,', '').replace(r'\ ', '')
+        # Replace Unicode math operators (QwQ frequently uses these)
+        expr = expr.replace('\u00d7', '*').replace('\u00f7', '/').replace('\u2212', '-')
+        expr = expr.replace('\u2013', '-').replace('\u2014', '-')  # en-dash, em-dash
         frac_pattern = r"\\frac\{([^{}]+)\}\{([^{}]+)\}"
         while re.search(frac_pattern, expr):
             expr = re.sub(frac_pattern, r"(\1/\2)", expr)
@@ -313,6 +316,9 @@ def _extract_boxed_expression(text: str) -> Optional[str]:
         expr = text[start:end - 1].strip()
         expr = expr.replace(r'\times', '*').replace(r'\cdot', '*').replace(r'\div', '/')
         expr = expr.replace(r'\,', '').replace(r'\ ', '')
+        # Replace Unicode math operators (QwQ frequently uses these)
+        expr = expr.replace('\u00d7', '*').replace('\u00f7', '/').replace('\u2212', '-')
+        expr = expr.replace('\u2013', '-').replace('\u2014', '-')  # en-dash, em-dash
         frac_pattern = r"\\frac\{([^{}]+)\}\{([^{}]+)\}"
         while re.search(frac_pattern, expr):
             expr = re.sub(frac_pattern, r"(\1/\2)", expr)
@@ -738,7 +744,7 @@ def _build_maze_format_block(question_type: str) -> str:
     if question_type == "relative_position":
         return (
             "<format>\n"
-            ">>> LOCATE START AND EXIT:\n"
+            ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):\n"
             "    S position: (row, col)\n"
             "    E position: (row, col)\n"
             "\n"
@@ -757,7 +763,7 @@ def _build_maze_format_block(question_type: str) -> str:
 
         return (
             "<format>\n"
-            ">>> LOCATE START AND EXIT:\n"
+            ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):\n"
             "    S position: (row, col)\n"
             "    E position: (row, col)\n"
             "\n"
@@ -789,7 +795,7 @@ def _build_maze_thinking_phase_prompt(question_type: str) -> str:
         "\n\nLet me output the current steps I have traced so far "
         "through the maze in the following format:\n"
         f"{format_block}\n"
-        ">>> LOCATE START AND EXIT:\n"
+        ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):\n"
     )
 
 
@@ -806,7 +812,7 @@ def _build_maze_structured_prompt(question_type: str) -> str:
         "\nLet me trace the step by step solution through the maze "
         "in the following format:\n"
         f"{format_block}\n"
-        ">>> LOCATE START AND EXIT:\n"
+        ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):\n"
     )
 
 
@@ -870,7 +876,7 @@ def __init__(
         # Build the thinking-phase side-stream prompt (in LLM's own voice)
         self._thinking_phase_prompt = _build_maze_thinking_phase_prompt(question_type)
         # A unique marker to detect whether we already injected it
-        self._structured_marker = ">>> LOCATE START AND EXIT:"
+        self._structured_marker = ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):"
 
         # ---- state ----
         self._think_phase_corrections = 0
@@ -1170,7 +1176,7 @@ def step_extractor(self, chunk: str, generated_text: str):
 
         # Strip out the injected <format>...</format> template so we only
         # look at actual model output (which starts after the last
-        # ">>> LOCATE START AND EXIT:\n" line that ends the injected prompt).
+        # ">>> LOCATE START AND EXIT (0-indexed, ...):\n" line that ends the injected prompt).
         last_marker_pos = text_after_think.rfind(self._structured_marker)
         if last_marker_pos >= 0:
             # Model output starts right after the marker line
@@ -1268,7 +1274,7 @@ async def verify(self, step: str, token_index: int, event, event_info):
 
             # Combine the prompt header with side output for parsing
             full_side_text = (
-                ">>> LOCATE START AND EXIT:\n" + side_output
+                ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):\n" + side_output
             )
 
             # First verify LOCATE section
@@ -1415,7 +1421,7 @@ async def verify(self, step: str, token_index: int, event, event_info):
         text_after_think = step[think_end_pos:]
 
         # Strip the injected <format>...</format> template — only look at
-        # actual model output starting from the last ">>> LOCATE START AND EXIT:" marker.
+        # actual model output starting from the last ">>> LOCATE START AND EXIT (0-indexed, ...)" marker.
         last_marker_pos = text_after_think.rfind(self._structured_marker)
         if last_marker_pos >= 0:
             text_after_think = text_after_think[last_marker_pos:]
@@ -1801,7 +1807,8 @@ def __init__(
                 f"reference={self._counting_question['reference']}, "
                 f"options={self._counting_options}"
             )
-        self._count_feedback_given = False  # only give count feedback once
+        self._count_feedback_given = False
+        self._count_feedback_blocks_count = 0  # tracks cardinal count retry attempts
 
         # ---- direction-question verification ----
         self._direction_question = parse_direction_question(problem_text)
@@ -1833,9 +1840,11 @@ def __init__(
         else:
             self._mcq_options = dict(self._counting_options)
 
-        self._direction_feedback_given = False
-        self._object_feedback_given = False
-        self._diag_count_feedback_given = False
+        # Allow multiple retries for final-answer verification
+        self._max_final_answer_retries = 3
+        self._direction_feedback_count = 0
+        self._object_feedback_count = 0
+        self._diag_count_feedback_count = 0
 
     @classmethod
     def from_prompt(
@@ -2076,7 +2085,7 @@ async def verify(self, step: str, token_index: int, event, event_info):
 
             # Side-stream: get analysis from the model
             side_output = await self._side_stream_spatialmap(
-                text_with_prompt, max_new_tokens=600
+                text_with_prompt, max_new_tokens=800
             )
 
             if not side_output or len(side_output.strip()) < 20:
@@ -2226,7 +2235,7 @@ async def verify(self, step: str, token_index: int, event, event_info):
             if (
                 self._direction_question
                 and num_corrections < self.max_corrections
-                and not self._direction_feedback_given
+                and self._direction_feedback_count < self._max_final_answer_retries
             ):
                 model_dir_text = parse_model_boxed_answer(
                     recent_text, self._mcq_options
@@ -2242,7 +2251,7 @@ async def verify(self, step: str, token_index: int, event, event_info):
                         f"model={model_dir_text}, possible={possible}"
                     )
                     if model_dir_text not in possible:
-                        self._direction_feedback_given = True
+                        self._direction_feedback_count += 1
                         # Find which MCQ options are consistent
                         valid_options = [
                             letter for letter, val in self._mcq_options.items()
@@ -2300,7 +2309,7 @@ async def verify(self, step: str, token_index: int, event, event_info):
             if (
                 self._object_question
                 and num_corrections < self.max_corrections
-                and not self._object_feedback_given
+                and self._object_feedback_count < self._max_final_answer_retries
             ):
                 model_obj_text = parse_model_boxed_answer(
                     recent_text, self._mcq_options
@@ -2325,7 +2334,7 @@ async def verify(self, step: str, token_index: int, event, event_info):
                         f"consistent_options={consistent}"
                     )
                     if model_letter not in consistent:
-                        self._object_feedback_given = True
+                        self._object_feedback_count += 1
                         odir = self._object_question["direction"]
                         oref = self._object_question["reference"]
                         if len(consistent) == 1:
@@ -2402,9 +2411,12 @@ async def verify(self, step: str, token_index: int, event, event_info):
                         model_count is not None
                         and model_count != z3_count
                     ):
-                        if not self._count_feedback_given:
-                            self._count_feedback_given = True
+                        self._count_feedback_given = True
+                        count_corrections = self._count_feedback_blocks_count
+                        self._count_feedback_blocks_count = count_corrections + 1
 
+                        if count_corrections == 0:
+                            # First attempt: explain why cardinal = 0
                             if direction in ("north", "south"):
                                 diag_examples = "northeast or northwest"
                             elif direction == "west":
@@ -2433,6 +2445,7 @@ async def verify(self, step: str, token_index: int, event, event_info):
                                 f">>> STEP 3: ANSWER\n"
                             )
                         else:
+                            # Subsequent attempts: force the correct answer directly
                             correct_option = None
                             for opt, val in self._counting_options.items():
                                 if val == "0":
@@ -2469,7 +2482,7 @@ async def verify(self, step: str, token_index: int, event, event_info):
 
                 else:
                     # --- Diagonal: use Z3 range check ---
-                    if not self._diag_count_feedback_given:
+                    if self._diag_count_feedback_count < self._max_final_answer_retries:
                         model_count = parse_model_count_from_answer(
                             recent_text, self._counting_options
                         )
@@ -2491,7 +2504,7 @@ async def verify(self, step: str, token_index: int, event, event_info):
                             )
 
                             if not (min_c <= model_count <= max_c):
-                                self._diag_count_feedback_given = True
+                                self._diag_count_feedback_count += 1
                                 # Find valid MCQ options
                                 valid_opts = []
                                 for opt, val in (
diff --git a/interwhen/utils/maze_verifier.py b/interwhen/utils/maze_verifier.py
index c0873295..1fea92da 100644
--- a/interwhen/utils/maze_verifier.py
+++ b/interwhen/utils/maze_verifier.py
@@ -142,10 +142,22 @@ def parse_maze_from_prompt(prompt: str) -> Tuple[List[List[str]], Optional[Tuple
     
     for line in lines:
         stripped = line.strip()
-        if stripped.startswith('#') and all(c in '#XSEX ' for c in stripped):
-            in_maze = True
-            current_maze.append(stripped)
-        elif in_maze:
+        # Some dataset entries glue the last maze row to description text,
+        # e.g. "#######, where the symbols ...".  Strip everything from the
+        # first character that isn't a valid maze cell.
+        if stripped.startswith('#'):
+            maze_part = ""
+            for ch in stripped:
+                if ch in '# XSEX':
+                    maze_part += ch
+                else:
+                    break
+            maze_part = maze_part.rstrip()
+            if maze_part and all(c in '#XSEX ' for c in maze_part):
+                in_maze = True
+                current_maze.append(maze_part)
+                continue
+        if in_maze:
             if current_maze:
                 all_mazes.append(current_maze)
             current_maze = []
@@ -420,7 +432,13 @@ def format_locate_feedback(errors: List[str]) -> str:
     feedback = "\n\n[VERIFIER FEEDBACK for LOCATE section:\n"
     for err in errors:
         feedback += f"  ✗ {err}\n"
-    feedback += "Please correct the start/exit positions and continue.]\n\n"
+    feedback += (
+        "IMPORTANT: Coordinates are 0-indexed (row 0, col 0 is the top-left corner). "
+        "Do NOT use 1-indexed coordinates. "
+        "For example, if S is in the first row and first open column, "
+        "that is (0, 1) not (1, 1) or (1, 2).\n"
+        "Please correct the start/exit positions and continue.]\n\n"
+    )
     return feedback
 
 
diff --git a/interwhen/utils/spatialmap_verifier.py b/interwhen/utils/spatialmap_verifier.py
index a89cfa06..7a249fc8 100644
--- a/interwhen/utils/spatialmap_verifier.py
+++ b/interwhen/utils/spatialmap_verifier.py
@@ -347,7 +347,7 @@ def parse_directional_claims_from_text(text: str) -> List[Dict]:
     claims = []
     
     # Pattern: "X is (to the) DIRECTION of Y"
-    pattern = r"([A-Z][A-Za-z'][A-Za-z'\s]*?)\s+is\s+(?:to\s+the\s+)?(northwest|northeast|southwest|southeast|north|south|east|west)\s+of\s+([A-Z][A-Za-z'][A-Za-z'\s]*?)(?:\.|,|\s*[→✓✗]|\s*$|\s+(?:and|so|which|therefore|thus|but|\())"
+    pattern = r"([A-Z][A-Za-z'][A-Za-z'\s]*?)\s+is\s+(?:to\s+the\s+)?(northwest|northeast|southwest|southeast|north|south|east|west)\s+of\s+([A-Z][A-Za-z'][A-Za-z'\s]*?)(?:\.|,|;|:|\s*[→✓✗]|\s*\n|\s*$|\s+(?:and|so|which|therefore|thus|but|since|because|while|whereas|however|hence|then|for|as|meaning|indicating|implying|suggesting|confirming|\())"
     
     matches = re.finditer(pattern, expanded_text, re.IGNORECASE)
     
@@ -413,11 +413,16 @@ def verify_spatialmap_step(
     Args:
         claim: {"A": entity1, "direction": direction, "B": entity2}
         z3_solver: The Z3 solver with known constraints
-        add_if_valid: If True, add the claim to the solver if it's valid
+        add_if_valid: If True, add the claim to the solver **only if it
+            is entailed** (i.e. its negation is UNSAT).  Merely
+            satisfiable claims are accepted but NOT committed to the
+            solver so they cannot over-constrain future checks.
     
     Returns:
         (is_valid, errors)
     """
+    from z3 import Not as Z3Not, sat as z3sat
+
     errors = []
     
     is_consistent = z3_solver.check_with_new_constraint(claim)
@@ -430,7 +435,17 @@ def verify_spatialmap_step(
         return False, errors
     
     if add_if_valid:
-        z3_solver.apply_ir(claim)
+        # Only commit the claim if it is *entailed* (negation is UNSAT).
+        # This prevents merely-satisfiable-but-unproven claims from
+        # over-constraining the solver and blocking valid solutions later.
+        compiled = z3_solver.compile_constraint(claim)
+        if compiled is not None:
+            z3_solver.solver.push()
+            z3_solver.solver.add(Z3Not(compiled))
+            is_entailed = z3_solver.solver.check() != z3sat
+            z3_solver.solver.pop()
+            if is_entailed:
+                z3_solver.apply_ir(claim)
     
     return True, []
 

From 4084bc8f4426d6f2075b55902ac99f47e4c06569 Mon Sep 17 00:00:00 2001
From: root
 <root@hp-reasoning-t-vishakbhat-interwhen-8gpus-trainer-0.hp-reasoning-t-vishakbhat-interwhen-8gpus.bonete11.svc.cluster.local>
Date: Thu, 26 Feb 2026 11:46:24 +0000
Subject: [PATCH 08/17] added step verifier for all

---
 examples/TTSwithVerification/game24meta.py | 22 ++++++++++++++--------
 examples/TTSwithVerification/mazemeta.py   | 10 ++++++++++
 interwhen/interject.py                     |  6 +++++-
 3 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/examples/TTSwithVerification/game24meta.py b/examples/TTSwithVerification/game24meta.py
index 9b799be4..4ee74a6a 100644
--- a/examples/TTSwithVerification/game24meta.py
+++ b/examples/TTSwithVerification/game24meta.py
@@ -413,14 +413,20 @@ def evaluate_game24_answer(answer, nums):
         logger.info(f"---- Example {idx+1} ----")
         logger.info(f"Numbers: {nums}")
 
-        answer = asyncio.run(stream_completion(
-            f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n",
-            llm_server=llm_server,
-            monitors=monitors,
-            add_delay=False,
-            termination_requires_validation=False,
-            async_execution=True
-        ))
+        try:
+            answer = asyncio.run(stream_completion(
+                f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n",
+                llm_server=llm_server,
+                monitors=monitors,
+                add_delay=False,
+                termination_requires_validation=False,
+                async_execution=True
+            ))
+        except Exception as e:
+            logger.error(f"Error running example {idx}: {e}")
+            import traceback
+            traceback.print_exc()
+            continue
 
         save_prompt(idx, answer, reason_dir)
         logger.info(f"Raw final output:\n{answer}")
diff --git a/examples/TTSwithVerification/mazemeta.py b/examples/TTSwithVerification/mazemeta.py
index 403115d7..0bf3df5d 100644
--- a/examples/TTSwithVerification/mazemeta.py
+++ b/examples/TTSwithVerification/mazemeta.py
@@ -51,6 +51,13 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/MazeResu
     return dirs
 
 
+def save_prompt(idx, prompt_with_answer, reason_dir):
+    """Save reasoning trace to a text file."""
+    filename = os.path.join(reason_dir, f"reason_{idx}.txt")
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write(prompt_with_answer)
+
+
 def build_meta_prompt_from_example(example):
     """Build prompt for maze example."""
     system_prompt = """You are a maze-solving AI. Given a maze in ASCII format, analyze it step by step.
@@ -371,6 +378,9 @@ def evaluate_maze_answer(answer, options, ground_truth):
             traceback.print_exc()
             continue
         
+        # Save reasoning trace
+        save_prompt(idx, answer, reason_dir)
+
         # Count generated tokens
         reasoning_tokens = count_tokens(answer, tokenizer)
         total_reasoning_tokens += reasoning_tokens
diff --git a/interwhen/interject.py b/interwhen/interject.py
index cb4f8e60..f424aca6 100644
--- a/interwhen/interject.py
+++ b/interwhen/interject.py
@@ -35,7 +35,11 @@ async def stream_completion(prompt, prev_text = "", llm_server=None, monitors=[]
                         break
                     else:
                         # Obtain the current token (text chunk)
-                        chunk = json.loads(data)["choices"][0]["text"]
+                        try:
+                            chunk = json.loads(data)["choices"][0]["text"]
+                        except (json.JSONDecodeError, KeyError, IndexError) as e:
+                            logger.debug(f"Skipping malformed SSE data: {data!r} ({e})")
+                            continue
                         # If any event is already set, break immediately (we don't want more chunks)
                         if stop_event.is_set():
                             logger.info(f'\n[Early stop already triggered, ignoring chunk: {chunk}]')

From fc4e9b05f65e598ec94c9097429565a1aeb73202 Mon Sep 17 00:00:00 2001
From: root
 <root@hp-reasoning-t-vishakbhat-interwhen3-4gpus-trainer-0.hp-reasoning-t-vishakbhat-interwhen3-4gpus.bonete11.svc.cluster.local>
Date: Sat, 28 Feb 2026 05:09:58 +0000
Subject: [PATCH 09/17] Fix extract_solution, regex, and soundness calculation
 for Phi-4-reasoning

- extract_solution: only search for \boxed{} after </think>, not inside thinking trace
- extract_solution: return None if <think> opened but never closed (token limit hit)
- extract_solution: return None for empty \boxed{} (from verifier feedback prompts)
- extract_solution: strip trailing '= 24' from expressions
- extract_solution: add \left/\right LaTeX cleanup
- _find_complete_boxed: brace-counting helper for nested LaTeX (e.g. \frac{}{})
  replaces naive regex in thinkingPhaseVerifier.py (4 sites) and stepVerifier.py (2 sites)
- Soundness: exclude 'no solution' and 'no expression found' cases
- Soundness formula: correct / (total - excluded) instead of correct / attempted
- Add 'excluded' column to CSV output
- Switch model config to microsoft/Phi-4-reasoning with Phi-4 ChatML format
- Projected metrics: Accuracy 1062/1362 (77.97%), Soundness 1062/1062 (100%)
---
 README.md                                     |   8 +-
 examples/EarlyStopping/game24_example.py      |  69 ++-
 .../game24_stepverifier.py                    |  86 ++-
 examples/TTSwithVerification/game24meta.py    |  29 +-
 .../TTSwithVerification/maze_stepverifier.py  |  25 +-
 examples/TTSwithVerification/mazemeta.py      |  11 +-
 .../spatialmap_stepverifier.py                |  25 +-
 examples/TTSwithVerification/spatialmeta.py   | 517 ++++++++++++++++++
 interwhen/monitors/stepVerifier.py            |  42 +-
 interwhen/monitors/thinkingPhaseVerifier.py   |  58 +-
 10 files changed, 788 insertions(+), 82 deletions(-)
 create mode 100644 examples/TTSwithVerification/spatialmeta.py

diff --git a/README.md b/README.md
index 03a763ed..3b79ab94 100644
--- a/README.md
+++ b/README.md
@@ -39,10 +39,10 @@ Running verifier-guided inference requires only a few lines of code: just specif
 **Set up target LLM server**
 ```bash
 python -m vllm.entrypoints.openai.api_server \
-  --model Qwen/Qwen3-30B-A3B-Thinking-2507 \
-  --max-model-len 65536 \
-  --port 8000 \
-  --tensor-parallel-size 8
+  --model microsoft/Phi-4-reasoning \
+  --max-model-len 32768 \
+  --port 8001 \
+  --tensor-parallel-size 2
 ```
 
 **Generate answer enabled with given monitors**
diff --git a/examples/EarlyStopping/game24_example.py b/examples/EarlyStopping/game24_example.py
index 78725273..82df71aa 100644
--- a/examples/EarlyStopping/game24_example.py
+++ b/examples/EarlyStopping/game24_example.py
@@ -15,7 +15,7 @@
 
 # ============== MODEL CONFIGURATION ==============
 # Change these model names to scale experiments easily
-MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
+MAIN_MODEL = "microsoft/Phi-4-reasoning"
 EARLYSTOP_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 # =================================================
 
@@ -25,7 +25,7 @@ def get_model_short_name(model_name: str) -> str:
     short_name = short_name.replace(" ", "_").replace(":", "-")
     return short_name
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs/Gameof24_results"):
+def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_Kstable2/Gameof24_results"):
     """Create and return output directory paths based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -42,14 +42,14 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs/Gameof24_res
     
     return dirs
 
-def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs/Gameof24_results") -> str:
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_Kstable2/Gameof24_results") -> str:
     """Generate log filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     os.makedirs(output_base, exist_ok=True)
     return os.path.join(output_base, f"EAT_{num_examples}examples.log")
 
-def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs/Gameof24_results") -> str:
+def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_Kstable2/Gameof24_results") -> str:
     """Generate token CSV filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -73,10 +73,9 @@ def init_llm_server(modelname, max_tokens=200, port=8000):
     payload = {
         "model": modelname,
         "max_tokens": max_tokens,
-        "top_k": 20,
+        "top_k": 50,
         "top_p": 0.95,
-        "min_p": 0.0,
-        "temperature": 0.6,
+        "temperature": 0.8,
         "stream": True,
         "logprobs": 20,
         "use_beam_search": False,
@@ -148,8 +147,9 @@ def extract_solution(text):
     for latex, op in replacements.items():
         expr = expr.replace(latex, op)
 
-    # 3. Cleanup (remove LaTeX spacing)
+    # 3. Cleanup (remove LaTeX formatting artifacts)
     expr = expr.replace(r"\,", "").replace(r"\ ", "")
+    expr = expr.replace(r"\left", "").replace(r"\right", "")
 
     # 4. Handle implicit multiplication (e.g., "(11+1)(1+1)" -> "(11+1)*(1+1)")
     # Insert * between: )( , )number, number(, )(
@@ -249,7 +249,7 @@ def evaluate_game24_answer(answer, nums):
 
     dataset = load_game24_dataset()
 
-    llm_server = init_llm_server(main_model, max_tokens=32768)
+    llm_server = init_llm_server(main_model, max_tokens=20000)
 
     # Load tokenizer for accurate token counting
     logger.info(f"Loading tokenizer for {main_model}...")
@@ -273,23 +273,23 @@ def evaluate_game24_answer(answer, nums):
         if args.monitor:
             # Use K-stable answer monitor to detect when equation stabilizes k times
             # monitors = (SimpleTextReplaceMonitor("IsCheck", "</think>", async_execution=False),)
-            # monitors=(KstableAnswerGame24Monitor(
-            #     name="game24_kstable",
-            #     k=3,
-            #     expected_nums=nums,  # Validate equations use exactly these numbers
-            #     answer_start_token="</think>"
-            # ),)
-            monitors = (
-                EATMonitor(
-                    name="EAT_monitor",
-                    model_name=earlystop_model,
-                    alpha=0.2,
-                    delta=0.02,
-                    min_steps=4,
-                    answer_start_token="</think>",
-                    async_execution=True
-                ),
-            )
+            monitors=(KstableAnswerGame24Monitor(
+                name="game24_kstable",
+                k=2,
+                expected_nums=nums,  # Validate equations use exactly these numbers
+                answer_start_token="</think>"
+            ),)
+            # monitors = (
+            #     EATMonitor(
+            #         name="EAT_monitor",
+            #         model_name=earlystop_model,
+            #         alpha=0.2,
+            #         delta=0.02,
+            #         min_steps=4,
+            #         answer_start_token="</think>",
+            #         async_execution=True
+            #     ),
+            # )
         else:
             monitors = ()
 
@@ -297,8 +297,23 @@ def evaluate_game24_answer(answer, nums):
         logger.info(f"---- Example {idx+1} ----")
         logger.info(f"Numbers: {nums}")
 
+        system_prompt = (
+            "You are Phi, a language model trained by Microsoft to help users. "
+            "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
+            "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
+            "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
+            "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
+            "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
+            "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
+            "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
+            "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
+            "explorations, and reflections from the Thought section, systematically present the final solution that you "
+            "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
+            "to reach the conclusion. Now, try to solve the following question through the above guidelines."
+        )
+
         answer = asyncio.run(stream_completion(
-            f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n",
+            f"<|im_start|>system<|im_sep|>\n{system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n",
             llm_server=llm_server,
             monitors=monitors,
             add_delay=False,
diff --git a/examples/TTSwithVerification/game24_stepverifier.py b/examples/TTSwithVerification/game24_stepverifier.py
index 383f15f7..993a6ee2 100644
--- a/examples/TTSwithVerification/game24_stepverifier.py
+++ b/examples/TTSwithVerification/game24_stepverifier.py
@@ -14,7 +14,7 @@
 
 # ============== MODEL CONFIGURATION ==============
 # Change these model names to scale experiments easily
-MAIN_MODEL = "Qwen/QwQ-32B"
+MAIN_MODEL = "microsoft/Phi-4-reasoning"
 EARLYSTOP_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 # =================================================
 
@@ -67,15 +67,14 @@ def load_game24_dataset():
     ds = load_dataset("nlile/24-game", split="train")
     return ds
 
-def init_llm_server(modelname, max_tokens=200, port=8000):
+def init_llm_server(modelname, max_tokens=200, port=8001):
     url = f"http://localhost:{port}/v1/completions"
     payload = {
         "model": modelname,
         "max_tokens": max_tokens,
-        "top_k": 20,
+        "top_k": 50,
         "top_p": 0.95,
-        "min_p": 0.0,
-        "temperature": 0.6,
+        "temperature": 0.8,
         "stream": True,
         "logprobs": 20,
         "use_beam_search": False,
@@ -112,10 +111,22 @@ def count_tokens(text: str, tokenizer) -> int:
 
 def extract_solution(text):
     
+    # Only search for \boxed{} AFTER </think> to avoid grabbing unverified
+    # expressions from inside the thinking trace.
+    # If model opened <think> but never closed it (hit token limit), there is
+    # no final answer — return None.
+    if '</think>' in text:
+        search_text = text[text.rfind('</think>'):]
+    elif '<think>' in text:
+        # Model started thinking but never finished — no verified answer
+        return None
+    else:
+        search_text = text
+
     # Use a more robust extraction that handles nested braces in \boxed{}
     # Find \boxed{ and then match braces properly
     boxed_pattern = r"\\boxed\{"
-    matches = list(re.finditer(boxed_pattern, text))
+    matches = list(re.finditer(boxed_pattern, search_text))
     if not matches:
         return None
     
@@ -124,14 +135,18 @@ def extract_solution(text):
     start = last_match.end()  # Position right after \boxed{
     brace_count = 1
     end = start
-    while end < len(text) and brace_count > 0:
-        if text[end] == '{':
+    while end < len(search_text) and brace_count > 0:
+        if search_text[end] == '{':
             brace_count += 1
-        elif text[end] == '}':
+        elif search_text[end] == '}':
             brace_count -= 1
         end += 1
     
-    expr = text[start:end-1].strip()  # -1 to exclude the closing brace
+    expr = search_text[start:end-1].strip()  # -1 to exclude the closing brace
+
+    # Skip empty \boxed{} (e.g., from verifier feedback "Wrap in \boxed{}.")
+    if not expr:
+        return None
 
     # 1. Convert \frac{a}{b} to (a/b)
     frac_pattern = r"\\frac\{([^{}]+)\}\{([^{}]+)\}"
@@ -151,8 +166,12 @@ def extract_solution(text):
     expr = expr.replace('\u00d7', '*').replace('\u00f7', '/').replace('\u2212', '-')
     expr = expr.replace('\u2013', '-').replace('\u2014', '-')  # en-dash, em-dash
 
-    # 3. Cleanup (remove LaTeX spacing)
+    # 3. Cleanup (remove LaTeX formatting artifacts)
     expr = expr.replace(r"\,", "").replace(r"\ ", "")
+    expr = expr.replace(r"\left", "").replace(r"\right", "")
+
+    # 3b. Strip trailing "= <number>" (e.g., "10 - 8/8 * 1 = 24" -> "10 - 8/8 * 1")
+    expr = re.sub(r'\s*=\s*[\d.]+\s*$', '', expr)
 
     # 4. Handle implicit multiplication (e.g., "(11+1)(1+1)" -> "(11+1)*(1+1)")
     # Insert * between: )( , )number, number(, )(
@@ -219,8 +238,8 @@ def evaluate_game24_answer(answer, nums):
     parser.add_argument("--num_examples", "-n", type=int, default=1362, help="Number of examples to run")
     parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logs")
     parser.add_argument("--thinking_verify", "-tv", action="store_true", default = True, help="Enable thinking-phase step verification (verify during <think> trace)")
-    parser.add_argument("--newline_threshold", type=int, default=10, help="Number of newlines in thinking before forcing step verification (used with --thinking_verify)")
-    parser.add_argument("--warmup", type=int, default=0, help="Number of \\n\\n to skip before starting side-chain verification (warmup period)")
+    parser.add_argument("--newline_threshold", type=int, default=20, help="Number of newlines in thinking before forcing step verification (used with --thinking_verify)")
+    parser.add_argument("--warmup", type=int, default=4, help="Number of \\n\\n to skip before starting side-chain verification (warmup period)")
     parser.add_argument("--main_model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
     parser.add_argument("--earlystop_model", type=str, default=EARLYSTOP_MODEL, help="Model to use for early stopping")
     args = parser.parse_args()
@@ -262,7 +281,9 @@ def evaluate_game24_answer(answer, nums):
 
     num_correct = 0
     num_attempted = 0  # examples where a \boxed{} answer was produced
+    num_excluded = 0   # examples excluded from soundness (no solution or token budget exceeded)
     N = args.num_examples
+    max_token_budget = llm_server["payload"]["max_tokens"]
     total_reasoning_tokens = 0
     reasoning_token_counts = []
     per_example_results = []  # list of dicts for CSV
@@ -275,7 +296,21 @@ def evaluate_game24_answer(answer, nums):
         nums = example["numbers"]
 
         prompt = build_prompt(nums)
-        full_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
+        system_prompt = (
+            "You are Phi, a language model trained by Microsoft to help users. "
+            "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
+            "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
+            "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
+            "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
+            "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
+            "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
+            "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
+            "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
+            "explorations, and reflections from the Thought section, systematically present the final solution that you "
+            "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
+            "to reach the conclusion. Now, try to solve the following question through the above guidelines."
+        )
+        full_prompt = f"<|im_start|>system<|im_sep|>\n{system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n"
 
         if args.monitor:
             # ThinkingPhaseStepVerifierGame24Monitor handles both cases:
@@ -288,7 +323,7 @@ def evaluate_game24_answer(answer, nums):
                 llm_server=llm_server,
                 prompt=full_prompt,
                 newline_threshold=threshold,
-                max_corrections=5,
+                max_corrections=3,
                 answer_start_token="</think>",
                 warmup_newlines=args.warmup,
             ),)
@@ -321,6 +356,15 @@ def evaluate_game24_answer(answer, nums):
         attempted = (expr is not None and expr.strip().lower() != "no solution")
         if attempted:
             num_attempted += 1
+
+        # Determine if this example should be excluded from soundness:
+        #   - answered "no solution" (gave up / max corrections)
+        #   - no expression found (verifier never completed Phase 2)
+        gave_no_solution = (expr is not None and "no solution" in expr.strip().lower())
+        no_expr_found = (expr is None)
+        excluded = gave_no_solution or no_expr_found
+        if excluded:
+            num_excluded += 1
         
         if expr:
             logger.info(f"Extracted expression: {expr}")
@@ -335,6 +379,7 @@ def evaluate_game24_answer(answer, nums):
             "expression": expr if expr else "",
             "correct": is_correct,
             "attempted": attempted,
+            "excluded": excluded,
             "tokens": reasoning_tokens,
             "message": message,
         })
@@ -342,17 +387,19 @@ def evaluate_game24_answer(answer, nums):
     # Calculate final statistics
     avg_reasoning_tokens = total_reasoning_tokens / N if N > 0 else 0
     accuracy = num_correct / N if N > 0 else 0
-    soundness = num_correct / num_attempted if num_attempted > 0 else 0  # correct / attempted
+    soundness_denom = N - num_excluded
+    soundness = num_correct / soundness_denom if soundness_denom > 0 else 0  # correct / (total - excluded)
     
     print(f"\nFinal Accuracy: {num_correct}/{N} ({accuracy:.2%})")
-    print(f"Soundness: {num_correct}/{num_attempted} ({soundness:.2%})")
+    print(f"Soundness: {num_correct}/{soundness_denom} ({soundness:.2%})")
+    print(f"Excluded from soundness (no solution / token budget exceeded): {num_excluded}")
     print(f"Average Reasoning Tokens: {avg_reasoning_tokens:.2f}")
     print(f"Total Reasoning Tokens: {total_reasoning_tokens}")
 
     # Save per-example CSV
     csv_file = os.path.join(output_dirs["csv_saved"], f"results_{N}examples.csv")
     with open(csv_file, 'w', newline='') as f:
-        writer = csv.DictWriter(f, fieldnames=["index", "numbers", "expression", "correct", "attempted", "tokens", "message"])
+        writer = csv.DictWriter(f, fieldnames=["index", "numbers", "expression", "correct", "attempted", "excluded", "tokens", "message"])
         writer.writeheader()
         writer.writerows(per_example_results)
     logger.info(f"Per-example CSV saved to {csv_file}")
@@ -374,7 +421,8 @@ def evaluate_game24_answer(answer, nums):
         f.write(f"Correct: {num_correct}/{N}\n")
         f.write(f"Accuracy: {accuracy:.2%}\n")
         f.write(f"Attempted (produced \\boxed answer): {num_attempted}/{N}\n")
-        f.write(f"Soundness (correct/attempted): {soundness:.2%}\n\n")
+        f.write(f"Excluded (no solution / token budget exceeded): {num_excluded}/{N}\n")
+        f.write(f"Soundness (correct / (total - excluded)): {num_correct}/{soundness_denom} = {soundness:.2%}\n\n")
         f.write(f"Token Statistics:\n")
         f.write(f"---------------------------\n")
         f.write(f"Total Tokens: {total_reasoning_tokens}\n")
diff --git a/examples/TTSwithVerification/game24meta.py b/examples/TTSwithVerification/game24meta.py
index 4ee74a6a..fc8fd318 100644
--- a/examples/TTSwithVerification/game24meta.py
+++ b/examples/TTSwithVerification/game24meta.py
@@ -16,7 +16,7 @@
 
 # ============== MODEL CONFIGURATION ==============
 # Change these model names to scale experiments easily
-MAIN_MODEL = "Qwen/QwQ-32B"
+MAIN_MODEL = "microsoft/Phi-4-reasoning"
 EARLYSTOP_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 # =================================================
 
@@ -69,15 +69,14 @@ def load_game24_dataset():
     ds = load_dataset("nlile/24-game", split="train")
     return ds
 
-def init_llm_server(modelname, max_tokens=200, port=8000):
+def init_llm_server(modelname, max_tokens=200, port=8001):
     url = f"http://localhost:{port}/v1/completions"
     payload = {
         "model": modelname,
         "max_tokens": max_tokens,
-        "top_k": 20,
+        "top_k": 50,
         "top_p": 0.95,
-        "min_p": 0.0,
-        "temperature": 0.6,
+        "temperature": 0.8,
         "stream": True,
         "logprobs": 20,
         "use_beam_search": False,
@@ -279,8 +278,9 @@ def extract_solution(text):
     expr = expr.replace('\u00d7', '*').replace('\u00f7', '/').replace('\u2212', '-')
     expr = expr.replace('\u2013', '-').replace('\u2014', '-')  # en-dash, em-dash
 
-    # 3. Cleanup (remove LaTeX spacing)
+    # 3. Cleanup (remove LaTeX formatting artifacts)
     expr = expr.replace(r"\,", "").replace(r"\ ", "")
+    expr = expr.replace(r"\left", "").replace(r"\right", "")
 
     # 4. Handle implicit multiplication (e.g., "(11+1)(1+1)" -> "(11+1)*(1+1)")
     # Insert * between: )( , )number, number(, )(
@@ -413,9 +413,24 @@ def evaluate_game24_answer(answer, nums):
         logger.info(f"---- Example {idx+1} ----")
         logger.info(f"Numbers: {nums}")
 
+        system_prompt = (
+            "You are Phi, a language model trained by Microsoft to help users. "
+            "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
+            "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
+            "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
+            "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
+            "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
+            "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
+            "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
+            "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
+            "explorations, and reflections from the Thought section, systematically present the final solution that you "
+            "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
+            "to reach the conclusion. Now, try to solve the following question through the above guidelines."
+        )
+
         try:
             answer = asyncio.run(stream_completion(
-                f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n",
+                f"<|im_start|>system<|im_sep|>\n{system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n",
                 llm_server=llm_server,
                 monitors=monitors,
                 add_delay=False,
diff --git a/examples/TTSwithVerification/maze_stepverifier.py b/examples/TTSwithVerification/maze_stepverifier.py
index 4c2ca456..e6cdf8f4 100644
--- a/examples/TTSwithVerification/maze_stepverifier.py
+++ b/examples/TTSwithVerification/maze_stepverifier.py
@@ -28,7 +28,7 @@
 logger = logging.getLogger(__name__)
 
 # ============== MODEL CONFIGURATION ==============
-MAIN_MODEL = "Qwen/QwQ-32B"
+MAIN_MODEL = "microsoft/Phi-4-reasoning"
 # =================================================
 
 
@@ -118,10 +118,9 @@ def init_llm_server(model_name, max_tokens=22000, port=8000):
     payload = {
         "model": model_name,
         "max_tokens": max_tokens,
-        "top_k": 20,
+        "top_k": 50,
         "top_p": 0.95,
-        "min_p": 0.0,
-        "temperature": 0.6,
+        "temperature": 0.8,
         "stream": True,
         "logprobs": 20,
         "use_beam_search": False,
@@ -278,8 +277,22 @@ def evaluate_maze_answer(answer, options, ground_truth):
         pattern = rf'\b({keys})\.\s*([A-Za-z0-9]+)\b'
         options = dict(re.findall(pattern, user_prompt))
         
-        # Build simple prompt — no system/meta prompt; structure injected by monitor
-        full_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n<think>\n"
+        # Build prompt with Phi-4-reasoning system prompt
+        phi_system_prompt = (
+            "You are Phi, a language model trained by Microsoft to help users. "
+            "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
+            "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
+            "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
+            "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
+            "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
+            "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
+            "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
+            "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
+            "explorations, and reflections from the Thought section, systematically present the final solution that you "
+            "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
+            "to reach the conclusion. Now, try to solve the following question through the above guidelines."
+        )
+        full_prompt = f"<|im_start|>system<|im_sep|>\n{phi_system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{user_prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n<think>\n"
         
         # Parse maze from prompt
         grid, start_pos, exit_pos = parse_maze_from_prompt(user_prompt)
diff --git a/examples/TTSwithVerification/mazemeta.py b/examples/TTSwithVerification/mazemeta.py
index 0bf3df5d..b13f2c5b 100644
--- a/examples/TTSwithVerification/mazemeta.py
+++ b/examples/TTSwithVerification/mazemeta.py
@@ -24,7 +24,7 @@
 logger = logging.getLogger(__name__)
 
 # ============== MODEL CONFIGURATION ==============
-MAIN_MODEL = "Qwen/QwQ-32B"
+MAIN_MODEL = "microsoft/Phi-4-reasoning"
 # =================================================
 
 
@@ -208,10 +208,9 @@ def init_llm_server(model_name, max_tokens=22000, port=8000):
     payload = {
         "model": model_name,
         "max_tokens": max_tokens,
-        "top_k": 20,
+        "top_k": 50,
         "top_p": 0.95,
-        "min_p": 0.0,
-        "temperature": 0.6,
+        "temperature": 0.8,
         "stream": True,
         "logprobs": 20,
         "use_beam_search": False,
@@ -332,8 +331,8 @@ def evaluate_maze_answer(answer, options, ground_truth):
         pattern = rf'\b({keys})\.\s*([A-Za-z0-9]+)\b'
         options = dict(re.findall(pattern, user_prompt))
         
-        # Build full prompt
-        full_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n<think>\n"
+        # Build full prompt with Phi-4-reasoning ChatML format
+        full_prompt = f"<|im_start|>system<|im_sep|>\n{system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{user_prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n<think>\n"
         
         # Parse maze from prompt
         grid, start_pos, exit_pos = parse_maze_from_prompt(user_prompt)
diff --git a/examples/TTSwithVerification/spatialmap_stepverifier.py b/examples/TTSwithVerification/spatialmap_stepverifier.py
index 0e78e4d9..acfb26af 100644
--- a/examples/TTSwithVerification/spatialmap_stepverifier.py
+++ b/examples/TTSwithVerification/spatialmap_stepverifier.py
@@ -27,7 +27,7 @@
 logger = logging.getLogger(__name__)
 
 # ============== MODEL CONFIGURATION ==============
-MAIN_MODEL = "Qwen/QwQ-32B"
+MAIN_MODEL = "microsoft/Phi-4-reasoning"
 # =================================================
 
 
@@ -117,10 +117,9 @@ def init_llm_server(model_name, max_tokens=20000, port=8000):
     payload = {
         "model": model_name,
         "max_tokens": max_tokens,
-        "top_k": 20,
+        "top_k": 50,
         "top_p": 0.95,
-        "min_p": 0.0,
-        "temperature": 0.6,
+        "temperature": 0.8,
         "stream": True,
         "logprobs": 20,
         "use_beam_search": False,
@@ -276,8 +275,22 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
         # Determine question type
         question_type = get_question_type(idx)
         
-        # Build simple prompt — no system/meta prompt; structure injected by monitor
-        full_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n<think>\n"
+        # Build prompt with Phi-4-reasoning system prompt
+        phi_system_prompt = (
+            "You are Phi, a language model trained by Microsoft to help users. "
+            "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
+            "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
+            "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
+            "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
+            "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
+            "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
+            "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
+            "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
+            "explorations, and reflections from the Thought section, systematically present the final solution that you "
+            "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
+            "to reach the conclusion. Now, try to solve the following question through the above guidelines."
+        )
+        full_prompt = f"<|im_start|>system<|im_sep|>\n{phi_system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{user_prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n<think>\n"
         
         logger.info(f"\n{'='*60}")
         logger.info(f"Example {idx} ({question_type})")
diff --git a/examples/TTSwithVerification/spatialmeta.py b/examples/TTSwithVerification/spatialmeta.py
new file mode 100644
index 00000000..0cfc7672
--- /dev/null
+++ b/examples/TTSwithVerification/spatialmeta.py
@@ -0,0 +1,517 @@
+"""
+SpatialMap meta-prompt run: a structured few-shot system prompt guides the model; no monitor is attached.
+
+A StepVerifierSpatialMapMonitor is still built per example, but only its relation/claim counts are logged (stream_completion gets monitors=()).
+"""
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+import re
+import numpy as np
+from pathlib import Path
+
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+from interwhen import stream_completion
+from interwhen.monitors import StepVerifierSpatialMapMonitor
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+logger = logging.getLogger(__name__)
+
+# ============== MODEL CONFIGURATION ==============
+MAIN_MODEL = "microsoft/Phi-4-reasoning"
+# =================================================
+
+
+def get_model_short_name(model_name: str) -> str:
+    """Return the text after the last "/" of model_name, made filesystem-safe."""
+    _, _, tail = model_name.rpartition("/")
+    # Spaces become underscores and colons become hyphens.
+    return tail.translate(str.maketrans({" ": "_", ":": "-"}))
+
+
+def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/SpatialMapResults/metaPrompt"):
+    """Create the per-model output folders on disk and return their paths.
+
+    The result maps "base" to base_dir joined with the short model name and
+    "reasoning" to a "Reasoning_output" folder inside that base folder.
+    """
+    root = os.path.join(base_dir, get_model_short_name(main_model))
+    reasoning_root = os.path.join(root, "Reasoning_output")
+    layout = {"base": root, "reasoning": reasoning_root}
+    # exist_ok=True makes repeated runs against the same folders harmless.
+    for folder in layout.values():
+        os.makedirs(folder, exist_ok=True)
+
+    return layout
+
+
+def remove_last_paragraph(s: str, tail_len: int = 143) -> str:
+    """Cut the last tail_len characters (assumed to be the instruction paragraph) off s; input no longer than tail_len is returned as-is."""
+    return s[:-tail_len] if tail_len > 0 and len(s) > tail_len else s
+
+
+def get_question_type(idx: int) -> str:
+    """Classify a dataset index by the band of the 1500-example split it falls in.
+
+    Bands, in index order:
+    - below 500: "direction" (Q0, direction finding)
+    - 500 up to 999: "object" (Q1, object finding)
+    - 1000 and above: "counting" (Q2, counting)
+    """
+    # Ascending exclusive upper bounds; the first band that contains idx wins.
+    for upper_bound, label in ((500, "direction"), (1000, "object")):
+        if idx < upper_bound:
+            return label
+
+    return "counting"
+
+
+def build_meta_prompt_from_example(example):
+    """Return (system_prompt, user_prompt): the fixed few-shot instructions and the example's trimmed "prompt" field."""
+    
+    # User text: the "prompt" field, stringified, then passed through remove_last_paragraph
+    description = example.get("prompt")
+    description = str(description)
+    description = remove_last_paragraph(description)
+    
+    pre_prompt = """You are a spatial reasoning expert. Given a description of objects on a map and their relative positions, analyze the spatial relationships step by step.
+
+CRITICAL INSTRUCTION: DO NOT use abbreviations or initials for entity names. Always use the COMPLETE FULL NAME of each entity exactly as given in the problem. For example, write "Police Supply Store" not "PSS" or "PS".
+
+DIRECTION DEFINITIONS (Diagonal Directions):
+- Northwest = up and to the left (row decreases, col decreases)
+- Northeast = up and to the right (row decreases, col increases)  
+- Southwest = down and to the left (row increases, col decreases)
+- Southeast = down and to the right (row increases, col increases)
+
+CARDINAL DIRECTIONS (for questions asking about North/South/East/West):
+- North = directly up - requires BOTH Northwest AND Northeast relationships to be confirmed
+- South = directly down - requires BOTH Southwest AND Southeast relationships to be confirmed
+- West = directly left - requires BOTH Northwest AND Southwest relationships to be confirmed
+- East = directly right - requires BOTH Northeast AND Southeast relationships to be confirmed
+
+IMPORTANT: In this dataset, only diagonal relationships (NW/NE/SW/SE) are given. An object can ONLY be in a pure cardinal direction (N/S/E/W) if BOTH required diagonal relationships exist.
+
+IMPORTANT RULES:
+- Directions are TRANSITIVE: If A is Northwest of B, and B is Northwest of C, then A is Northwest of C.
+- Directions are REVERSIBLE: If A is Northwest of B, then B is Southeast of A.
+- Opposite pairs: Northwest ↔ Southeast, Northeast ↔ Southwest
+
+STRUCTURED OUTPUT FORMAT:
+
+═══════════════════════════════════════════════════════════════════════════════
+EXAMPLE 1: Direction Finding (Q0)
+═══════════════════════════════════════════════════════════════════════════════
+
+Map Description:
+Police Supply Store is in the map. Narwhal's Novelties is to the Northwest of Police Supply Store. Coral Crafts is to the Northwest of Narwhal's Novelties. Coral Crafts is to the Northwest of Police Supply Store. Planetarium Prints is to the Southeast of Coral Crafts. Planetarium Prints is to the Northeast of Police Supply Store. Oz Oddities is to the Southwest of Planetarium Prints. Oz Oddities is to the Southwest of Police Supply Store. Ice Queen Ice Cream is to the Northwest of Planetarium Prints. Ice Queen Ice Cream is to the Southeast of Coral Crafts.
+
+Question: In which direction is Planetarium Prints relative to Police Supply Store?
+
+### Final Answer
+
+>>> STEP 1: PARSE RELATIONSHIPS
+    - Narwhal's Novelties is to the Northwest of Police Supply Store
+    - Coral Crafts is to the Northwest of Narwhal's Novelties
+    - Coral Crafts is to the Northwest of Police Supply Store
+    - Planetarium Prints is to the Southeast of Coral Crafts
+    - Planetarium Prints is to the Northeast of Police Supply Store
+    - Oz Oddities is to the Southwest of Planetarium Prints
+    - Oz Oddities is to the Southwest of Police Supply Store
+    - Ice Queen Ice Cream is to the Northwest of Planetarium Prints
+    - Ice Queen Ice Cream is to the Southeast of Coral Crafts
+
+>>> STEP 2: FIND DIRECT RELATIONSHIP
+    - Looking for: Planetarium Prints relative to Police Supply Store
+    - Direct relationship found: "Planetarium Prints is to the Northeast of Police Supply Store"
+
+>>> STEP 3: ANSWER
+    - Planetarium Prints is to the NORTHEAST of Police Supply Store.
+    
+>>> FINAL ANSWER: Northeast
+    \\boxed{A}
+
+═══════════════════════════════════════════════════════════════════════════════
+EXAMPLE 2: Object Finding (Q1)
+═══════════════════════════════════════════════════════════════════════════════
+
+Map Description:
+Quail's Quilts is in the map. Olive's Oils is to the Southeast of Quail's Quilts. Lumber's Marketplace is to the Northeast of Olive's Oils. Lumber's Marketplace is to the Northeast of Quail's Quilts. Stingray Shoes is to the Northeast of Quail's Quilts. Stingray Shoes is to the Northwest of Lumber's Marketplace. Elephant's Electronics is to the Northeast of Olive's Oils. Elephant's Electronics is to the Northeast of Lumber's Marketplace. Blossom Boutique is to the Northwest of Elephant's Electronics. Blossom Boutique is to the Southeast of Stingray Shoes.
+
+Question: Which object is in the Southwest of Lumber's Marketplace?
+
+### Final Answer
+
+>>> STEP 1: PARSE RELATIONSHIPS
+    - Olive's Oils is to the Southeast of Quail's Quilts
+    - Lumber's Marketplace is to the Northeast of Olive's Oils
+    - Lumber's Marketplace is to the Northeast of Quail's Quilts
+    - Stingray Shoes is to the Northeast of Quail's Quilts
+    - Stingray Shoes is to the Northwest of Lumber's Marketplace
+    - Elephant's Electronics is to the Northeast of Olive's Oils
+    - Elephant's Electronics is to the Northeast of Lumber's Marketplace
+    - Blossom Boutique is to the Northwest of Elephant's Electronics
+    - Blossom Boutique is to the Southeast of Stingray Shoes
+
+>>> STEP 2: FIND OBJECTS IN SOUTHWEST OF Lumber's Marketplace
+    - Using reversibility: if Lumber's Marketplace is to the Northeast of X, then X is to the Southwest of Lumber's Marketplace.
+    - Scanning relationships for "Lumber's Marketplace is to the Northeast of X":
+    - "Lumber's Marketplace is to the Northeast of Olive's Oils" → Olive's Oils is SOUTHWEST of Lumber's Marketplace ✓
+    - "Lumber's Marketplace is to the Northeast of Quail's Quilts" → Quail's Quilts is SOUTHWEST of Lumber's Marketplace ✓
+    - Other objects:
+    - Stingray Shoes is Northwest of Lumber's Marketplace → NOT Southwest
+    - Elephant's Electronics is Northeast of Lumber's Marketplace → NOT Southwest
+    - Blossom Boutique: no direct relationship to Lumber's Marketplace given
+    - Objects in Southwest of Lumber's Marketplace: Olive's Oils, Quail's Quilts
+    - Checking options: Quail's Quilts matches option D.
+
+>>> STEP 3: ANSWER
+    - Quail's Quilts is in the Southwest of Lumber's Marketplace.
+    
+>>> FINAL ANSWER: Quail's Quilts
+    \\boxed{D}
+
+═══════════════════════════════════════════════════════════════════════════════
+EXAMPLE 3: Counting (Q2)
+═══════════════════════════════════════════════════════════════════════════════
+
+Map Description:
+Tremor Toys is in the map. Fresh Foods is to the Northeast of Tremor Toys. Salmon Sushi is to the Northeast of Fresh Foods. Salmon Sushi is to the Northeast of Tremor Toys. Recycle Center is to the Northeast of Fresh Foods. Recycle Center is to the Southeast of Salmon Sushi. Wolf's Wardrobe is to the Southeast of Fresh Foods. Wolf's Wardrobe is to the Southeast of Tremor Toys. Mantis's Maps is to the Southeast of Salmon Sushi. Mantis's Maps is to the Southeast of Fresh Foods.
+
+Question: How many objects are in the Southwest of Mantis's Maps?
+
+### Final Answer
+
+>>> STEP 1: PARSE RELATIONSHIPS
+    - Fresh Foods is to the Northeast of Tremor Toys
+    - Salmon Sushi is to the Northeast of Fresh Foods
+    - Salmon Sushi is to the Northeast of Tremor Toys
+    - Recycle Center is to the Northeast of Fresh Foods
+    - Recycle Center is to the Southeast of Salmon Sushi
+    - Wolf's Wardrobe is to the Southeast of Fresh Foods
+    - Wolf's Wardrobe is to the Southeast of Tremor Toys
+    - Mantis's Maps is to the Southeast of Salmon Sushi
+    - Mantis's Maps is to the Southeast of Fresh Foods
+
+>>> STEP 2: COUNT OBJECTS IN SOUTHWEST OF Mantis's Maps
+    - Using reversibility: if Mantis's Maps is to the Southeast of X, then X is to the Northwest of Mantis's Maps (NOT Southwest!).
+    - For X to be Southwest of Mantis's Maps, we need: "Mantis's Maps is to the Northeast of X" or "X is to the Southwest of Mantis's Maps".
+    - Scanning ALL relationships involving Mantis's Maps:
+    - Mantis's Maps is to the Southeast of Salmon Sushi → Salmon Sushi is NORTHWEST of Mantis's Maps (not Southwest)
+    - Mantis's Maps is to the Southeast of Fresh Foods → Fresh Foods is NORTHWEST of Mantis's Maps (not Southwest)
+    - No other relationships mention Mantis's Maps directly.
+    - Checking each object for SOUTHWEST relationship to Mantis's Maps:
+    - Tremor Toys: No direct relationship to Mantis's Maps given. Cannot determine.
+    - Fresh Foods: Northwest of Mantis's Maps (not Southwest)
+    - Salmon Sushi: Northwest of Mantis's Maps (not Southwest)
+    - Recycle Center: No direct relationship to Mantis's Maps given. Cannot determine.
+    - Wolf's Wardrobe: No direct relationship to Mantis's Maps given. Cannot determine.
+    - Count of objects confirmed to be Southwest of Mantis's Maps: 0
+    - But wait - let me check if we can use transitivity:
+    - Wolf's Wardrobe is Southeast of Tremor Toys
+    - Mantis's Maps is Southeast of Fresh Foods, Fresh Foods is Northeast of Tremor Toys
+    - So Mantis's Maps is "more east and south" than Tremor Toys, but exact direction unclear.
+    - Using only DIRECT relationships where we can confirm Southwest: 0 objects.
+    - Checking the options: If 0 is not available, we need to reconsider.
+    - Options available: A. 5, B. 3, C. 2, D. 1
+    - Re-examining with transitivity for Southwest (row increase, col decrease from Mantis's Maps):
+    - For Tremor Toys to be SW of Mantis's Maps: Tremor Toys must be south and west of Mantis's Maps.
+    - Tremor Toys → Fresh Foods (NE) → Mantis's Maps (SE of Fresh Foods)
+    - So Tremor Toys is southwest of Fresh Foods, and Mantis's Maps is southeast of Fresh Foods.
+    - This means Tremor Toys is west of Mantis's Maps, but row comparison is unclear.
+    - Since only 1 object (Tremor Toys) could potentially be SW based on chain reasoning, answer is D. 1.
+
+>>> STEP 3: ANSWER
+    - There is 1 object in the Southwest of Mantis's Maps.
+    
+>>> FINAL ANSWER: 1
+    \\boxed{D}
+
+═══════════════════════════════════════════════════════════════════════════════
+
+REMINDER: Use the COMPLETE FULL NAME of each entity. DO NOT abbreviate or use initials.
+
+Now solve the following spatial reasoning problem using the EXACT same format."""
+    
+    return pre_prompt, description
+
+
+def extract_solution(text: str) -> "str | None":
+    """Return the last boxed answer after </think>, else an "answer: X" letter, else None.
+
+    One level of nested braces is accepted, so \\boxed{\\text{A}} yields "A"
+    instead of the truncated "\\text{A" that a plain [^}]* capture produces.
+    """
+    answer_section = text.rsplit("</think>", 1)[-1]
+
+    boxed = re.findall(r'\\boxed\{((?:[^{}]|\{[^{}]*\})*)\}', answer_section)
+    if boxed:
+        payload = boxed[-1].strip()
+        # Unwrap a lone LaTeX text wrapper such as \text{A} or \mathbf{A}.
+        wrapped = re.fullmatch(r'\\(?:text|textbf|mathrm|mathbf)\s*\{([^{}]*)\}', payload)
+        return wrapped.group(1).strip() if wrapped else payload
+    fallback = re.search(r'(?:answer|Answer)[:\s]+([A-D])', answer_section)
+    return fallback.group(1).strip() if fallback else None
+
+
+def count_tokens(text: str, tokenizer) -> int:
+    """Return how many token ids tokenizer.encode produces for text."""
+    # Encoding is requested with add_special_tokens=False, exactly as before.
+    return len(tokenizer.encode(text, add_special_tokens=False))
+
+
+def init_llm_server(model_name, max_tokens=22000, port=8001):
+    """Assemble the url, payload and headers used to call the local completions endpoint."""
+    # Sampling settings sent with every request.
+    sampling = {"top_k": 50, "top_p": 0.95, "temperature": 0.8}
+    # Remaining fixed request fields, in the order they appear in the payload.
+    fixed = {
+        "stream": True,
+        "logprobs": 20,
+        "use_beam_search": False,
+        "prompt_cache": True,
+        "seed": 42,
+    }
+    return {
+        "url": f"http://localhost:{port}/v1/completions",
+        "payload": {"model": model_name, "max_tokens": max_tokens, **sampling, **fixed},
+        "headers": {"Content-Type": "application/json"},
+    }
+
+
+def save_output(idx: int, output: str, output_dir: str):
+    """Write output to <output_dir>/output_<idx>.txt as UTF-8, creating the folder if needed."""
+    os.makedirs(output_dir, exist_ok=True)
+    filepath = os.path.join(output_dir, f"output_{idx}.txt")
+    with open(filepath, 'w', encoding="utf-8") as f:  # explicit codec: non-ASCII model text must not depend on the platform default
+        f.write(output)
+    logger.info(f"Saved output to {filepath}")
+
+def evaluate_spatialmap_answer(answer, options, ground_truth):
+    """
+    Grade one SpatialMap multiple-choice response.
+
+    Args:
+        answer: Raw model output; the final answer is pulled out with extract_solution
+        options: Mapping from option letter to that option's text
+        ground_truth: Expected answer text (stringified and stripped before comparing)
+
+    Returns:
+        Tuple (is_correct, extracted_answer, message); extracted_answer is None
+        when nothing could be extracted from the output
+    """
+    picked = extract_solution(answer)
+    expected = str(ground_truth).strip()
+
+    if not picked:
+        return False, None, "No expression found"
+    picked = picked.strip()
+
+    # Letter answer: grade the option text stored under that letter.
+    if picked in options:
+        chosen_text = options[picked]
+        if chosen_text != expected:
+            return False, picked, f"Incorrect: expected '{expected}', got '{chosen_text}' (option {picked})"
+        return True, picked, f"Correct: option {picked} -> {chosen_text}"
+
+    # Free-text answer: compare case-insensitively with the ground truth
+    # first, then with each option's text.
+    folded = picked.lower()
+    if folded == expected.lower():
+        return True, picked, f"Correct: answer text matches ground truth: {picked}"
+
+    for letter, option_text in options.items():
+        if folded != option_text.lower():
+            continue
+        if option_text == expected:
+            return True, picked, f"Correct: answer text {picked} (option {letter})"
+        return False, picked, f"Incorrect: expected '{expected}', got '{option_text}' (option {letter})"
+
+    return False, picked, f"Solution '{picked}' not found in options or ground truth"
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run SpatialMap experiments with StepVerifierSpatialMapMonitor")
+    parser.add_argument("--model", type=str, default=MAIN_MODEL,
+                        help="Model name for generation")
+    parser.add_argument("--indices", type=str, default=None,
+                        help="Comma-separated indices to run (e.g., '0,100,200')")
+    parser.add_argument("--start", type=int, default=0, help="Start index")
+    parser.add_argument("--end", type=int, default=1500, help="End index")
+    parser.add_argument("--num_examples", "-n", type=int, default=None,
+                        help="Number of examples to run (overrides start/end)")
+    parser.add_argument("--max_corrections", type=int, default=5,
+                        help="Maximum number of correction attempts per example")
+    parser.add_argument("--port", type=int, default=8001, help="vLLM server port")
+    parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logging")
+    args = parser.parse_args()
+    
+    if args.debug:
+        logging.getLogger().setLevel(logging.DEBUG)
+    
+    # Load dataset (spatial_map_text_only has 1500 examples)
+    dataset = load_dataset("microsoft/VISION_LANGUAGE", 'spatial_map_text_only', split='val')
+    
+    # Request settings (url, payload, headers) for the completions endpoint on --port
+    llm_server = init_llm_server(args.model, port=args.port)
+    
+    # Load tokenizer for accurate token counting
+    logger.info(f"Loading tokenizer for {args.model}...")
+    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+    logger.info("Tokenizer loaded successfully.")
+    
+    # Setup output directory
+    output_dirs = get_output_dirs(args.model)
+    reason_dir = output_dirs["reasoning"]
+    
+    # Pick examples: explicit --indices, else an even sample of --num_examples, else range(--start, --end)
+    max_idx = len(dataset) - 1
+    if args.indices:
+        indices = [int(x.strip()) for x in args.indices.split(",")]
+    elif args.num_examples:
+        # Sample evenly across all 1500 examples (0-1499)
+        indices = np.linspace(0, min(max_idx, 1499), args.num_examples, dtype=int)
+    else:
+        indices = range(args.start, min(args.end, max_idx + 1))
+    
+    # Stats tracking
+    results = []
+    total_correct = 0
+    total_examples = 0
+    total_reasoning_tokens = 0
+    
+    # Per-type stats
+    stats_by_type = {
+        "direction": {"total": 0, "correct": 0},
+        "object": {"total": 0, "correct": 0},
+        "counting": {"total": 0, "correct": 0},
+    }
+    
+    for idx in indices:
+        example = dataset[idx]
+        system_prompt, user_prompt = build_meta_prompt_from_example(example)
+        if str(example.get("ground_truth", "")).strip() == "Q4":
+                target_options = ["A", "B"]
+        else:
+            target_options = ["A", "B", "C", "D"] 
+        keys = "|".join(map(re.escape, target_options))
+        pattern = r'\b([A-D])\.\s*(.*?)(?=\s*[A-D]\.|$)'
+        raw = re.findall(pattern, user_prompt, flags=re.DOTALL)
+        # NOTE(review): target_options and keys are never read afterwards; the pattern hard-codes A-D.
+        options = {k: v.strip().rstrip(".") for k, v in raw}
+        
+        # Determine question type
+        question_type = get_question_type(idx)
+        
+        # Build full prompt with Phi-4-reasoning ChatML format
+        full_prompt = f"<|im_start|>system<|im_sep|>\n{system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{user_prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n<think>\n"
+        
+        logger.info(f"\n{'='*60}")
+        logger.info(f"Example {idx} ({question_type})")
+        logger.info(f"{'='*60}")
+        
+        # Build the monitor from the problem text; it is only used for the stats logged below
+        monitor = StepVerifierSpatialMapMonitor.from_prompt(
+            problem_text=user_prompt,
+            max_corrections=args.max_corrections,
+            name="spatialmap_step_verifier"
+        )
+        
+        logger.info(f"Z3 solver initialized with {len(monitor.z3_solver.parsed_relations)} relations")
+        
+        # Run with stream_completion; monitors=() means the monitor above is not attached
+        try:
+            answer = asyncio.run(stream_completion(
+                full_prompt,
+                llm_server=llm_server,
+                monitors=(),
+                add_delay=False,
+                termination_requires_validation=False,
+                async_execution=True
+            ))
+        except Exception as e:
+            logger.error(f"Error running example {idx}: {e}")
+            import traceback
+            traceback.print_exc()
+            continue
+        
+        # Count generated tokens
+        reasoning_tokens = count_tokens(answer, tokenizer)
+        total_reasoning_tokens += reasoning_tokens
+        
+        # Evaluate the answer
+        gt_sol = str(example.get("ground_truth", "")).strip()
+        is_correct, extracted_answer, message = evaluate_spatialmap_answer(answer, options, gt_sol)
+        
+        if extracted_answer:
+            logger.info(f"Extracted answer: {extracted_answer}")
+        logger.info(message)
+        
+        if is_correct:
+            total_correct += 1
+            stats_by_type[question_type]["correct"] += 1
+            
+        total_examples += 1
+        stats_by_type[question_type]["total"] += 1
+        # Save output
+        save_output(idx, answer, reason_dir)
+        
+        # Log result
+        result = {
+            'idx': int(idx),
+            'question_type': question_type,
+            'correct': is_correct,
+            'sol': extracted_answer,
+            'gt': gt_sol,
+            'reasoning_tokens': reasoning_tokens,
+            'num_relations': len(monitor.z3_solver.parsed_relations),
+            'verified_claims': len(monitor.verified_claims),
+        }
+        results.append(result)
+        
+        logger.info(f"Result: sol={extracted_answer}, gt={gt_sol}, correct={is_correct}")
+        logger.info(f"Verified claims: {len(monitor.verified_claims)}")
+        logger.info(f"Reasoning tokens: {reasoning_tokens}")
+    
+    # Compute final metrics
+    accuracy = total_correct / total_examples if total_examples > 0 else 0
+    avg_reasoning_tokens = total_reasoning_tokens / total_examples if total_examples > 0 else 0
+    
+    logger.info(f"\n{'='*60}")
+    logger.info(f"FINAL RESULTS")
+    logger.info(f"{'='*60}")
+    logger.info(f"Total examples: {total_examples}")
+    logger.info(f"Correct: {total_correct}")
+    logger.info(f"Accuracy: {accuracy:.4f} ({total_correct}/{total_examples})")
+    logger.info(f"Total reasoning tokens: {total_reasoning_tokens}")
+    logger.info(f"Avg reasoning tokens: {avg_reasoning_tokens:.1f}")
+    
+    # Per-type breakdown
+    logger.info(f"\nPer-type breakdown:")
+    for qtype, stats in stats_by_type.items():
+        if stats["total"] > 0:
+            acc = stats["correct"] / stats["total"]
+            logger.info(f"  {qtype}: {acc:.4f} ({stats['correct']}/{stats['total']})")
+    
+    # Save summary
+    summary = {
+        'model': args.model,
+        'total_examples': total_examples,
+        'correct': total_correct,
+        'accuracy': accuracy,
+        'total_reasoning_tokens': total_reasoning_tokens,
+        'avg_reasoning_tokens': avg_reasoning_tokens,
+        'max_corrections': args.max_corrections,
+        'stats_by_type': stats_by_type,
+        'results': results,
+    }
+    
+    summary_path = os.path.join(output_dirs["base"], "summary.json")
+    with open(summary_path, 'w') as f:
+        json.dump(summary, f, indent=2)
+    logger.info(f"\nSaved summary to {summary_path}")
\ No newline at end of file
diff --git a/interwhen/monitors/stepVerifier.py b/interwhen/monitors/stepVerifier.py
index ca5fc4b0..be73b80f 100644
--- a/interwhen/monitors/stepVerifier.py
+++ b/interwhen/monitors/stepVerifier.py
@@ -12,7 +12,43 @@
     extract_step2_claims, verify_spatialmap_step, format_spatialmap_feedback
 )
 
-    
+
+def _find_complete_boxed(text: str):
+    """Locate the first non-empty, brace-balanced \\boxed{...} in text.
+
+    Braces are counted, so nested groups inside the box are handled.
+    Returns an object exposing .start(), .end() and .group() for the whole
+    \\boxed{...} span, or None when no occurrence is closed and non-blank.
+    """
+    opener = r'\boxed{'
+
+    class _BoxedMatch:
+        # Span holder offering the few re.Match-style accessors defined here.
+        def __init__(self, s, e):
+            self._start, self._end = s, e
+        def start(self):
+            return self._start
+        def end(self):
+            return self._end
+        def group(self, n=0):
+            return text[self._start:self._end]
+
+    pos = text.find(opener)
+    while pos != -1:
+        body_start = pos + len(opener)
+        depth, cursor = 1, body_start
+        # Advance until the brace opened by \boxed{ is balanced or text ends.
+        while depth and cursor < len(text):
+            ch = text[cursor]
+            depth += (ch == '{') - (ch == '}')
+            cursor += 1
+        # On success cursor sits just past the closing brace.
+        if depth == 0 and text[body_start:cursor - 1].strip():
+            return _BoxedMatch(pos, cursor)
+        pos = text.find(opener, pos + 1)
+    return None
+
+
 class StepVerifierGame24Monitor(VerifyMonitor):
     """
     Step-by-step Game of 24 verifier monitor.
@@ -739,7 +775,7 @@ def _step_extractor_relative_position(
         3. LOCATE section is complete and analysis has started (verify LOCATE)
         """
         # Check for boxed answer first (highest priority)
-        boxed_match = re.search(r'\\boxed\{[^}]+\}', text)
+        boxed_match = _find_complete_boxed(text)
         if boxed_match:
             # Found answer, verify it (include full text up to boxed answer)
             end_pos = text_start_in_generated + boxed_match.end()
@@ -989,7 +1025,7 @@ def step_extractor(self, chunk: str, generated_text: str) -> Tuple[bool, Optiona
                     return True, generated_text[:end_pos]
         
         # Check for boxed answer (trigger final verification)
-        boxed_match = re.search(r'\\boxed\{[^}]+\}', text)
+        boxed_match = _find_complete_boxed(text)
         if boxed_match:
             # Verify any remaining claims before final answer
             new_claims = self._extract_new_claims(generated_text)
diff --git a/interwhen/monitors/thinkingPhaseVerifier.py b/interwhen/monitors/thinkingPhaseVerifier.py
index 95a8535f..a6f247dc 100644
--- a/interwhen/monitors/thinkingPhaseVerifier.py
+++ b/interwhen/monitors/thinkingPhaseVerifier.py
@@ -109,6 +109,54 @@
 )
 
 
+def _find_complete_boxed(text: str) -> Optional[re.Match]:
+    """Locate the first non-empty, brace-balanced \\boxed{...} in text.
+
+    A naive non-nesting pattern such as ``[^}]+`` stops at the first ``}``
+    and cuts LaTeX like ``\\boxed{12\\frac{1}{2}}`` short; here braces are
+    counted so the match runs to the brace that really closes the box.
+
+    Occurrences are tried left to right; an unclosed or empty box is skipped.
+
+    Returns a small ``re.Match``-like object whose ``.start()`` / ``.end()``
+    delimit the whole ``\\boxed{...}`` (outer braces included) and whose
+    ``.group()`` returns that text, or ``None`` when no occurrence is both
+    closed and non-blank.
+    """
+    opener = r'\boxed{'
+
+    class _BoxedMatch:
+        # Span holder offering the few re.Match-style accessors defined here.
+        def __init__(self, s, e):
+            self._start, self._end = s, e
+        def start(self):
+            return self._start
+        def end(self):
+            return self._end
+        def group(self, n=0):
+            # n is accepted but ignored; the full span is always returned.
+            return text[self._start:self._end]
+
+    pos = text.find(opener)
+    while pos != -1:
+        body_start = pos + len(opener)
+        depth, cursor = 1, body_start
+        # Advance until the brace opened by the opener is balanced or text ends.
+        while depth > 0 and cursor < len(text):
+            ch = text[cursor]
+            if ch == '{':
+                depth += 1
+            elif ch == '}':
+                depth -= 1
+            cursor += 1
+        # On success cursor sits just past the closing brace.
+        if depth == 0 and text[body_start:cursor - 1].strip():
+            return _BoxedMatch(pos, cursor)
+        # Unclosed or blank payload: retry from the next occurrence.
+        pos = text.find(opener, pos + 1)
+    return None
+
+
 def _extract_numbers_from_expr(expr: str) -> List[float]:
     """Extract all numbers (integers and decimals) from an expression string."""
     numbers = re.findall(r'\d+\.?\d*', expr)
@@ -284,6 +332,7 @@ def _extract_braced_expression(text: str) -> Optional[str]:
         # Basic cleanup: remove LaTeX
         expr = expr.replace(r'\times', '*').replace(r'\cdot', '*').replace(r'\div', '/')
         expr = expr.replace(r'\,', '').replace(r'\ ', '')
+        expr = expr.replace(r'\left', '').replace(r'\right', '')
         # Replace Unicode math operators (QwQ frequently uses these)
         expr = expr.replace('\u00d7', '*').replace('\u00f7', '/').replace('\u2212', '-')
         expr = expr.replace('\u2013', '-').replace('\u2014', '-')  # en-dash, em-dash
@@ -316,6 +365,7 @@ def _extract_boxed_expression(text: str) -> Optional[str]:
         expr = text[start:end - 1].strip()
         expr = expr.replace(r'\times', '*').replace(r'\cdot', '*').replace(r'\div', '/')
         expr = expr.replace(r'\,', '').replace(r'\ ', '')
+        expr = expr.replace(r'\left', '').replace(r'\right', '')
         # Replace Unicode math operators (QwQ frequently uses these)
         expr = expr.replace('\u00d7', '*').replace('\u00f7', '/').replace('\u2212', '-')
         expr = expr.replace('\u2013', '-').replace('\u2014', '-')  # en-dash, em-dash
@@ -449,7 +499,7 @@ def step_extractor(self, chunk: str, generated_text: str):
             last_feedback_end = match.end()
         text = text_after_think[last_feedback_end:]
 
-        has_boxed = re.search(r'\\boxed\{[^}]+\}', text)
+        has_boxed = _find_complete_boxed(text)
         if has_boxed:
             return True, generated_text
 
@@ -1227,7 +1277,7 @@ def step_extractor(self, chunk: str, generated_text: str):
                     return True, generated_text[:end_pos]
 
         # Check for boxed answer (any question type)
-        boxed = re.search(r'\\boxed\{[^}]+\}', text)
+        boxed = _find_complete_boxed(text)
         if boxed:
             end_pos = text_start + boxed.end()
             return True, generated_text[:end_pos]
@@ -2045,7 +2095,7 @@ def step_extractor(self, chunk: str, generated_text: str):
                     return True, generated_text[:end_pos]
 
         # Check for boxed answer (trigger final verification)
-        boxed_match = re.search(r'\\boxed\{[^}]+\}', text)
+        boxed_match = _find_complete_boxed(text)
         if boxed_match:
             new_claims = self._extract_new_claims(text)
             if new_claims:
@@ -2226,7 +2276,7 @@ async def verify(self, step: str, token_index: int, event, event_info):
                 return step, feedback
 
         # --- Check for boxed answer ---
-        boxed_match = re.search(r'\\boxed\{[^}]+\}', recent_text)
+        boxed_match = _find_complete_boxed(recent_text)
         if boxed_match:
 
             # ==========================================================

From d88eb0e531d3eea28e481c831d7ceec44a929988 Mon Sep 17 00:00:00 2001
From: root
 <root@hp-reasoning-t-vishakbhat-interwhen3-4gpus-trainer-0.hp-reasoning-t-vishakbhat-interwhen3-4gpus.bonete11.svc.cluster.local>
Date: Tue, 3 Mar 2026 08:11:15 +0000
Subject: [PATCH 10/17] sanity

---
 examples/EarlyStopping/game24_example.py      |  82 ++++++----
 examples/EarlyStopping/maze_example.py        |  93 +++++------
 examples/EarlyStopping/spatialmap_example.py  |  87 +++++-----
 .../game24_stepverifier.py                    |  41 ++---
 .../TTSwithVerification/maze_stepverifier.py  | 153 ++++++++----------
 examples/TTSwithVerification/mazemeta.py      |  41 +++--
 6 files changed, 245 insertions(+), 252 deletions(-)

diff --git a/examples/EarlyStopping/game24_example.py b/examples/EarlyStopping/game24_example.py
index 82df71aa..f1c3f2a0 100644
--- a/examples/EarlyStopping/game24_example.py
+++ b/examples/EarlyStopping/game24_example.py
@@ -15,7 +15,7 @@
 
 # ============== MODEL CONFIGURATION ==============
 # Change these model names to scale experiments easily
-MAIN_MODEL = "microsoft/Phi-4-reasoning"
+MAIN_MODEL = "Qwen/QwQ-32B"
 EARLYSTOP_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 # =================================================
 
@@ -25,7 +25,7 @@ def get_model_short_name(model_name: str) -> str:
     short_name = short_name.replace(" ", "_").replace(":", "-")
     return short_name
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_Kstable2/Gameof24_results"):
+def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY/Gameof24_results"):
     """Create and return output directory paths based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -42,14 +42,14 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_Kstable2/Gam
     
     return dirs
 
-def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_Kstable2/Gameof24_results") -> str:
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY/Gameof24_results") -> str:
     """Generate log filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     os.makedirs(output_base, exist_ok=True)
     return os.path.join(output_base, f"EAT_{num_examples}examples.log")
 
-def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_Kstable2/Gameof24_results") -> str:
+def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY/Gameof24_results") -> str:
     """Generate token CSV filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -73,9 +73,11 @@ def init_llm_server(modelname, max_tokens=200, port=8000):
     payload = {
         "model": modelname,
         "max_tokens": max_tokens,
-        "top_k": 50,
+        "top_k": 20,
         "top_p": 0.95,
-        "temperature": 0.8,
+        "min_p": 0.0,
+        "do_sample" : True,
+        "temperature": 0.6,
         "stream": True,
         "logprobs": 20,
         "use_beam_search": False,
@@ -112,10 +114,22 @@ def count_tokens(text, tokenizer):
 
 def extract_solution(text):
     
+    # Only search for \boxed{} AFTER </think> to avoid grabbing unverified
+    # expressions from inside the thinking trace.
+    # If model opened <think> but never closed it (hit token limit), there is
+    # no final answer — return None.
+    if '</think>' in text:
+        search_text = text[text.rfind('</think>'):]
+    elif '<think>' in text:
+        # Model started thinking but never finished — no verified answer
+        return None
+    else:
+        search_text = text
+
     # Use a more robust extraction that handles nested braces in \boxed{}
     # Find \boxed{ and then match braces properly
     boxed_pattern = r"\\boxed\{"
-    matches = list(re.finditer(boxed_pattern, text))
+    matches = list(re.finditer(boxed_pattern, search_text))
     if not matches:
         return None
     
@@ -124,14 +138,18 @@ def extract_solution(text):
     start = last_match.end()  # Position right after \boxed{
     brace_count = 1
     end = start
-    while end < len(text) and brace_count > 0:
-        if text[end] == '{':
+    while end < len(search_text) and brace_count > 0:
+        if search_text[end] == '{':
             brace_count += 1
-        elif text[end] == '}':
+        elif search_text[end] == '}':
             brace_count -= 1
         end += 1
     
-    expr = text[start:end-1].strip()  # -1 to exclude the closing brace
+    expr = search_text[start:end-1].strip()  # -1 to exclude the closing brace
+
+    # Skip empty \boxed{} (e.g., from verifier feedback "Wrap in \boxed{}.")
+    if not expr:
+        return None
 
     # 1. Convert \frac{a}{b} to (a/b)
     frac_pattern = r"\\frac\{([^{}]+)\}\{([^{}]+)\}"
@@ -147,10 +165,17 @@ def extract_solution(text):
     for latex, op in replacements.items():
         expr = expr.replace(latex, op)
 
+    # 2b. Replace Unicode math operators (QwQ frequently uses these)
+    expr = expr.replace('\u00d7', '*').replace('\u00f7', '/').replace('\u2212', '-')
+    expr = expr.replace('\u2013', '-').replace('\u2014', '-')  # en-dash, em-dash
+
     # 3. Cleanup (remove LaTeX formatting artifacts)
     expr = expr.replace(r"\,", "").replace(r"\ ", "")
     expr = expr.replace(r"\left", "").replace(r"\right", "")
 
+    # 3b. Strip trailing "= <number>" (e.g., "10 - 8/8 * 1 = 24" -> "10 - 8/8 * 1")
+    expr = re.sub(r'\s*=\s*[\d.]+\s*$', '', expr)
+
     # 4. Handle implicit multiplication (e.g., "(11+1)(1+1)" -> "(11+1)*(1+1)")
     # Insert * between: )( , )number, number(, )(
     expr = re.sub(r'\)\s*\(', ')*(', expr)  # )( -> )*(
@@ -183,7 +208,6 @@ def evaluate_expression(expr, expected_nums=None):
     except Exception:
         return False
 
-
 def evaluate_game24_answer(answer, nums):
     """
     Evaluate a Game24 answer and return (is_correct, expr, error_message).
@@ -214,7 +238,7 @@ def evaluate_game24_answer(answer, nums):
 
     parser = argparse.ArgumentParser(description="Game of 24 step-by-step solver with monitors")
     parser.add_argument("--thinking", "-t", action="store_true", help="Enable chain-of-thought output")
-    parser.add_argument("--monitor", "-m", default = True, action="store_true", help="Enable step-by-step monitor")
+    parser.add_argument("--monitor", "-m", default = False, action="store_true", help="Enable step-by-step monitor")
     parser.add_argument("--num_examples", "-n", type=int, default=1362, help="Number of examples to run")
     parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logs")
     parser.add_argument("--main_model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
@@ -249,7 +273,7 @@ def evaluate_game24_answer(answer, nums):
 
     dataset = load_game24_dataset()
 
-    llm_server = init_llm_server(main_model, max_tokens=20000)
+    llm_server = init_llm_server(main_model, max_tokens=32768, port=8000)
 
     # Load tokenizer for accurate token counting
     logger.info(f"Loading tokenizer for {main_model}...")
@@ -297,23 +321,23 @@ def evaluate_game24_answer(answer, nums):
         logger.info(f"---- Example {idx+1} ----")
         logger.info(f"Numbers: {nums}")
 
-        system_prompt = (
-            "You are Phi, a language model trained by Microsoft to help users. "
-            "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
-            "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
-            "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
-            "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
-            "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
-            "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
-            "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
-            "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
-            "explorations, and reflections from the Thought section, systematically present the final solution that you "
-            "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
-            "to reach the conclusion. Now, try to solve the following question through the above guidelines."
-        )
+        # system_prompt = (
+        #     "You are Phi, a language model trained by Microsoft to help users. "
+        #     "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
+        #     "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
+        #     "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
+        #     "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
+        #     "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
+        #     "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
+        #     "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
+        #     "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
+        #     "explorations, and reflections from the Thought section, systematically present the final solution that you "
+        #     "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
+        #     "to reach the conclusion. Now, try to solve the following question through the above guidelines."
+        # )
 
         answer = asyncio.run(stream_completion(
-            f"<|im_start|>system<|im_sep|>\n{system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n",
+            f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n",
             llm_server=llm_server,
             monitors=monitors,
             add_delay=False,
diff --git a/examples/EarlyStopping/maze_example.py b/examples/EarlyStopping/maze_example.py
index 175624dd..cc378aaa 100644
--- a/examples/EarlyStopping/maze_example.py
+++ b/examples/EarlyStopping/maze_example.py
@@ -17,7 +17,7 @@
 
 # ============== MODEL CONFIGURATION ==============
 # Change these model names to scale experiments easily
-MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
+MAIN_MODEL = "Qwen/QwQ-32B"
 EARLYSTOP_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 # =================================================
 
@@ -28,7 +28,7 @@ def get_model_short_name(model_name: str) -> str:
     short_name = short_name.replace(" ", "_").replace(":", "-")
     return short_name
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs/MazeResults"):
+def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY/MazeResults"):
     """Create and return output directory paths based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -46,14 +46,14 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs/MazeResults"
     
     return dirs
 
-def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs/MazeResults") -> str:
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY/MazeResults") -> str:
     """Generate log filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     os.makedirs(output_base, exist_ok=True)
     return os.path.join(output_base, f"EAT_{num_examples}examples.log")
 
-def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs/MazeResults") -> str:
+def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY/MazeResults") -> str:
     """Generate token CSV filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -66,10 +66,10 @@ def remove_last_paragraph(s: str) -> str:
 logger = logging.getLogger(__name__)
 
 def load_maze_dataset(split="val"):
-    ds = load_dataset("microsoft/VISION_LANGUAGE", "maze", split=split)
+    ds = load_dataset("microsoft/VISION_LANGUAGE", "maze_text_only", split=split)
     return ds
 
-def init_llm_server(modelname, max_tokens=200, port=8000): #
+def init_llm_server(modelname, max_tokens=200, port=8000):
     url = f"http://localhost:{port}/v1/completions"
     payload = {
         "model": modelname,
@@ -101,22 +101,31 @@ def build_prompt_from_example(example): #(original prompt config)
     return pre_prompt , description
 
 
-def extract_solution(text):
-    # Strip injected <format>...</format> template blocks so we don't
-    # accidentally match the placeholder \boxed{LETTER} from the template.
-    text = re.sub(r'<format>.*?</format>', '', text, flags=re.DOTALL)
-    matches = re.findall(r"\\boxed\{([^}]*)\}", text)
-    if not matches:
-        return None
-
-    expr = matches[-1].strip()   # take last boxed content
-
-    # find one of A/B/C/D inside the boxed content
-    choice_match = re.search(r"\b([ABCD])\b", expr, flags=re.IGNORECASE)
-    if not choice_match:
-        return None
-
-    return choice_match.group(1).upper()
+def extract_solution_mcq(text):
+    """Return the option letter (A-D) chosen in *text*, or None.
+
+    Patterns run strictest (boxed) to loosest, case-insensitively; the
+    last match of the first pattern that yields a letter wins.  The
+    final fallback takes the last standalone capital A-D in the text.
+    """
+    patterns = [
+        r"\\boxed\{([^}]*)\}",  # \boxed{...}
+        r"boxed\{([^}]*)\}",     # boxed{...} without escape
+        r"\*\*([A-D])\*\*",      # **A** format
+        # \b stops "answer cannot"/"answer depends" being read as C/D.
+        r"answer[:\s]*([A-D])\b",  # answer: A format
+        r"(?:^|\n)([A-D])(?:\s|$|\.)",  # Standalone letter
+    ]
+    for pattern in patterns:
+        matches = re.findall(pattern, text, re.IGNORECASE)
+        if matches:
+            expr = matches[-1].strip()
+            choice_match = re.search(r"\b([ABCD])\b", expr, flags=re.IGNORECASE)
+            if choice_match:
+                return choice_match.group(1).upper()
+    # Last resort: look for any standalone capital A, B, C, or D.
+    standalone = re.findall(r"\b([ABCD])\b", text)
+    return standalone[-1].upper() if standalone else None
 
 def save_prompt(idx, prompt_with_answer, reason_dir):
     filename = os.path.join(reason_dir, f"reason_{idx}.txt")
@@ -130,54 +139,30 @@ def count_tokens(text, tokenizer):
     return len(tokens)
 
 
-def evaluate_maze_answer(answer, options, ground_truth):
-    """
-    Evaluate a Maze MCQ answer and return (is_correct, extracted_answer, message).
-    
-    Args:
-        answer: Raw model output
-        options: Dictionary mapping option letters (A/B/C/D) to their values
-        ground_truth: The correct answer value
-        
-    Returns:
-        Tuple of (is_correct, extracted_answer, message)
-    """
-    sol = extract_solution(answer)
+def evaluate_mcq_answer(answer, options, ground_truth):
+    sol = extract_solution_mcq(answer)
     gt_sol = str(ground_truth).strip()
-    
     if not sol:
         return False, None, "No expression found"
-    
     sol = sol.strip()
-    
-    # Case 1: LLM returned option letter (A/B/C/D)
     if sol in options:
         if options[sol] == gt_sol:
             return True, sol, f"Correct: option {sol} -> {options[sol]}"
-        else:
-            return False, sol, f"Incorrect: expected '{gt_sol}', got '{options[sol]}' (option {sol})"
-    
-    # Case 2: LLM returned the actual answer text
-    # First check if sol matches ground truth directly
+        return False, sol, f"Incorrect: expected '{gt_sol}', got '{options[sol]}' (option {sol})"
     if sol.lower() == gt_sol.lower():
         return True, sol, f"Correct: answer text matches ground truth: {sol}"
-    
-    # Check if sol matches any option value
     for opt_letter, opt_value in options.items():
         if sol.lower() == opt_value.lower():
             if opt_value == gt_sol:
                 return True, sol, f"Correct: answer text {sol} (option {opt_letter})"
-            else:
-                return False, sol, f"Incorrect: expected '{gt_sol}', got '{opt_value}' (option {opt_letter})"
-    
+            return False, sol, f"Incorrect: expected '{gt_sol}', got '{opt_value}' (option {opt_letter})"
     return False, sol, f"Solution '{sol}' not found in options or ground truth"
 
-
 if __name__ == "__main__":
 
     parser = argparse.ArgumentParser(description="Maze problem solver with LLM and monitors")
     parser.add_argument("--thinking", "-t", action="store_true", help="Enable chain-of-thought output")
-    parser.add_argument("--monitor", "-m", default = True, action="store_true", help="Enable step-by-step monitor")
+    parser.add_argument("--monitor", "-m", default = False, action="store_true", help="Enable step-by-step monitor")
     parser.add_argument("--num_examples", "-n", type=int, default=1500, help="Number of examples to run")
     parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logs")
     parser.add_argument("--main_model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
@@ -210,7 +195,7 @@ def evaluate_maze_answer(answer, options, ground_truth):
 
     dataset = load_maze_dataset()
 
-    llm_server = init_llm_server(main_model, max_tokens=15000)
+    llm_server = init_llm_server(main_model, max_tokens=32768)
 
     # Load tokenizer for accurate token counting
     logger.info(f"Loading tokenizer for {main_model}...")
@@ -222,7 +207,7 @@ def evaluate_maze_answer(answer, options, ground_truth):
     total_generated_tokens = 0
     generated_token_counts = []
     total = len(dataset)
-    indices = np.linspace(3000, total-1, N, dtype=int).tolist()
+    indices = np.linspace(0, total-1, N, dtype=int).tolist()
 
     for idx in indices:
         example = dataset[idx]
@@ -271,7 +256,7 @@ def evaluate_maze_answer(answer, options, ground_truth):
 
         # Evaluate the answer
         gt_sol = str(example.get("ground_truth", "")).strip()
-        is_correct, extracted_answer, message = evaluate_maze_answer(answer, options, gt_sol)
+        is_correct, extracted_answer, message = evaluate_mcq_answer(answer, options, gt_sol)
         
         if extracted_answer:
             logger.info(f"Extracted answer: {extracted_answer}")
diff --git a/examples/EarlyStopping/spatialmap_example.py b/examples/EarlyStopping/spatialmap_example.py
index 3a0604c2..308d5d56 100644
--- a/examples/EarlyStopping/spatialmap_example.py
+++ b/examples/EarlyStopping/spatialmap_example.py
@@ -17,7 +17,7 @@
 
 # ============== MODEL CONFIGURATION ==============
 # Change these model names to scale experiments easily
-MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
+MAIN_MODEL = "Qwen/QwQ-32B"
 EARLYSTOP_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 # =================================================
 
@@ -28,7 +28,7 @@ def get_model_short_name(model_name: str) -> str:
     short_name = short_name.replace(" ", "_").replace(":", "-")
     return short_name
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs/SpatialMap_results2"):
+def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY/SpatialMap_results"):
     """Create and return output directory paths based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -46,14 +46,14 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs/SpatialMap_r
     
     return dirs
 
-def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs/SpatialMap_results2") -> str:
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY/SpatialMap_results") -> str:
     """Generate log filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     os.makedirs(output_base, exist_ok=True)
     return os.path.join(output_base, f"EAT_{num_examples}examples.log")
 
-def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs/SpatialMap_results2") -> str:
+def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY/SpatialMap_results") -> str:
     """Generate token CSV filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -99,22 +99,32 @@ def build_prompt_from_example(example):
     description = remove_last_paragraph(description)
     return pre_prompt , description
 
-def extract_solution(text):
-    # Strip injected <format>...</format> template blocks so we don't
-    # accidentally match the placeholder \boxed{LETTER} from the template.
-    text = re.sub(r'<format>.*?</format>', '', text, flags=re.DOTALL)
-    matches = re.findall(r"\\boxed\{([^}]*)\}", text)
-    if not matches:
-        return None
-
-    expr = matches[-1].strip()   # take last boxed content
-
-    # find one of A/B/C/D inside the boxed content
-    choice_match = re.search(r"\b([ABCD])\b", expr, flags=re.IGNORECASE)
-    if not choice_match:
-        return None
-
-    return choice_match.group(1).upper()
+def extract_solution_mcq(text):
+    """Return the option letter (A-D) chosen in *text*, or None.
+
+    Patterns run strictest (boxed) to loosest, case-insensitively; the
+    last match of the first pattern that yields a letter wins.  The
+    final fallback takes the last standalone capital A-D in the text.
+    """
+    patterns = [
+        r"\\boxed\{([^}]*)\}",  # \boxed{...}
+        r"boxed\{([^}]*)\}",     # boxed{...} without escape
+        r"\*\*([A-D])\*\*",      # **A** format
+        # \b stops "answer cannot"/"answer depends" being read as C/D.
+        r"answer[:\s]*([A-D])\b",  # answer: A format
+        r"(?:^|\n)([A-D])(?:\s|$|\.)",  # Standalone letter
+    ]
+    for pattern in patterns:
+        matches = re.findall(pattern, text, re.IGNORECASE)
+        if matches:
+            expr = matches[-1].strip()
+            choice_match = re.search(r"\b([ABCD])\b", expr, flags=re.IGNORECASE)
+            if choice_match:
+                return choice_match.group(1).upper()
+    # Last resort: look for any standalone capital A, B, C, or D.
+    standalone = re.findall(r"\b([ABCD])\b", text)
+    return standalone[-1].upper() if standalone else None
+ 
 
 def save_prompt(idx, prompt_with_answer, reason_dir):
     filename = os.path.join(reason_dir, f"reason_{idx}.txt")
@@ -128,46 +138,23 @@ def count_tokens(text, tokenizer):
     return len(tokens)
 
 
-def evaluate_spatialmap_answer(answer, options, ground_truth):
-    """
-    Evaluate a SpatialMap MCQ answer and return (is_correct, extracted_answer, message).
-    
-    Args:
-        answer: Raw model output
-        options: Dictionary mapping option letters (A/B/C/D) to their values
-        ground_truth: The correct answer value
-        
-    Returns:
-        Tuple of (is_correct, extracted_answer, message)
-    """
-    sol = extract_solution(answer)
+def evaluate_mcq_answer(answer, options, ground_truth):
+    sol = extract_solution_mcq(answer)
     gt_sol = str(ground_truth).strip()
-    
     if not sol:
         return False, None, "No expression found"
-    
     sol = sol.strip()
-    
-    # Case 1: LLM returned option letter (A/B/C/D)
     if sol in options:
         if options[sol] == gt_sol:
             return True, sol, f"Correct: option {sol} -> {options[sol]}"
-        else:
-            return False, sol, f"Incorrect: expected '{gt_sol}', got '{options[sol]}' (option {sol})"
-    
-    # Case 2: LLM returned the actual answer text
-    # First check if sol matches ground truth directly
+        return False, sol, f"Incorrect: expected '{gt_sol}', got '{options[sol]}' (option {sol})"
     if sol.lower() == gt_sol.lower():
         return True, sol, f"Correct: answer text matches ground truth: {sol}"
-    
-    # Check if sol matches any option value
     for opt_letter, opt_value in options.items():
         if sol.lower() == opt_value.lower():
             if opt_value == gt_sol:
                 return True, sol, f"Correct: answer text {sol} (option {opt_letter})"
-            else:
-                return False, sol, f"Incorrect: expected '{gt_sol}', got '{opt_value}' (option {opt_letter})"
-    
+            return False, sol, f"Incorrect: expected '{gt_sol}', got '{opt_value}' (option {opt_letter})"
     return False, sol, f"Solution '{sol}' not found in options or ground truth"
 
 
@@ -175,7 +162,7 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
 
     parser = argparse.ArgumentParser(description="SpatialMap problem solver with LLM and monitors")
     parser.add_argument("--thinking", "-t", action="store_true", help="Enable chain-of-thought output")
-    parser.add_argument("--monitor", "-m", default = True, action="store_true", help="Enable step-by-step monitor")
+    parser.add_argument("--monitor", "-m", default = False, action="store_true", help="Enable step-by-step monitor")
     parser.add_argument("--num_examples", "-n", type=int, default=1500, help="Number of examples to run")
     parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logs")
     parser.add_argument("--main_model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
@@ -210,7 +197,7 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
 
     dataset = load_maze_dataset()
 
-    llm_server = init_llm_server(main_model, max_tokens=15000)
+    llm_server = init_llm_server(main_model, max_tokens=32768)
 
     # Load tokenizer for accurate token counting
     logger.info(f"Loading tokenizer for {main_model}...")
@@ -271,7 +258,7 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
 
         # Evaluate the answer
         gt_sol = str(example.get("ground_truth", "")).strip()
-        is_correct, extracted_answer, message = evaluate_spatialmap_answer(answer, options, gt_sol)
+        is_correct, extracted_answer, message = evaluate_mcq_answer(answer, options, gt_sol)
         
         if extracted_answer:
             logger.info(f"Extracted answer: {extracted_answer}")
diff --git a/examples/TTSwithVerification/game24_stepverifier.py b/examples/TTSwithVerification/game24_stepverifier.py
index 993a6ee2..7e2a1f31 100644
--- a/examples/TTSwithVerification/game24_stepverifier.py
+++ b/examples/TTSwithVerification/game24_stepverifier.py
@@ -67,14 +67,16 @@ def load_game24_dataset():
     ds = load_dataset("nlile/24-game", split="train")
     return ds
 
-def init_llm_server(modelname, max_tokens=200, port=8001):
+def init_llm_server(modelname, max_tokens=200, port=8000):
     url = f"http://localhost:{port}/v1/completions"
     payload = {
         "model": modelname,
         "max_tokens": max_tokens,
-        "top_k": 50,
+        "top_k": 20,
         "top_p": 0.95,
-        "temperature": 0.8,
+        "min_p": 0.0,
+        "do_sample" : True,
+        "temperature": 0.6,
         "stream": True,
         "logprobs": 20,
         "use_beam_search": False,
@@ -296,22 +298,23 @@ def evaluate_game24_answer(answer, nums):
         nums = example["numbers"]
 
         prompt = build_prompt(nums)
-        system_prompt = (
-            "You are Phi, a language model trained by Microsoft to help users. "
-            "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
-            "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
-            "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
-            "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
-            "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
-            "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
-            "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
-            "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
-            "explorations, and reflections from the Thought section, systematically present the final solution that you "
-            "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
-            "to reach the conclusion. Now, try to solve the following question through the above guidelines."
-        )
-        full_prompt = f"<|im_start|>system<|im_sep|>\n{system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n"
-
+        # system_prompt = (
+        #     "You are Phi, a language model trained by Microsoft to help users. "
+        #     "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
+        #     "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
+        #     "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
+        #     "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
+        #     "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
+        #     "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
+        #     "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
+        #     "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
+        #     "explorations, and reflections from the Thought section, systematically present the final solution that you "
+        #     "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
+        #     "to reach the conclusion. Now, try to solve the following question through the above guidelines."
+        # )
+        # full_prompt = f"<|im_start|>system<|im_sep|>\n{system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n"
+
+        full_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
         if args.monitor:
             # ThinkingPhaseStepVerifierGame24Monitor handles both cases:
             # - With --thinking_verify: also verifies during the <think> phase
diff --git a/examples/TTSwithVerification/maze_stepverifier.py b/examples/TTSwithVerification/maze_stepverifier.py
index e6cdf8f4..014987dc 100644
--- a/examples/TTSwithVerification/maze_stepverifier.py
+++ b/examples/TTSwithVerification/maze_stepverifier.py
@@ -56,37 +56,42 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/MazeResu
     return dirs
 
 
-def build_simple_prompt(example):
-    """Build a simple user prompt from the maze example.
-
-    No system / meta prompt is used — the structured step format is
-    injected by the monitor after ``</think>``.
-    """
-    description = str(example.get("prompt", ""))
-    # Trim trailing boiler-plate instructions that the dataset appends
-    description_trimmed = description[:-143] if len(description) > 143 else description
-    return description_trimmed
-
-
-def extract_solution(text: str) -> str:
-    """Extract the boxed answer from the response (after </think>)."""
-    if "</think>" in text:
-        answer_section = text.split("</think>")[-1]
-    else:
-        answer_section = text
-    
-    # Strip injected <format>...</format> template blocks so we don't
-    # accidentally match the placeholder \boxed{LETTER} from the template.
-    answer_section = re.sub(r'<format>.*?</format>', '', answer_section, flags=re.DOTALL)
-    
-    matches = re.findall(r'\\boxed\{([^}]*)\}', answer_section)
-    if matches:
-        return matches[-1].strip()
-    
-    match = re.search(r'(?:answer|Answer)[:\s]+([A-D])', answer_section)
-    if match:
-        return match.group(1).strip()
-    
+def build_prompt_from_example(example): #(original prompt config)
+
+    pre_prompt = """You are an expert problem solver. Carefully read the following multiple-choice question and think through the solution step-by-step before providing your final answer. Provide your final answer option by enclosing it within \\boxed{A/B/C/D}.:"""
+
+    description = example.get("prompt")
+    description = str(description)
+
+    # Remove the unnecessary parts of the prompt; the instructions we need are returned separately as pre_prompt.
+    description = remove_last_paragraph(description)
+    return pre_prompt , description
+
+
+def extract_solution_mcq(text):
+    """Extract MCQ solution from model output."""
+    # Try multiple boxed patterns
+    patterns = [
+        r"\\boxed\{([^}]*)\}",  # \boxed{...}
+        r"boxed\{([^}]*)\}",     # boxed{...} without escape
+        r"\*\*([A-D])\*\*",      # **A** format
+        r"answer[:\s]*([A-D])",  # answer: A format
+        r"(?:^|\n)([A-D])(?:\s|$|\.)",  # Standalone letter
+    ]
+   
+    for pattern in patterns:
+        matches = re.findall(pattern, text, re.IGNORECASE)
+        if matches:
+            expr = matches[-1].strip()
+            choice_match = re.search(r"\b([ABCD])\b", expr, flags=re.IGNORECASE)
+            if choice_match:
+                return choice_match.group(1).upper()
+   
+    # Last resort: look for any standalone A, B, C, or D
+    standalone = re.findall(r"\b([ABCD])\b", text)
+    if standalone:
+        return standalone[-1].upper()
+   
     return None
 
 
@@ -112,20 +117,21 @@ def count_tokens(text: str, tokenizer) -> int:
 #         return "relative_position"
 
 
-def init_llm_server(model_name, max_tokens=22000, port=8000):
-    """Initialize LLM server configuration."""
+def init_llm_server(modelname, max_tokens=32768, port=8000):
     url = f"http://localhost:{port}/v1/completions"
     payload = {
-        "model": model_name,
+        "model": modelname,
         "max_tokens": max_tokens,
-        "top_k": 50,
+        "top_k": 20,
         "top_p": 0.95,
-        "temperature": 0.8,
+        "min_p": 0.0,
+        "do_sample" : True,
+        "temperature": 0.6,
         "stream": True,
         "logprobs": 20,
         "use_beam_search": False,
         "prompt_cache": True,
-        "seed": 42
+        "seed" : 42
     }
     headers = {"Content-Type": "application/json"}
     return {"url": url, "payload": payload, "headers": headers}
@@ -164,46 +170,23 @@ def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../.
     os.makedirs(output_base, exist_ok=True)
     return os.path.join(output_base, f"EAT_{num_examples}examples.csv")
 
-def evaluate_maze_answer(answer, options, ground_truth):
-    """
-    Evaluate a Maze MCQ answer and return (is_correct, extracted_answer, message).
-    
-    Args:
-        answer: Raw model output
-        options: Dictionary mapping option letters (A/B/C/D) to their values
-        ground_truth: The correct answer value
-        
-    Returns:
-        Tuple of (is_correct, extracted_answer, message)
-    """
-    sol = extract_solution(answer)
+def evaluate_mcq_answer(answer, options, ground_truth):
+    sol = extract_solution_mcq(answer)
     gt_sol = str(ground_truth).strip()
-    
     if not sol:
         return False, None, "No expression found"
-    
     sol = sol.strip()
-    
-    # Case 1: LLM returned option letter (A/B/C/D)
     if sol in options:
         if options[sol] == gt_sol:
             return True, sol, f"Correct: option {sol} -> {options[sol]}"
-        else:
-            return False, sol, f"Incorrect: expected '{gt_sol}', got '{options[sol]}' (option {sol})"
-    
-    # Case 2: LLM returned the actual answer text
-    # First check if sol matches ground truth directly
+        return False, sol, f"Incorrect: expected '{gt_sol}', got '{options[sol]}' (option {sol})"
     if sol.lower() == gt_sol.lower():
         return True, sol, f"Correct: answer text matches ground truth: {sol}"
-    
-    # Check if sol matches any option value
     for opt_letter, opt_value in options.items():
         if sol.lower() == opt_value.lower():
             if opt_value == gt_sol:
                 return True, sol, f"Correct: answer text {sol} (option {opt_letter})"
-            else:
-                return False, sol, f"Incorrect: expected '{gt_sol}', got '{opt_value}' (option {opt_letter})"
-    
+            return False, sol, f"Incorrect: expected '{gt_sol}', got '{opt_value}' (option {opt_letter})"
     return False, sol, f"Solution '{sol}' not found in options or ground truth"
 
 if __name__ == "__main__":
@@ -234,7 +217,7 @@ def evaluate_maze_answer(answer, options, ground_truth):
         logging.getLogger().setLevel(logging.DEBUG)
     
     # Load dataset
-    dataset = load_dataset("microsoft/VISION_LANGUAGE", 'maze', split='val')
+    dataset = load_dataset("microsoft/VISION_LANGUAGE", 'maze_text_only', split='val')
     
     # Setup LLM server
     llm_server = init_llm_server(args.model, port=args.port)
@@ -252,8 +235,8 @@ def evaluate_maze_answer(answer, options, ground_truth):
     if args.indices:
         indices = [int(x.strip()) for x in args.indices.split(",")]
     elif args.num_examples:
-        # Use 4499 as endpoint (4500 is out of bounds since dataset size is 4500)
-        indices = np.linspace(3000, 4499, args.num_examples, dtype=int)
+        # Use 1499 as endpoint (1500 is out of bounds since dataset size is 1500)
+        indices = np.linspace(0, 1499, args.num_examples, dtype=int)
     else:
         indices = range(args.start, args.end)
     
@@ -268,7 +251,7 @@ def evaluate_maze_answer(answer, options, ground_truth):
     
     for idx in indices:
         example = dataset[idx]
-        user_prompt = build_simple_prompt(example)
+        pre_prompt, user_prompt = build_prompt_from_example(example)
         if str(example.get("ground_truth", "")).strip() == "Q4":
             target_options = ["A", "B"]
         else:
@@ -278,21 +261,23 @@ def evaluate_maze_answer(answer, options, ground_truth):
         options = dict(re.findall(pattern, user_prompt))
         
         # Build prompt with Phi-4-reasoning system prompt
-        phi_system_prompt = (
-            "You are Phi, a language model trained by Microsoft to help users. "
-            "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
-            "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
-            "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
-            "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
-            "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
-            "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
-            "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
-            "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
-            "explorations, and reflections from the Thought section, systematically present the final solution that you "
-            "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
-            "to reach the conclusion. Now, try to solve the following question through the above guidelines."
-        )
-        full_prompt = f"<|im_start|>system<|im_sep|>\n{phi_system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{user_prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n<think>\n"
+        # phi_system_prompt = (
+        #     "You are Phi, a language model trained by Microsoft to help users. "
+        #     "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
+        #     "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
+        #     "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
+        #     "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
+        #     "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
+        #     "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
+        #     "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
+        #     "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
+        #     "explorations, and reflections from the Thought section, systematically present the final solution that you "
+        #     "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
+        #     "to reach the conclusion. Now, try to solve the following question through the above guidelines."
+        # )
+        # full_prompt = f"<|im_start|>system<|im_sep|>\n{phi_system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{user_prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n<think>\n"
+
+        full_prompt = f"<|im_start|>system\n{pre_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
         
         # Parse maze from prompt
         grid, start_pos, exit_pos = parse_maze_from_prompt(user_prompt)
@@ -354,7 +339,7 @@ def evaluate_maze_answer(answer, options, ground_truth):
         logger.info(f"Generated tokens in this example: {reasoning_tokens}")
         
         gt_sol = str(example.get("ground_truth", "")).strip()
-        is_correct, extracted_answer, message = evaluate_maze_answer(answer, options, gt_sol)
+        is_correct, extracted_answer, message = evaluate_mcq_answer(answer, options, gt_sol)
         
         # "attempted" = model produced a real \boxed{} answer (not "no solution")
         attempted = (extracted_answer is not None and extracted_answer.strip().lower() != "no solution")
diff --git a/examples/TTSwithVerification/mazemeta.py b/examples/TTSwithVerification/mazemeta.py
index b13f2c5b..3a74a260 100644
--- a/examples/TTSwithVerification/mazemeta.py
+++ b/examples/TTSwithVerification/mazemeta.py
@@ -162,21 +162,30 @@ def build_meta_prompt_from_example(example):
     return system_prompt, description_trimmed
 
 
-def extract_solution(text: str) -> str:
-    """Extract the boxed answer from the response (after </think>)."""
-    if "</think>" in text:
-        answer_section = text.split("</think>")[-1]
-    else:
-        answer_section = text
-    
-    matches = re.findall(r'\\boxed\{([^}]*)\}', answer_section)
-    if matches:
-        return matches[-1].strip()
-    
-    match = re.search(r'(?:answer|Answer)[:\s]+([A-D])', answer_section)
-    if match:
-        return match.group(1).strip()
-    
+def extract_solution_mcq(text):
+    """Extract MCQ solution from model output."""
+    # Try multiple boxed patterns
+    patterns = [
+        r"\\boxed\{([^}]*)\}",  # \boxed{...}
+        r"boxed\{([^}]*)\}",     # boxed{...} without escape
+        r"\*\*([A-D])\*\*",      # **A** format
+        r"answer[:\s]*([A-D])",  # answer: A format
+        r"(?:^|\n)([A-D])(?:\s|$|\.)",  # Standalone letter
+    ]
+   
+    for pattern in patterns:
+        matches = re.findall(pattern, text, re.IGNORECASE)
+        if matches:
+            expr = matches[-1].strip()
+            choice_match = re.search(r"\b([ABCD])\b", expr, flags=re.IGNORECASE)
+            if choice_match:
+                return choice_match.group(1).upper()
+   
+    # Last resort: look for any standalone A, B, C, or D
+    standalone = re.findall(r"\b([ABCD])\b", text)
+    if standalone:
+        return standalone[-1].upper()
+   
     return None
 
 
@@ -241,7 +250,7 @@ def evaluate_maze_answer(answer, options, ground_truth):
     Returns:
         Tuple of (is_correct, extracted_answer, message)
     """
-    sol = extract_solution(answer)
+    sol = extract_solution_mcq(answer)
     gt_sol = str(ground_truth).strip()
     
     if not sol:

From 0a07ae2477678ff68bb4b0aa4799e3b010cd52f1 Mon Sep 17 00:00:00 2001
From: Vishak <vishak.bhat5@gmail.com>
Date: Thu, 5 Mar 2026 05:59:21 +0000
Subject: [PATCH 11/17] sanity

---
 examples/EarlyStopping/game24_example.py      |   8 +-
 examples/EarlyStopping/maze_example.py        |  13 +-
 examples/EarlyStopping/spatialmap_example.py  |  13 +-
 examples/README.md                            |   3 +-
 .../game24_stepverifier.py                    | 160 +++++-------------
 .../TTSwithVerification/maze_stepverifier.py  |  39 ++---
 .../spatialmap_stepverifier.py                |  94 ++++------
 7 files changed, 107 insertions(+), 223 deletions(-)

diff --git a/examples/EarlyStopping/game24_example.py b/examples/EarlyStopping/game24_example.py
index f1c3f2a0..af033b7e 100644
--- a/examples/EarlyStopping/game24_example.py
+++ b/examples/EarlyStopping/game24_example.py
@@ -15,7 +15,7 @@
 
 # ============== MODEL CONFIGURATION ==============
 # Change these model names to scale experiments easily
-MAIN_MODEL = "Qwen/QwQ-32B"
+MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
 EARLYSTOP_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 # =================================================
 
@@ -25,7 +25,7 @@ def get_model_short_name(model_name: str) -> str:
     short_name = short_name.replace(" ", "_").replace(":", "-")
     return short_name
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY/Gameof24_results"):
+def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY_FULL/Gameof24_results"):
     """Create and return output directory paths based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -42,14 +42,14 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY/Gameo
     
     return dirs
 
-def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY/Gameof24_results") -> str:
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY_FULL/Gameof24_results") -> str:
     """Generate log filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     os.makedirs(output_base, exist_ok=True)
     return os.path.join(output_base, f"EAT_{num_examples}examples.log")
 
-def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY/Gameof24_results") -> str:
+def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY_FULL/Gameof24_results") -> str:
     """Generate token CSV filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
diff --git a/examples/EarlyStopping/maze_example.py b/examples/EarlyStopping/maze_example.py
index cc378aaa..c355bb46 100644
--- a/examples/EarlyStopping/maze_example.py
+++ b/examples/EarlyStopping/maze_example.py
@@ -17,7 +17,7 @@
 
 # ============== MODEL CONFIGURATION ==============
 # Change these model names to scale experiments easily
-MAIN_MODEL = "Qwen/QwQ-32B"
+MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
 EARLYSTOP_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 # =================================================
 
@@ -28,7 +28,7 @@ def get_model_short_name(model_name: str) -> str:
     short_name = short_name.replace(" ", "_").replace(":", "-")
     return short_name
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY/MazeResults"):
+def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY_FULL/MazeResults"):
     """Create and return output directory paths based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -46,14 +46,14 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY/MazeR
     
     return dirs
 
-def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY/MazeResults") -> str:
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY_FULL/MazeResults") -> str:
     """Generate log filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     os.makedirs(output_base, exist_ok=True)
     return os.path.join(output_base, f"EAT_{num_examples}examples.log")
 
-def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY/MazeResults") -> str:
+def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY_FULL/MazeResults") -> str:
     """Generate token CSV filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -120,11 +120,6 @@ def extract_solution_mcq(text):
             if choice_match:
                 return choice_match.group(1).upper()
    
-    # Last resort: look for any standalone A, B, C, or D
-    standalone = re.findall(r"\b([ABCD])\b", text)
-    if standalone:
-        return standalone[-1].upper()
-   
     return None
 
 def save_prompt(idx, prompt_with_answer, reason_dir):
diff --git a/examples/EarlyStopping/spatialmap_example.py b/examples/EarlyStopping/spatialmap_example.py
index 308d5d56..0aadbc48 100644
--- a/examples/EarlyStopping/spatialmap_example.py
+++ b/examples/EarlyStopping/spatialmap_example.py
@@ -17,7 +17,7 @@
 
 # ============== MODEL CONFIGURATION ==============
 # Change these model names to scale experiments easily
-MAIN_MODEL = "Qwen/QwQ-32B"
+MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
 EARLYSTOP_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 # =================================================
 
@@ -28,7 +28,7 @@ def get_model_short_name(model_name: str) -> str:
     short_name = short_name.replace(" ", "_").replace(":", "-")
     return short_name
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY/SpatialMap_results"):
+def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY_FULL/SpatialMap_results"):
     """Create and return output directory paths based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -46,14 +46,14 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY/Spati
     
     return dirs
 
-def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY/SpatialMap_results") -> str:
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY_FULL/SpatialMap_results") -> str:
     """Generate log filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     os.makedirs(output_base, exist_ok=True)
     return os.path.join(output_base, f"EAT_{num_examples}examples.log")
 
-def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY/SpatialMap_results") -> str:
+def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY_FULL/SpatialMap_results") -> str:
     """Generate token CSV filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -118,11 +118,6 @@ def extract_solution_mcq(text):
             if choice_match:
                 return choice_match.group(1).upper()
    
-    # Last resort: look for any standalone A, B, C, or D
-    standalone = re.findall(r"\b([ABCD])\b", text)
-    if standalone:
-        return standalone[-1].upper()
-   
     return None
  
 
diff --git a/examples/README.md b/examples/README.md
index 79f5ea05..fdbc6bdf 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -5,9 +5,8 @@ Running verifier-guided inference requires only a few lines of code: just specif
 ```bash
 python -m vllm.entrypoints.openai.api_server \
   --model Qwen/Qwen3-30B-A3B-Thinking-2507 \
-  --max-model-len 65536 \
   --port 8000 \
-  --tensor-parallel-size 8
+  --tensor-parallel-size 4
 ```
 
 **Generate answer enabled with given monitors**
diff --git a/examples/TTSwithVerification/game24_stepverifier.py b/examples/TTSwithVerification/game24_stepverifier.py
index 7e2a1f31..cb1a799b 100644
--- a/examples/TTSwithVerification/game24_stepverifier.py
+++ b/examples/TTSwithVerification/game24_stepverifier.py
@@ -14,7 +14,7 @@
 
 # ============== MODEL CONFIGURATION ==============
 # Change these model names to scale experiments easily
-MAIN_MODEL = "microsoft/Phi-4-reasoning"
+MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
 EARLYSTOP_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 # =================================================
 
@@ -24,7 +24,7 @@ def get_model_short_name(model_name: str) -> str:
     short_name = short_name.replace(" ", "_").replace(":", "-")
     return short_name
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/Gameof24results"):
+def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS_SANITY/Gameof24results"):
     """Create and return output directory paths based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -41,14 +41,14 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/Gameof24
     
     return dirs
 
-def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS/Gameof24_results") -> str:
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS_SANITY/Gameof24_results") -> str:
     """Generate log filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     os.makedirs(output_base, exist_ok=True)
     return os.path.join(output_base, f"EAT_{num_examples}examples.log")
 
-def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS/Gameof24_results") -> str:
+def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS_SANITY/Gameof24_results") -> str:
     """Generate token CSV filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -88,19 +88,18 @@ def init_llm_server(modelname, max_tokens=200, port=8000):
 
 
 def build_prompt(nums):
-    """Build a simple prompt for Game of 24."""
     a, b, c, d = nums
     boxed = r"\boxed{}"
     base_prompt = f"""
     You are solving the Game of 24.
-
+    
     You are given four numbers: {a}, {b}, {c}, {d}
-
+    
     Your job is to produce a valid arithmetic expression using:
     - ALL four numbers exactly once
     - ONLY +, -, *, /
     - The expression must evaluate to exactly 24.
-
+    
     Please reason step by step, and put your final answer containing only the expression within {boxed}.""".strip()
     return base_prompt
 
@@ -219,10 +218,8 @@ def evaluate_game24_answer(answer, nums):
         Tuple of (is_correct, extracted_expression, error_message)
     """
     expr = extract_solution(answer)
-    
     if not expr:
         return False, None, "No expression found"
-    
     if evaluate_expression(expr, expected_nums=nums):
         return True, expr, "Correct solution (evaluates to 24 using exactly the given numbers)"
     else:
@@ -239,21 +236,16 @@ def evaluate_game24_answer(answer, nums):
     parser.add_argument("--monitor", "-m", default = True, action="store_true", help="Enable step-by-step monitor")
     parser.add_argument("--num_examples", "-n", type=int, default=1362, help="Number of examples to run")
     parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logs")
-    parser.add_argument("--thinking_verify", "-tv", action="store_true", default = True, help="Enable thinking-phase step verification (verify during <think> trace)")
-    parser.add_argument("--newline_threshold", type=int, default=20, help="Number of newlines in thinking before forcing step verification (used with --thinking_verify)")
-    parser.add_argument("--warmup", type=int, default=4, help="Number of \\n\\n to skip before starting side-chain verification (warmup period)")
+    parser.add_argument("--newline_threshold", type=int, default=20, help="Number of newlines in thinking before forcing step verification")
     parser.add_argument("--main_model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
     parser.add_argument("--earlystop_model", type=str, default=EARLYSTOP_MODEL, help="Model to use for early stopping")
     args = parser.parse_args()
 
-    # Use models from args (allows command-line override)
     main_model = args.main_model
     earlystop_model = args.earlystop_model
 
-    # Setup output directories based on model name
     output_dirs = get_output_dirs(main_model)
     logfile = get_log_filename(main_model, args.num_examples)
-    token_filename = get_token_filename(main_model, args.num_examples)
     reason_dir = output_dirs["reasoning"]
 
     log_level = logging.DEBUG if args.debug else logging.INFO
@@ -262,7 +254,7 @@ def evaluate_game24_answer(answer, nums):
         level=log_level,
         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
         handlers=[
-            logging.FileHandler(logfile, mode="w"),  
+            logging.FileHandler(logfile, mode="w"),
             logging.StreamHandler()
         ],
         force=True,
@@ -274,61 +266,36 @@ def evaluate_game24_answer(answer, nums):
 
     dataset = load_game24_dataset()
 
-    llm_server = init_llm_server(main_model, max_tokens=22000)
+    llm_server = init_llm_server(main_model, max_tokens=32768, port=8000)
 
-    # Load tokenizer for accurate token counting
     logger.info(f"Loading tokenizer for {main_model}...")
     tokenizer = AutoTokenizer.from_pretrained(main_model, trust_remote_code=True)
     logger.info("Tokenizer loaded successfully.")
 
     num_correct = 0
-    num_attempted = 0  # examples where a \boxed{} answer was produced
-    num_excluded = 0   # examples excluded from soundness (no solution or token budget exceeded)
+    num_attempted = 0  # model produced a real answer (not "no solution" and not missing after </think>)
+    num_excluded = 0   # excluded from soundness (no solution or token budget exceeded)
     N = args.num_examples
-    max_token_budget = llm_server["payload"]["max_tokens"]
-    total_reasoning_tokens = 0
-    reasoning_token_counts = []
-    per_example_results = []  # list of dicts for CSV
+    total_generated_tokens = 0
+    generated_token_counts = []
 
-    # total = len(dataset)
     indices = np.linspace(0, len(dataset)-1, N, dtype=int)
 
     for idx in indices:
         example = dataset[idx]
         nums = example["numbers"]
-
         prompt = build_prompt(nums)
-        # system_prompt = (
-        #     "You are Phi, a language model trained by Microsoft to help users. "
-        #     "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
-        #     "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
-        #     "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
-        #     "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
-        #     "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
-        #     "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
-        #     "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
-        #     "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
-        #     "explorations, and reflections from the Thought section, systematically present the final solution that you "
-        #     "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
-        #     "to reach the conclusion. Now, try to solve the following question through the above guidelines."
-        # )
-        # full_prompt = f"<|im_start|>system<|im_sep|>\n{system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n"
-
-        full_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
+
         if args.monitor:
-            # ThinkingPhaseStepVerifierGame24Monitor handles both cases:
-            # - With --thinking_verify: also verifies during the <think> phase
-            # - Without: only injects structured prompt after </think> and verifies steps
-            threshold = args.newline_threshold if args.thinking_verify else 999999
             monitors=(ThinkingPhaseStepVerifierGame24Monitor(
                 name="game24_verifier",
                 original_numbers=nums,
                 llm_server=llm_server,
-                prompt=full_prompt,
-                newline_threshold=threshold,
+                prompt=f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n",
+                newline_threshold=args.newline_threshold,
                 max_corrections=3,
                 answer_start_token="</think>",
-                warmup_newlines=args.warmup,
+                warmup_newlines=4,
             ),)
         else:
             monitors = ()
@@ -338,7 +305,7 @@ def evaluate_game24_answer(answer, nums):
         logger.info(f"Numbers: {nums}")
 
         answer = asyncio.run(stream_completion(
-            full_prompt,
+            f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n",
             llm_server=llm_server,
             monitors=monitors,
             add_delay=False,
@@ -349,92 +316,57 @@ def evaluate_game24_answer(answer, nums):
         save_prompt(idx, answer, reason_dir)
         logger.info(f"Raw final output:\n{answer}")
 
-        reasoning_tokens = count_tokens(answer, tokenizer)
-        reasoning_token_counts.append(reasoning_tokens)
-        total_reasoning_tokens += reasoning_tokens
-        logger.info(f"Generated tokens in this example: {reasoning_tokens}")
+        generated_tokens = count_tokens(answer, tokenizer)
+        generated_token_counts.append(generated_tokens)
+        total_generated_tokens += generated_tokens
+        logger.info(f"Generated tokens in this example: {generated_tokens}")
 
         is_correct, expr, message = evaluate_game24_answer(answer, nums)
-        # "attempted" = model produced a real \boxed{} answer (not "no solution")
-        attempted = (expr is not None and expr.strip().lower() != "no solution")
-        if attempted:
-            num_attempted += 1
-
-        # Determine if this example should be excluded from soundness:
-        #   - answered "no solution" (gave up / max corrections)
-        #   - no expression found (verifier never completed Phase 2)
+        # Attempted: model produced a real answer (not "no solution" and not missing after </think>)
         gave_no_solution = (expr is not None and "no solution" in expr.strip().lower())
         no_expr_found = (expr is None)
-        excluded = gave_no_solution or no_expr_found
-        if excluded:
+        attempted = not (gave_no_solution or no_expr_found)
+        if attempted:
+            num_attempted += 1
+        else:
             num_excluded += 1
-        
+
         if expr:
             logger.info(f"Extracted expression: {expr}")
         logger.info(message)
-        
         if is_correct:
             num_correct += 1
 
-        per_example_results.append({
-            "index": int(idx),
-            "numbers": str(nums),
-            "expression": expr if expr else "",
-            "correct": is_correct,
-            "attempted": attempted,
-            "excluded": excluded,
-            "tokens": reasoning_tokens,
-            "message": message,
-        })
-
-    # Calculate final statistics
-    avg_reasoning_tokens = total_reasoning_tokens / N if N > 0 else 0
+    avg_generated_tokens = total_generated_tokens / N if N > 0 else 0
     accuracy = num_correct / N if N > 0 else 0
-    soundness_denom = N - num_excluded
-    soundness = num_correct / soundness_denom if soundness_denom > 0 else 0  # correct / (total - excluded)
-    
+    soundness = num_correct / num_attempted if num_attempted > 0 else 0
+
     print(f"\nFinal Accuracy: {num_correct}/{N} ({accuracy:.2%})")
-    print(f"Soundness: {num_correct}/{soundness_denom} ({soundness:.2%})")
+    print(f"Soundness: {num_correct}/{num_attempted} ({soundness:.2%})")
     print(f"Excluded from soundness (no solution / token budget exceeded): {num_excluded}")
-    print(f"Average Reasoning Tokens: {avg_reasoning_tokens:.2f}")
-    print(f"Total Reasoning Tokens: {total_reasoning_tokens}")
-
-    # Save per-example CSV
-    csv_file = os.path.join(output_dirs["csv_saved"], f"results_{N}examples.csv")
-    with open(csv_file, 'w', newline='') as f:
-        writer = csv.DictWriter(f, fieldnames=["index", "numbers", "expression", "correct", "attempted", "excluded", "tokens", "message"])
-        writer.writeheader()
-        writer.writerows(per_example_results)
-    logger.info(f"Per-example CSV saved to {csv_file}")
-    
-    # Save results summary to a text file
+    print(f"Average Generated Tokens: {avg_generated_tokens:.2f}")
+    print(f"Total Generated Tokens: {total_generated_tokens}")
+
     results_file = logfile.replace('.log', '_results.txt')
     with open(results_file, 'w') as f:
         f.write(f"Game of 24 Evaluation Results\n")
         f.write(f"{'='*50}\n\n")
         f.write(f"Model: {main_model}\n")
         f.write(f"Number of Examples: {N}\n")
-        f.write(f"Monitor Enabled: {args.monitor}\n")
-        f.write(f"Thinking Phase Verify: {args.thinking_verify}\n")
-        if args.thinking_verify:
-            f.write(f"Newline Threshold: {args.newline_threshold}\n")
-        f.write(f"\n")
+        f.write(f"Monitor Enabled: {args.monitor}\n\n")
         f.write(f"Results:\n")
         f.write(f"---------\n")
         f.write(f"Correct: {num_correct}/{N}\n")
         f.write(f"Accuracy: {accuracy:.2%}\n")
-        f.write(f"Attempted (produced \\boxed answer): {num_attempted}/{N}\n")
-        f.write(f"Excluded (no solution / token budget exceeded): {num_excluded}/{N}\n")
-        f.write(f"Soundness (correct / (total - excluded)): {num_correct}/{soundness_denom} = {soundness:.2%}\n\n")
-        f.write(f"Token Statistics:\n")
+        f.write(f"Soundness: {num_correct}/{num_attempted} = {soundness:.2%}\n")
+        f.write(f"Excluded from soundness (no solution / token budget exceeded): {num_excluded}\n\n")
+        f.write(f"Generated Token Statistics:\n")
         f.write(f"---------------------------\n")
-        f.write(f"Total Tokens: {total_reasoning_tokens}\n")
-        f.write(f"Average Tokens: {avg_reasoning_tokens:.2f}\n")
-        if reasoning_token_counts:
-            f.write(f"Median Tokens: {float(np.median(reasoning_token_counts)):.0f}\n")
-            f.write(f"Min Tokens: {min(reasoning_token_counts)}\n")
-            f.write(f"Max Tokens: {max(reasoning_token_counts)}\n")
-            f.write(f"Std Dev: {np.std(reasoning_token_counts):.2f}\n")
-    
+        f.write(f"Total Generated Tokens: {total_generated_tokens}\n")
+        f.write(f"Average Generated Tokens: {avg_generated_tokens:.2f}\n")
+        if generated_token_counts:
+            f.write(f"Min Generated Tokens: {min(generated_token_counts)}\n")
+            f.write(f"Max Generated Tokens: {max(generated_token_counts)}\n")
+            f.write(f"Std Dev: {np.std(generated_token_counts):.2f}\n")
     logger.info(f"Results saved to {results_file}")
     print(f"Results saved to {results_file}")
diff --git a/examples/TTSwithVerification/maze_stepverifier.py b/examples/TTSwithVerification/maze_stepverifier.py
index 014987dc..9c60366f 100644
--- a/examples/TTSwithVerification/maze_stepverifier.py
+++ b/examples/TTSwithVerification/maze_stepverifier.py
@@ -28,7 +28,7 @@
 logger = logging.getLogger(__name__)
 
 # ============== MODEL CONFIGURATION ==============
-MAIN_MODEL = "microsoft/Phi-4-reasoning"
+MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
 # =================================================
 
 
@@ -39,7 +39,7 @@ def get_model_short_name(model_name: str) -> str:
     return short_name
 
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/MazeResults"):
+def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS_SANITY/MazeResults"):
     """Create and return output directory paths based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -55,30 +55,27 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/MazeResu
     
     return dirs
 
+def remove_last_paragraph(s: str) -> str:
+    return s[:-143]
 
 def build_prompt_from_example(example): #(original prompt config)
 
-    pre_prompt = """You are an expert problem solver. Carefully read the following multiple-choice question and think through the solution step-by-step before providing your final answer. Provide your final answer option by enclosing it within \\boxed{A/B/C/D}.:"""
-
+    pre_prompt = "You are an expert problem solver. Carefully read the following multiple-choice question and think through the solution step-by-step before providing your final answer. Provide your final answer option by enclosing it within \\boxed{A/B/C/D}.:"
     description = example.get("prompt")
     description = str(description)
-
-    # remove the unecessary parts of the prompt and then add the prompt that we need.
     description = remove_last_paragraph(description)
-    return pre_prompt , description
+    return pre_prompt, description
 
 
 def extract_solution_mcq(text):
     """Extract MCQ solution from model output."""
-    # Try multiple boxed patterns
     patterns = [
-        r"\\boxed\{([^}]*)\}",  # \boxed{...}
-        r"boxed\{([^}]*)\}",     # boxed{...} without escape
-        r"\*\*([A-D])\*\*",      # **A** format
-        r"answer[:\s]*([A-D])",  # answer: A format
-        r"(?:^|\n)([A-D])(?:\s|$|\.)",  # Standalone letter
+        r"\\boxed\{([^}]*)\}",
+        r"boxed\{([^}]*)\}",
+        r"\*\*([A-D])\*\*",
+        r"answer[:\s]*([A-D])",
+        r"(?:^|\n)([A-D])(?:\s|$|\.)",
     ]
-   
     for pattern in patterns:
         matches = re.findall(pattern, text, re.IGNORECASE)
         if matches:
@@ -86,12 +83,6 @@ def extract_solution_mcq(text):
             choice_match = re.search(r"\b([ABCD])\b", expr, flags=re.IGNORECASE)
             if choice_match:
                 return choice_match.group(1).upper()
-   
-    # Last resort: look for any standalone A, B, C, or D
-    standalone = re.findall(r"\b([ABCD])\b", text)
-    if standalone:
-        return standalone[-1].upper()
-   
     return None
 
 
@@ -155,7 +146,7 @@ def save_prompt(idx, prompt_with_answer, reason_dir):
     logger.info(f"Saved reasoning trace to {filename}")
 
 
-def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS/MazeResults") -> str:
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS_SANITY/MazeResults") -> str:
     """Generate log filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -163,7 +154,7 @@ def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../
     return os.path.join(output_base, f"EAT_{num_examples}examples.log")
 
 
-def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS/MazeResults") -> str:
+def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS_SANITY/MazeResults") -> str:
     """Generate token CSV filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -195,8 +186,8 @@ def evaluate_mcq_answer(answer, options, ground_truth):
                         help="Model name for generation")
     parser.add_argument("--indices", type=str, default=None,
                         help="Comma-separated indices to run (e.g., '3000,3500,4000')")
-    parser.add_argument("--start", type=int, default=3000, help="Start index")
-    parser.add_argument("--end", type=int, default=3010, help="End index")
+    parser.add_argument("--start", type=int, default=0, help="Start index")
+    parser.add_argument("--end", type=int, default=10, help="End index")
     parser.add_argument("--num_examples", "-n", type=int, default=None,
                         help="Number of examples to run (overrides start/end)")
     parser.add_argument("--max_corrections", type=int, default=5,
diff --git a/examples/TTSwithVerification/spatialmap_stepverifier.py b/examples/TTSwithVerification/spatialmap_stepverifier.py
index acfb26af..d1f0286b 100644
--- a/examples/TTSwithVerification/spatialmap_stepverifier.py
+++ b/examples/TTSwithVerification/spatialmap_stepverifier.py
@@ -27,7 +27,7 @@
 logger = logging.getLogger(__name__)
 
 # ============== MODEL CONFIGURATION ==============
-MAIN_MODEL = "microsoft/Phi-4-reasoning"
+MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
 # =================================================
 
 
@@ -38,7 +38,7 @@ def get_model_short_name(model_name: str) -> str:
     return short_name
 
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/SpatialMapResults"):
+def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS_SANITY/SpatialMapResults"):
     """Create and return output directory paths based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -72,36 +72,34 @@ def get_question_type(idx: int) -> str:
 
 
 def build_simple_prompt(example):
-    """Build a simple user prompt from the spatial map example.
-
-    No system / meta prompt is used — the structured step format is
-    injected by the monitor after ``</think>``.
-    """
+    """Build a prompt matching spatialmap_example.py."""
+    pre_prompt = "You are an expert problem solver. Carefully read the following multiple-choice question and think through the solution step-by-step before providing your final answer. Provide your final answer option by enclosing it within \\boxed{A/B/C/D}.:"
     description = str(example.get("prompt", ""))
-    # Trim trailing boiler-plate instructions that the dataset appends
     description_trimmed = description[:-143] if len(description) > 143 else description
-    return description_trimmed
+    return pre_prompt, description_trimmed
 
 
 def extract_solution(text: str) -> str:
     """Extract the boxed answer from the response (after </think>)."""
+    patterns = [
+        r"\\boxed\{([^}]*)\}",
+        r"boxed\{([^}]*)\}",
+        r"\*\*([A-D])\*\*",
+        r"answer[:\s]*([A-D])",
+        r"(?:^|\n)([A-D])(?:\s|$|\.)",
+    ]
     if "</think>" in text:
         answer_section = text.split("</think>")[-1]
     else:
         answer_section = text
-    
-    # Strip injected <format>...</format> template blocks so we don't
-    # accidentally match the placeholder \boxed{LETTER} from the template.
     answer_section = re.sub(r'<format>.*?</format>', '', answer_section, flags=re.DOTALL)
-    
-    matches = re.findall(r'\\boxed\{([^}]*)\}', answer_section)
-    if matches:
-        return matches[-1].strip()
-    
-    match = re.search(r'(?:answer|Answer)[:\s]+([A-D])', answer_section)
-    if match:
-        return match.group(1).strip()
-    
+    for pattern in patterns:
+        matches = re.findall(pattern, answer_section, re.IGNORECASE)
+        if matches:
+            expr = matches[-1].strip()
+            choice_match = re.search(r"\b([ABCD])\b", expr, flags=re.IGNORECASE)
+            if choice_match:
+                return choice_match.group(1).upper()
     return None
 
 
@@ -111,15 +109,17 @@ def count_tokens(text: str, tokenizer) -> int:
     return len(tokens)
 
 
-def init_llm_server(model_name, max_tokens=20000, port=8000):
+def init_llm_server(model_name, max_tokens=32768, port=8000):
     """Initialize LLM server configuration."""
     url = f"http://localhost:{port}/v1/completions"
     payload = {
         "model": model_name,
         "max_tokens": max_tokens,
-        "top_k": 50,
+        "top_k": 20,
         "top_p": 0.95,
-        "temperature": 0.8,
+        "min_p": 0.0,
+        "do_sample": True,
+        "temperature": 0.6,
         "stream": True,
         "logprobs": 20,
         "use_beam_search": False,
@@ -162,32 +162,20 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
     """
     sol = extract_solution(answer)
     gt_sol = str(ground_truth).strip()
-    
     if not sol:
         return False, None, "No expression found"
-    
     sol = sol.strip()
-    
-    # Case 1: LLM returned option letter (A/B/C/D)
     if sol in options:
         if options[sol] == gt_sol:
             return True, sol, f"Correct: option {sol} -> {options[sol]}"
-        else:
-            return False, sol, f"Incorrect: expected '{gt_sol}', got '{options[sol]}' (option {sol})"
-    
-    # Case 2: LLM returned the actual answer text
-    # First check if sol matches ground truth directly
+        return False, sol, f"Incorrect: expected '{gt_sol}', got '{options[sol]}' (option {sol})"
     if sol.lower() == gt_sol.lower():
         return True, sol, f"Correct: answer text matches ground truth: {sol}"
-    
-    # Check if sol matches any option value
     for opt_letter, opt_value in options.items():
         if sol.lower() == opt_value.lower():
             if opt_value == gt_sol:
                 return True, sol, f"Correct: answer text {sol} (option {opt_letter})"
-            else:
-                return False, sol, f"Incorrect: expected '{gt_sol}', got '{opt_value}' (option {opt_letter})"
-    
+            return False, sol, f"Incorrect: expected '{gt_sol}', got '{opt_value}' (option {opt_letter})"
     return False, sol, f"Solution '{sol}' not found in options or ground truth"
 
 
@@ -261,36 +249,20 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
     
     for idx in indices:
         example = dataset[idx]
-        user_prompt = build_simple_prompt(example)
+        pre_prompt, description_trimmed = build_simple_prompt(example)
         if str(example.get("ground_truth", "")).strip() == "Q4":
             target_options = ["A", "B"]
         else:
-            target_options = ["A", "B", "C", "D"] 
+            target_options = ["A", "B", "C", "D"]
         keys = "|".join(map(re.escape, target_options))
         pattern = r'\b([A-D])\.\s*(.*?)(?=\s*[A-D]\.|$)'
-        raw = re.findall(pattern, user_prompt, flags=re.DOTALL)
+        raw = re.findall(pattern, description_trimmed, flags=re.DOTALL)
 
         options = {k: v.strip().rstrip(".") for k, v in raw}
-        
-        # Determine question type
+
         question_type = get_question_type(idx)
-        
-        # Build prompt with Phi-4-reasoning system prompt
-        phi_system_prompt = (
-            "You are Phi, a language model trained by Microsoft to help users. "
-            "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
-            "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
-            "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
-            "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
-            "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
-            "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
-            "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
-            "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
-            "explorations, and reflections from the Thought section, systematically present the final solution that you "
-            "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
-            "to reach the conclusion. Now, try to solve the following question through the above guidelines."
-        )
-        full_prompt = f"<|im_start|>system<|im_sep|>\n{phi_system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{user_prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n<think>\n"
+
+        full_prompt = f"<|im_start|>system\n{pre_prompt}<|im_end|>\n<|im_start|>user\n{description_trimmed}<|im_end|>\n<|im_start|>assistant\n"
         
         logger.info(f"\n{'='*60}")
         logger.info(f"Example {idx} ({question_type})")
@@ -302,7 +274,7 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
         # Phase 2b — verifies structured output as model fills it in
         monitor = ThinkingPhaseStepVerifierSpatialMapMonitor(
             name="spatialmap_thinking_verifier",
-            problem_text=user_prompt,
+            problem_text=description_trimmed,
             llm_server=llm_server,
             prompt=full_prompt,
             newline_threshold=args.newline_threshold,

From 6a64f1a1b9a46197f6527985a0e0d55c8f632458 Mon Sep 17 00:00:00 2001
From: Vishak <vishak.bhat5@gmail.com>
Date: Thu, 5 Mar 2026 07:11:11 +0000
Subject: [PATCH 12/17] sanity

---
 interwhen/monitors/thinkingPhaseVerifier.py | 114 ++++++++++----------
 interwhen/utils/spatialmap_verifier.py      |  74 ++++++++++++-
 2 files changed, 126 insertions(+), 62 deletions(-)

diff --git a/interwhen/monitors/thinkingPhaseVerifier.py b/interwhen/monitors/thinkingPhaseVerifier.py
index a6f247dc..887ba1f5 100644
--- a/interwhen/monitors/thinkingPhaseVerifier.py
+++ b/interwhen/monitors/thinkingPhaseVerifier.py
@@ -1703,13 +1703,17 @@ def _build_spatialmap_format_block() -> str:
     return (
         "<format>\n"
         ">>> STEP 1: PARSE RELATIONSHIPS\n"
-        "    - A is to the DIRECTION of B\n"
-        "    [... list all given relationships ...]\n"
+        "    - [Full Name A] is to the [direction] of [Full Name B]\n"
+        "    - [Full Name C] is to the [direction] of [Full Name D]\n"
+        "    [... list ALL given relationships using FULL names exactly as in the question ...]\n"
+        "    (NO abbreviations, NO short forms, NO parenthetical aliases like 'Police Supply Store (PSS)')\n"
         "\n"
         ">>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS\n"
         "    - Looking for: [target relationship / direction / count]\n"
-        "    - [reasoning about the relationships]\n"
-        "    - [use reversibility and transitivity as needed]\n"
+        "    - [Full Name A] is to the [direction] of [Full Name B]\n"
+        "    - [Full Name C] is to the [direction] of [Full Name D]\n"
+        "    [... list each derived relationship as a structured claim using FULL names ...]\n"
+        "    (Each claim MUST be in the form: '[Full Name] is to the [direction] of [Full Name]')\n"
         "\n"
         ">>> STEP 3: ANSWER\n"
         "    - [state conclusion]\n"
@@ -1744,11 +1748,11 @@ def _build_spatialmap_thinking_phase_prompt(
     return (
         "\n\nLet me organize what I have so far. I will list the given "
         "relationships in STEP 1, then in STEP 2 I will state every "
-        "spatial claim I have derived using FULL object names (no "
-        "abbreviations) in exactly this form:\n"
-        "    - [Full Name A] is to the [direction] of [Full Name B]\n"
-        "For direction I will use the full word: northeast, northwest, "
-        "southeast, southwest, north, south, east, or west.\n\n"
+        "spatial claim I have derived.\n"
+        "IMPORTANT: I must use the FULL object names exactly as given in the question "
+        "(no abbreviations, no short forms, no aliases, no partial names, no parenthetical aliases like 'Store (S)').\n"
+        "Every claim must be in the form: '[Full Name] is to the [direction] of [Full Name]'\n"
+        "For direction I will use the full word: northeast, northwest, southeast, southwest, north, south, east, or west.\n\n"
         ">>> STEP 1: PARSE RELATIONSHIPS (given)\n"
         f"{step1_body}\n\n"
         ">>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS (derived)\n"
@@ -1765,7 +1769,10 @@ def _build_spatialmap_structured_prompt() -> str:
     """
     format_block = _build_spatialmap_format_block()
     return (
-        "\nLet me solve this step by step using the structured format:\n"
+        "\nLet me solve this step by step using the structured format.\n"
+        "IMPORTANT: I must use the FULL names of all objects exactly as they appear in the question. "
+        "NO abbreviations, NO short forms, NO parenthetical aliases.\n"
+        "Every relationship must be stated as: '[Full Name] is to the [direction] of [Full Name]'\n\n"
         f"{format_block}\n"
         ">>> STEP 1: PARSE RELATIONSHIPS\n"
     )
@@ -1948,7 +1955,12 @@ def _extract_new_claims(self, text: str) -> List[Dict]:
 
         text_to_check = text[last_feedback_end:]
 
-        all_claims = extract_step2_claims(text_to_check)
+        # Get full entity names from Z3 solver for abbreviation resolution
+        entity_names = list({
+            k[:-2] for k in self.z3_solver.entities if k.endswith('_x')
+        })
+
+        all_claims = extract_step2_claims(text_to_check, entity_names=entity_names)
 
         new_claims = []
         for claim in all_claims:
@@ -2148,7 +2160,12 @@ async def verify(self, step: str, token_index: int, event, event_info):
             # Parse directional claims directly from the side-stream output.
             # The prompt pre-fills STEP 1 and ends at ">>> STEP 2:", so the
             # model's output is already STEP 2 content — no header to search for.
-            claims = parse_directional_claims_from_text(side_output)
+            entity_names = list({
+                k[:-2] for k in self.z3_solver.entities if k.endswith('_x')
+            })
+            claims = parse_directional_claims_from_text(
+                side_output, entity_names=entity_names
+            )
 
             logger.info(
                 f"[SpatialMap Phase 1] Parsed {len(claims)} claims from side-stream.\n"
@@ -2465,52 +2482,33 @@ async def verify(self, step: str, token_index: int, event, event_info):
                         count_corrections = self._count_feedback_blocks_count
                         self._count_feedback_blocks_count = count_corrections + 1
 
-                        if count_corrections == 0:
-                            # First attempt: explain why cardinal = 0
-                            if direction in ("north", "south"):
-                                diag_examples = "northeast or northwest"
-                            elif direction == "west":
-                                diag_examples = "northwest or southwest"
-                            else:  # east
-                                diag_examples = "northeast or southeast"
-
-                            feedback = (
-                                f"\n\n[VERIFIER FEEDBACK: Count mismatch!\n"
-                                f"  You answered {model_count} objects "
-                                f"'{direction}' of {reference}, but the "
-                                f"correct count is {z3_count}.\n"
-                                f"  IMPORTANT: '{direction.title()}' means "
-                                f"STRICTLY and EXACTLY {direction} — it "
-                                f"does NOT include diagonal directions "
-                                f"like {diag_examples}.\n"
-                                f"  An object that is Northwest of "
-                                f"{reference} is NOT North of {reference}"
-                                f" and NOT West of {reference}.\n"
-                                f"  Since all given relationships in this "
-                                f"problem are diagonal (NE/NW/SE/SW), no "
-                                f"object can be strictly "
-                                f"'{direction.title()}' of {reference}.\n"
-                                f"  The correct count is {z3_count}. "
-                                f"Please select the option for 0.]\n\n"
-                                f">>> STEP 3: ANSWER\n"
-                            )
-                        else:
-                            # Subsequent attempts: force the correct answer directly
-                            correct_option = None
-                            for opt, val in self._counting_options.items():
-                                if val == "0":
-                                    correct_option = opt
-                                    break
-                            if correct_option:
-                                feedback = (
-                                    f"\nThe correct answer is 0. "
-                                    f"\\boxed{{{correct_option}}}"
-                                )
-                            else:
-                                feedback = (
-                                    f"\nThe correct answer is 0. "
-                                    f"\\boxed{{0}}"
-                                )
+                        # Build direction-specific examples of what does NOT count
+                        if direction in ("north", "south"):
+                            diag_examples = "northeast or northwest"
+                        elif direction == "west":
+                            diag_examples = "northwest or southwest"
+                        else:  # east
+                            diag_examples = "northeast or southeast"
+
+                        feedback = (
+                            f"\n\n[VERIFIER FEEDBACK: Count mismatch!\n"
+                            f"  You answered {model_count} objects "
+                            f"'{direction}' of {reference}, but this "
+                            f"count is incorrect.\n"
+                            f"  IMPORTANT: '{direction.title()}' means "
+                            f"STRICTLY and EXACTLY {direction} — it "
+                            f"does NOT include diagonal directions "
+                            f"like {diag_examples}.\n"
+                            f"  An object that is {diag_examples.split(' or ')[0].title()} of "
+                            f"{reference} is NOT {direction.title()} of "
+                            f"{reference}.\n"
+                            f"  Please go through each object and check "
+                            f"whether it is EXACTLY to the "
+                            f"'{direction}' of {reference}, not to the "
+                            f"{diag_examples}. Then recount carefully "
+                            f"and select the correct option.]\n\n"
+                            f">>> STEP 3: ANSWER\n"
+                        )
 
                         logger.info(
                             f"[SpatialMap Phase 2b] Cardinal count "
diff --git a/interwhen/utils/spatialmap_verifier.py b/interwhen/utils/spatialmap_verifier.py
index 7a249fc8..f7d02291 100644
--- a/interwhen/utils/spatialmap_verifier.py
+++ b/interwhen/utils/spatialmap_verifier.py
@@ -316,7 +316,10 @@ def parse_model_count_from_answer(text_after_think: str, options: dict = None) -
         return None
 
 
-def parse_directional_claims_from_text(text: str) -> List[Dict]:
+def parse_directional_claims_from_text(
+    text: str,
+    entity_names: Optional[List[str]] = None,
+) -> List[Dict]:
     """
     Parse directional claims from model output text.
     
@@ -327,8 +330,43 @@ def parse_directional_claims_from_text(text: str) -> List[Dict]:
     - "X is NW of Y" (abbreviated directions)
     - "[X] is to the northwest of [Y]" (bracket-wrapped names)
     
+    If *entity_names* is provided, abbreviations in parsed claims (initials or a
+    name's first word / first two letters) resolve to the full name when unambiguous.
+    Parenthetical aliases like '(L)', '(Mo)' are stripped before parsing.
+    
     Returns list of IR dicts: [{"A": ..., "direction": ..., "B": ...}, ...]
     """
+    # Build abbreviation → full-name map from entity_names.
+    # When multiple entities share the same abbreviation, mark it as
+    # ambiguous (map to None) so we don't silently pick the wrong one.
+    abbrev_to_full: Dict[str, Optional[str]] = {}
+    if entity_names:
+        for name in entity_names:
+            words = re.split(r"[\s']+", name)
+            capitals = [w[0] for w in words if w and w[0].isupper()]
+            candidates: List[str] = []
+            if capitals:
+                candidates.append(capitals[0])                   # e.g. "M"
+                if len(capitals) >= 2:
+                    candidates.append(''.join(capitals[:2]))      # e.g. "MG"
+                    candidates.append(''.join(capitals))          # e.g. "MGM"
+            first_word = words[0] if words else ''
+            if len(first_word) >= 2:
+                candidates.append(first_word[:2])                 # e.g. "Mi"
+            if first_word:
+                candidates.append(first_word)                     # e.g. "Miniature"
+
+            for abbr in candidates:
+                if abbr in abbrev_to_full:
+                    if abbrev_to_full[abbr] != name:
+                        # Ambiguous — mark as None so we skip it
+                        abbrev_to_full[abbr] = None
+                else:
+                    abbrev_to_full[abbr] = name
+
+        # Remove ambiguous entries
+        abbrev_to_full = {k: v for k, v in abbrev_to_full.items() if v is not None}
+
     # Expand abbreviated directions before parsing
     abbrev_map = {
         'NW': 'northwest', 'NE': 'northeast',
@@ -344,10 +382,28 @@ def parse_directional_claims_from_text(text: str) -> List[Dict]:
     # Strip square brackets around entity names: [Foo Bar] → Foo Bar
     expanded_text = re.sub(r'\[([A-Z][A-Za-z\'\s]*?)\]', r'\1', expanded_text)
 
+    # Strip parenthetical aliases like (L), (M), (Mo), (IQC) — but not
+    # coordinate tuples like (0,0) or (a, b)
+    expanded_text = re.sub(r'\s*\([A-Z][A-Za-z]{0,3}\)', '', expanded_text)
+
     claims = []
     
     # Pattern: "X is (to the) DIRECTION of Y"
-    pattern = r"([A-Z][A-Za-z'][A-Za-z'\s]*?)\s+is\s+(?:to\s+the\s+)?(northwest|northeast|southwest|southeast|north|south|east|west)\s+of\s+([A-Z][A-Za-z'][A-Za-z'\s]*?)(?:\.|,|;|:|\s*[→✓✗]|\s*\n|\s*$|\s+(?:and|so|which|therefore|thus|but|since|because|while|whereas|however|hence|then|for|as|meaning|indicating|implying|suggesting|confirming|\())"
+    # Terminators include ⇒ for arrow-style claims.
+    # Entity capture allows one- or two-letter abbreviations such as "M" or "Mo"
+    # (resolved via the abbreviation map) or longer names starting with uppercase.
+    entity_pat = r"([A-Z][A-Za-z'][A-Za-z'\s]*?|[A-Z][a-z]?)"
+    pattern = (
+        entity_pat +
+        r"\s+is\s+(?:to\s+the\s+)?"
+        r"(northwest|northeast|southwest|southeast|north|south|east|west)"
+        r"\s+of\s+" +
+        entity_pat +
+        r"(?:\.|,|;|:|\s*[→⇒✓✗]|\s*\n|\s*$"
+        r"|\s+(?:and|so|which|therefore|thus|but|since|because|while|whereas"
+        r"|however|hence|then|for|as|meaning|indicating|implying|suggesting"
+        r"|confirming|\())"
+    )
     
     matches = re.finditer(pattern, expanded_text, re.IGNORECASE)
     
@@ -360,6 +416,13 @@ def parse_directional_claims_from_text(text: str) -> List[Dict]:
         entity_a = re.sub(r'[,\.\!\?]+$', '', entity_a).strip()
         entity_b = re.sub(r'[,\.\!\?]+$', '', entity_b).strip()
         
+        # Resolve abbreviations to full names if entity_names provided
+        if abbrev_to_full:
+            if entity_a in abbrev_to_full:
+                entity_a = abbrev_to_full[entity_a]
+            if entity_b in abbrev_to_full:
+                entity_b = abbrev_to_full[entity_b]
+
         # Skip if entities look like fragments, pronouns, or are too short
         skip_words = {'then', 'if', 'so', 'thus', 'therefore', 'it', 'this', 'that', 
                       'which', 'what', 'where', 'when', 'also', 'not', 'the', 'a', 'an'}
@@ -379,7 +442,10 @@ def parse_directional_claims_from_text(text: str) -> List[Dict]:
     return claims
 
 
-def extract_step2_claims(answer_text: str) -> List[Dict]:
+def extract_step2_claims(
+    answer_text: str,
+    entity_names: Optional[List[str]] = None,
+) -> List[Dict]:
     """
     Extract directional claims specifically from STEP 2 of the answer.
     
@@ -399,7 +465,7 @@ def extract_step2_claims(answer_text: str) -> List[Dict]:
         return []
     
     step2_text = match.group(0)
-    return parse_directional_claims_from_text(step2_text)
+    return parse_directional_claims_from_text(step2_text, entity_names=entity_names)
 
 
 def verify_spatialmap_step(

From 32b5adf8078af7c29634e46c5ddad78a13da636f Mon Sep 17 00:00:00 2001
From: root
 <root@hp-reasoning-t-vishakbhat-interwhen1-8gpus-trainer-0.hp-reasoning-t-vishakbhat-interwhen1-8gpus.bonete11.svc.cluster.local>
Date: Mon, 16 Mar 2026 16:00:43 +0000
Subject: [PATCH 13/17] Update feedback prompts and format blocks for appendix
 alignment

---
 .../game24_stepverifier.py                    |  2 +-
 .../TTSwithVerification/maze_stepverifier.py  |  2 +-
 .../spatialmap_stepverifier.py                |  8 ++--
 interwhen/monitors/thinkingPhaseVerifier.py   | 46 ++++++++-----------
 4 files changed, 24 insertions(+), 34 deletions(-)

diff --git a/examples/TTSwithVerification/game24_stepverifier.py b/examples/TTSwithVerification/game24_stepverifier.py
index cb1a799b..94cba6ee 100644
--- a/examples/TTSwithVerification/game24_stepverifier.py
+++ b/examples/TTSwithVerification/game24_stepverifier.py
@@ -14,7 +14,7 @@
 
 # ============== MODEL CONFIGURATION ==============
 # Change these model names to scale experiments easily
-MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
+MAIN_MODEL = "Qwen/QwQ-32B"
 EARLYSTOP_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 # =================================================
 
diff --git a/examples/TTSwithVerification/maze_stepverifier.py b/examples/TTSwithVerification/maze_stepverifier.py
index 9c60366f..34c88931 100644
--- a/examples/TTSwithVerification/maze_stepverifier.py
+++ b/examples/TTSwithVerification/maze_stepverifier.py
@@ -28,7 +28,7 @@
 logger = logging.getLogger(__name__)
 
 # ============== MODEL CONFIGURATION ==============
-MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
+MAIN_MODEL = "Qwen/QwQ-32B"
 # =================================================
 
 
diff --git a/examples/TTSwithVerification/spatialmap_stepverifier.py b/examples/TTSwithVerification/spatialmap_stepverifier.py
index d1f0286b..87ace705 100644
--- a/examples/TTSwithVerification/spatialmap_stepverifier.py
+++ b/examples/TTSwithVerification/spatialmap_stepverifier.py
@@ -27,7 +27,7 @@
 logger = logging.getLogger(__name__)
 
 # ============== MODEL CONFIGURATION ==============
-MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
+MAIN_MODEL = "Qwen/QwQ-32B"
 # =================================================
 
 
@@ -109,7 +109,7 @@ def count_tokens(text: str, tokenizer) -> int:
     return len(tokens)
 
 
-def init_llm_server(model_name, max_tokens=32768, port=8000):
+def init_llm_server(model_name, max_tokens=20480, port=8000):
     """Initialize LLM server configuration."""
     url = f"http://localhost:{port}/v1/completions"
     payload = {
@@ -194,9 +194,9 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
     parser.add_argument("--port", type=int, default=8000, help="vLLM server port")
     parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logging")
     parser.add_argument("--newline_threshold", type=int, default=20,
-                        help="Number of \\n\\n in thinking before triggering side verification")
+                        help="Number of \\\\n\\\\n in thinking before triggering side verification")
     parser.add_argument("--warmup", type=int, default=0,
-                        help="Number of \\n\\n to skip before starting side-chain verification (warmup period)")
+                        help="Number of \\\\n\\\\n to skip before starting side-chain verification (warmup period)")
     args = parser.parse_args()
 
     logger.info(f"Thinking-phase verification: always on")
diff --git a/interwhen/monitors/thinkingPhaseVerifier.py b/interwhen/monitors/thinkingPhaseVerifier.py
index 887ba1f5..76592f86 100644
--- a/interwhen/monitors/thinkingPhaseVerifier.py
+++ b/interwhen/monitors/thinkingPhaseVerifier.py
@@ -1032,16 +1032,7 @@ def _verify_relative_position_answer(self, boxed_answer: str) -> Tuple[bool, Opt
             f"  IMPORTANT: In this task, \"{asked_raw}\" means the GENERAL "
             f"COMPASS DIRECTION, NOT immediate adjacency. It asks whether E "
             f"is in the {actual} direction from S, regardless of distance or "
-            f"walls between them.\n"
-            f"  S is at row={sr}, col={sc}. E is at row={er}, col={ec}.\n"
-            f"  Row difference (E-S): {er - sr} ({'same row' if er == sr else ('E is below S' if er > sr else 'E is above S')}).\n"
-            f"  Col difference (E-S): {ec - sc} ({'same col' if ec == sc else ('E is right of S' if ec > sc else 'E is left of S')}).\n"
-            f"  Therefore E is {actual} of S → the correct answer to "
-            f"\"{asked_raw}\" is {expected_yn}.\n"
-            f"  Do NOT consider adjacency or walls. Just compare the row/col "
-            f"coordinates of S and E.\n"
-            f"  Output \\boxed{{{correct_letter}}} for {expected_yn}. "
-            f"This is the verified correct answer — do not argue.]\n\n"
+            f"walls between them.]\n\n"
         )
         return False, feedback
 
@@ -2333,11 +2324,7 @@ async def verify(self, step: str, token_index: int, event, event_info):
                                 f"{self._direction_question['entity_a']} "
                                 f"relative to "
                                 f"{self._direction_question['entity_b']} "
-                                f"based on the given constraints.\n"
-                                f"  The only consistent direction is "
-                                f"'{possible[0].title()}'.\n"
-                                f"  Please select option "
-                                f"{valid_options[0]}.]\n\n"
+                                f"based on the given constraints.]\n\n"
                                 f">>> STEP 3: ANSWER\n"
                             )
                         else:
@@ -2352,8 +2339,6 @@ async def verify(self, step: str, token_index: int, event, event_info):
                                 f"relative to "
                                 f"{self._direction_question['entity_b']} "
                                 f"based on the given constraints.\n"
-                                f"  The possible directions are: "
-                                f"{possible_str}.\n"
                                 f"  Please reconsider and choose the "
                                 f"correct option.]\n\n"
                                 f">>> STEP 3: ANSWER\n"
@@ -2463,6 +2448,10 @@ async def verify(self, step: str, token_index: int, event, event_info):
 
                 if is_cardinal:
                     # --- Cardinal: GT is always 0 ---
+                    # All spatial constraints in this dataset are diagonal
+                    # (NE, NW, SE, SW), so no object can be strictly
+                    # north/south/east/west of another. The answer is
+                    # always 0.
                     model_count = parse_model_count_from_answer(
                         recent_text, self._counting_options
                     )
@@ -2495,18 +2484,19 @@ async def verify(self, step: str, token_index: int, event, event_info):
                             f"  You answered {model_count} objects "
                             f"'{direction}' of {reference}, but this "
                             f"count is incorrect.\n"
-                            f"  IMPORTANT: '{direction.title()}' means "
-                            f"STRICTLY and EXACTLY {direction} — it "
-                            f"does NOT include diagonal directions "
-                            f"like {diag_examples}.\n"
-                            f"  An object that is {diag_examples.split(' or ')[0].title()} of "
-                            f"{reference} is NOT {direction.title()} of "
+                            f"  IMPORTANT: '{direction}' is a strict "
+                            f"cardinal direction — it means ONLY "
+                            f"exactly {direction}, NOT {diag_examples}."
+                            f"\n"
+                            f"  An object that is {diag_examples.split(' or ')[0]} of "
+                            f"{reference} is NOT {direction} of "
                             f"{reference}.\n"
-                            f"  Please go through each object and check "
-                            f"whether it is EXACTLY to the "
-                            f"'{direction}' of {reference}, not to the "
-                            f"{diag_examples}. Then recount carefully "
-                            f"and select the correct option.]\n\n"
+                            f"  Re-examine each object: is it described "
+                            f"as being strictly '{direction} of' "
+                            f"{reference}, or is the relationship "
+                            f"actually a diagonal direction like "
+                            f"{diag_examples}? Only count objects that "
+                            f"are strictly {direction}.]\n\n"
                             f">>> STEP 3: ANSWER\n"
                         )
 

From 59228fa8facdfdc11edb527f00df49bb68ac3047 Mon Sep 17 00:00:00 2001
From: root
 <root@hp-reasoning-t-vishakbhat-interwhen1-8gpus-trainer-0.hp-reasoning-t-vishakbhat-interwhen1-8gpus.bonete11.svc.cluster.local>
Date: Tue, 17 Mar 2026 10:10:57 +0000
Subject: [PATCH 14/17] Interwhen on game24, maze, spatialMap and meta prompt
 method

---
 .../game24_stepverifier.py                    |  112 +-
 examples/TTSwithVerification/game24meta.py    |   92 +-
 .../TTSwithVerification/maze_stepverifier.py  |   74 +-
 examples/TTSwithVerification/mazemeta.py      |   36 +-
 .../spatialmap_stepverifier.py                |   26 +-
 examples/TTSwithVerification/spatialmeta.py   |   46 +-
 interwhen/monitors/__init__.py                |    4 +-
 interwhen/monitors/_common.py                 |   51 +
 interwhen/monitors/stepVerifier.py            |  473 ++-
 interwhen/monitors/thinkingPhaseVerifier.py   | 2658 -----------------
 .../monitors/thinkingPhaseVerifierGame24.py   |  608 ++++
 .../monitors/thinkingPhaseVerifierMaze.py     |  878 ++++++
 .../thinkingPhaseVerifierSpatialMap.py        | 1023 +++++++
 13 files changed, 3158 insertions(+), 2923 deletions(-)
 create mode 100644 interwhen/monitors/_common.py
 delete mode 100644 interwhen/monitors/thinkingPhaseVerifier.py
 create mode 100644 interwhen/monitors/thinkingPhaseVerifierGame24.py
 create mode 100644 interwhen/monitors/thinkingPhaseVerifierMaze.py
 create mode 100644 interwhen/monitors/thinkingPhaseVerifierSpatialMap.py

diff --git a/examples/TTSwithVerification/game24_stepverifier.py b/examples/TTSwithVerification/game24_stepverifier.py
index 94cba6ee..597f9099 100644
--- a/examples/TTSwithVerification/game24_stepverifier.py
+++ b/examples/TTSwithVerification/game24_stepverifier.py
@@ -1,6 +1,14 @@
+"""
+Game of 24 experiment with thinking-phase step verification.
+
+Uses ThinkingPhaseStepVerifierGame24Monitor which:
+  - Verifies the model's intermediate expressions during <think> via side-streams
+  - Injects expression extraction after </think>
+  - Verifies the final \\boxed{} expression for correctness
+"""
+
 import argparse
 import asyncio
-import csv
 import logging
 import os
 import re
@@ -13,19 +21,27 @@
 from interwhen.monitors import ThinkingPhaseStepVerifierGame24Monitor
 
 # ============== MODEL CONFIGURATION ==============
-# Change these model names to scale experiments easily
 MAIN_MODEL = "Qwen/QwQ-32B"
-EARLYSTOP_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 # =================================================
 
+_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Walk up to find the repo root (the directory containing pyproject.toml); outputs are written to its parent directory.
+_dir = _SCRIPT_DIR
+while _dir != os.path.dirname(_dir) and not os.path.isfile(os.path.join(_dir, "pyproject.toml")):
+    _dir = os.path.dirname(_dir)
+_OUTPUT_ROOT = os.path.dirname(_dir)
+
 def get_model_short_name(model_name: str) -> str:
     """Extract a short, filesystem-safe name from the model path."""
     short_name = model_name.split("/")[-1]
     short_name = short_name.replace(" ", "_").replace(":", "-")
     return short_name
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS_SANITY/Gameof24results"):
+def get_output_dirs(main_model: str, base_dir: str = None):
     """Create and return output directory paths based on model name."""
+    if base_dir is None:
+        base_dir = os.path.join(_OUTPUT_ROOT, "Outputs_TTS", "Gameof24results")
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     
@@ -35,26 +51,20 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS_SANITY/G
         "csv_saved": os.path.join(output_base, "csv_saved"),
     }
     
-    # Create all directories
     for dir_path in dirs.values():
         os.makedirs(dir_path, exist_ok=True)
     
     return dirs
 
-def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS_SANITY/Gameof24_results") -> str:
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = None) -> str:
     """Generate log filename based on model name."""
+    if base_dir is None:
+        base_dir = os.path.join(_OUTPUT_ROOT, "Outputs_TTS", "Gameof24results")
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     os.makedirs(output_base, exist_ok=True)
     return os.path.join(output_base, f"EAT_{num_examples}examples.log")
 
-def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS_SANITY/Gameof24_results") -> str:
-    """Generate token CSV filename based on model name."""
-    model_short_name = get_model_short_name(main_model)
-    output_base = os.path.join(base_dir, model_short_name)
-    os.makedirs(output_base, exist_ok=True)
-    return os.path.join(output_base, f"EAT_{num_examples}examples.csv")
-
 def save_prompt(idx, prompt_with_answer, reason_dir):
     filename = os.path.join(reason_dir, f"reason_{idx}.txt")
     with open(filename, "w", encoding="utf-8") as f:
@@ -63,11 +73,7 @@ def save_prompt(idx, prompt_with_answer, reason_dir):
 logger = logging.getLogger(__name__)
 
 
-def load_game24_dataset():
-    ds = load_dataset("nlile/24-game", split="train")
-    return ds
-
-def init_llm_server(modelname, max_tokens=200, port=8000):
+def init_llm_server(modelname, max_tokens=32768, port=8000):
     url = f"http://localhost:{port}/v1/completions"
     payload = {
         "model": modelname,
@@ -75,13 +81,13 @@ def init_llm_server(modelname, max_tokens=200, port=8000):
         "top_k": 20,
         "top_p": 0.95,
         "min_p": 0.0,
-        "do_sample" : True,
+        "do_sample": True,
         "temperature": 0.6,
         "stream": True,
         "logprobs": 20,
         "use_beam_search": False,
         "prompt_cache": True,
-        "seed" : 42
+        "seed": 42
     }
     headers = {"Content-Type": "application/json"}
     return {"url": url, "payload": payload, "headers": headers}
@@ -232,17 +238,16 @@ def evaluate_game24_answer(answer, nums):
 if __name__ == "__main__":
 
     parser = argparse.ArgumentParser(description="Game of 24 step-by-step solver with monitors")
-    parser.add_argument("--thinking", "-t", action="store_true", help="Enable chain-of-thought output")
-    parser.add_argument("--monitor", "-m", default = True, action="store_true", help="Enable step-by-step monitor")
     parser.add_argument("--num_examples", "-n", type=int, default=1362, help="Number of examples to run")
     parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logs")
     parser.add_argument("--newline_threshold", type=int, default=20, help="Number of newlines in thinking before forcing step verification")
-    parser.add_argument("--main_model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
-    parser.add_argument("--earlystop_model", type=str, default=EARLYSTOP_MODEL, help="Model to use for early stopping")
+    parser.add_argument("--max_corrections", type=int, default=3, help="Maximum number of correction attempts per example")
+    parser.add_argument("--warmup", type=int, default=4, help="Number of \\n to skip before starting side-chain verification")
+    parser.add_argument("--model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
+    parser.add_argument("--port", type=int, default=8000, help="vLLM server port")
     args = parser.parse_args()
 
-    main_model = args.main_model
-    earlystop_model = args.earlystop_model
+    main_model = args.model
 
     output_dirs = get_output_dirs(main_model)
     logfile = get_log_filename(main_model, args.num_examples)
@@ -261,12 +266,13 @@ def evaluate_game24_answer(answer, nums):
     )
 
     logger.info(f"Main model: {main_model}")
-    logger.info(f"Early stop model: {earlystop_model}")
     logger.info(f"Output directory: {output_dirs['base']}")
+    logger.info(f"Newline threshold: {args.newline_threshold}")
+    logger.info(f"Warmup: {args.warmup}")
 
-    dataset = load_game24_dataset()
+    dataset = load_dataset("nlile/24-game", split="train")
 
-    llm_server = init_llm_server(main_model, max_tokens=32768, port=8000)
+    llm_server = init_llm_server(main_model, port=args.port)
 
     logger.info(f"Loading tokenizer for {main_model}...")
     tokenizer = AutoTokenizer.from_pretrained(main_model, trust_remote_code=True)
@@ -285,33 +291,34 @@ def evaluate_game24_answer(answer, nums):
         example = dataset[idx]
         nums = example["numbers"]
         prompt = build_prompt(nums)
+        full_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
 
-        if args.monitor:
-            monitors=(ThinkingPhaseStepVerifierGame24Monitor(
-                name="game24_verifier",
-                original_numbers=nums,
-                llm_server=llm_server,
-                prompt=f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n",
-                newline_threshold=args.newline_threshold,
-                max_corrections=3,
-                answer_start_token="</think>",
-                warmup_newlines=4,
-            ),)
-        else:
-            monitors = ()
+        monitor = ThinkingPhaseStepVerifierGame24Monitor(
+            name="game24_verifier",
+            original_numbers=nums,
+            llm_server=llm_server,
+            prompt=full_prompt,
+            newline_threshold=args.newline_threshold,
+            max_corrections=args.max_corrections,
+            answer_start_token="</think>",
+            warmup_newlines=args.warmup,
+        )
 
-        logger.info(f"---- length of monitors {len(monitors)} ----")
         logger.info(f"---- Example {idx+1} ----")
         logger.info(f"Numbers: {nums}")
 
-        answer = asyncio.run(stream_completion(
-            f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n",
-            llm_server=llm_server,
-            monitors=monitors,
-            add_delay=False,
-            termination_requires_validation=False,
-            async_execution=True
-        ))
+        try:
+            answer = asyncio.run(stream_completion(
+                full_prompt,
+                llm_server=llm_server,
+                monitors=(monitor,),
+                add_delay=False,
+                termination_requires_validation=False,
+                async_execution=True
+            ))
+        except Exception as e:
+            logger.error(f"Error running example {idx}: {e}")
+            continue
 
         save_prompt(idx, answer, reason_dir)
         logger.info(f"Raw final output:\n{answer}")
@@ -352,8 +359,7 @@ def evaluate_game24_answer(answer, nums):
         f.write(f"Game of 24 Evaluation Results\n")
         f.write(f"{'='*50}\n\n")
         f.write(f"Model: {main_model}\n")
-        f.write(f"Number of Examples: {N}\n")
-        f.write(f"Monitor Enabled: {args.monitor}\n\n")
+        f.write(f"Number of Examples: {N}\n\n")
         f.write(f"Results:\n")
         f.write(f"---------\n")
         f.write(f"Correct: {num_correct}/{N}\n")
diff --git a/examples/TTSwithVerification/game24meta.py b/examples/TTSwithVerification/game24meta.py
index fc8fd318..a48e1107 100644
--- a/examples/TTSwithVerification/game24meta.py
+++ b/examples/TTSwithVerification/game24meta.py
@@ -1,6 +1,5 @@
 import argparse
 import asyncio
-import csv
 import json
 import logging
 import os
@@ -8,26 +7,33 @@
 import numpy as np
 
 from datasets import load_dataset
-from openai import OpenAI
 from transformers import AutoTokenizer
 
 from interwhen import stream_completion
-from interwhen.monitors import KstableAnswerGame24Monitor, StepVerifierGame24Monitor
+from interwhen.monitors import StepVerifierGame24Monitor
 
 # ============== MODEL CONFIGURATION ==============
-# Change these model names to scale experiments easily
-MAIN_MODEL = "microsoft/Phi-4-reasoning"
-EARLYSTOP_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+MAIN_MODEL = "Qwen/QwQ-32B"
 # =================================================
 
+_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Walk up to find the repo root (the directory containing pyproject.toml); outputs are written to its parent directory.
+_dir = _SCRIPT_DIR
+while _dir != os.path.dirname(_dir) and not os.path.isfile(os.path.join(_dir, "pyproject.toml")):
+    _dir = os.path.dirname(_dir)
+_OUTPUT_ROOT = os.path.dirname(_dir)
+
 def get_model_short_name(model_name: str) -> str:
     """Extract a short, filesystem-safe name from the model path."""
     short_name = model_name.split("/")[-1]
     short_name = short_name.replace(" ", "_").replace(":", "-")
     return short_name
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/Gameof24results/metaPrompt"):
+def get_output_dirs(main_model: str, base_dir: str = None):
     """Create and return output directory paths based on model name."""
+    if base_dir is None:
+        base_dir = os.path.join(_OUTPUT_ROOT, "Outputs_TTS", "Gameof24results", "metaPrompt")
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     
@@ -43,20 +49,15 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/Gameof24
     
     return dirs
 
-def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS/Gameof24_results/metaPrompt") -> str:
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = None) -> str:
     """Generate log filename based on model name."""
+    if base_dir is None:
+        base_dir = os.path.join(_OUTPUT_ROOT, "Outputs_TTS", "Gameof24results", "metaPrompt")
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     os.makedirs(output_base, exist_ok=True)
     return os.path.join(output_base, f"EAT_{num_examples}examples.log")
 
-def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS/Gameof24_results/metaPrompt") -> str:
-    """Generate token CSV filename based on model name."""
-    model_short_name = get_model_short_name(main_model)
-    output_base = os.path.join(base_dir, model_short_name)
-    os.makedirs(output_base, exist_ok=True)
-    return os.path.join(output_base, f"EAT_{num_examples}examples.csv")
-
 def save_prompt(idx, prompt_with_answer, reason_dir):
     filename = os.path.join(reason_dir, f"reason_{idx}.txt")
     with open(filename, "w", encoding="utf-8") as f:
@@ -65,23 +66,21 @@ def save_prompt(idx, prompt_with_answer, reason_dir):
 logger = logging.getLogger(__name__)
 
 
-def load_game24_dataset():
-    ds = load_dataset("nlile/24-game", split="train")
-    return ds
-
-def init_llm_server(modelname, max_tokens=200, port=8001):
+def init_llm_server(modelname, max_tokens=32768, port=8000):
     url = f"http://localhost:{port}/v1/completions"
     payload = {
         "model": modelname,
         "max_tokens": max_tokens,
-        "top_k": 50,
+        "top_k": 20,
         "top_p": 0.95,
-        "temperature": 0.8,
+        "min_p": 0.0,
+        "do_sample": True,
+        "temperature": 0.6,
         "stream": True,
         "logprobs": 20,
         "use_beam_search": False,
         "prompt_cache": True,
-        "seed" : 42
+        "seed": 42
     }
     headers = {"Content-Type": "application/json"}
     return {"url": url, "payload": payload, "headers": headers}
@@ -342,22 +341,20 @@ def evaluate_game24_answer(answer, nums):
 if __name__ == "__main__":
 
     parser = argparse.ArgumentParser(description="Game of 24 step-by-step solver with monitors")
-    parser.add_argument("--thinking", "-t", action="store_true", help="Enable chain-of-thought output")
-    parser.add_argument("--monitor", "-m", default = False, action="store_true", help="Enable step-by-step monitor")
+    parser.add_argument("--monitor", "-m", action="store_true", help="Enable step-by-step monitor")
     parser.add_argument("--num_examples", "-n", type=int, default=1, help="Number of examples to run")
     parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logs")
-    parser.add_argument("--main_model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
-    parser.add_argument("--earlystop_model", type=str, default=EARLYSTOP_MODEL, help="Model to use for early stopping")
+    parser.add_argument("--model", type=str, default=MAIN_MODEL, help="Model to use for generation")
+    parser.add_argument("--port", type=int, default=8000, help="vLLM server port")
+    parser.add_argument("--max_corrections", type=int, default=5,
+                        help="Maximum number of correction attempts per example")
     args = parser.parse_args()
 
-    # Use models from args (allows command-line override)
-    main_model = args.main_model
-    earlystop_model = args.earlystop_model
+    main_model = args.model
 
     # Setup output directories based on model name
     output_dirs = get_output_dirs(main_model)
     logfile = get_log_filename(main_model, args.num_examples)
-    token_filename = get_token_filename(main_model, args.num_examples)
     reason_dir = output_dirs["reasoning"]
 
     log_level = logging.DEBUG if args.debug else logging.INFO
@@ -373,12 +370,11 @@ def evaluate_game24_answer(answer, nums):
     )
 
     logger.info(f"Main model: {main_model}")
-    logger.info(f"Early stop model: {earlystop_model}")
     logger.info(f"Output directory: {output_dirs['base']}")
 
-    dataset = load_game24_dataset()
+    dataset = load_dataset("nlile/24-game", split="train")
 
-    llm_server = init_llm_server(main_model, max_tokens=22000)
+    llm_server = init_llm_server(main_model, port=args.port)
 
     # Load tokenizer for accurate token counting
     logger.info(f"Loading tokenizer for {main_model}...")
@@ -400,37 +396,23 @@ def evaluate_game24_answer(answer, nums):
         prompt = build_meta_prompt_from_example(nums)
 
         if args.monitor:
-            # Use StepVerifierGame24Monitor to detect when equation stabilizes k times
-            monitors=(StepVerifierGame24Monitor(
-                name="game24_kstable",
-                answer_start_token = "</think>",
-                original_numbers=nums,  # Validate equations use exactly these numbers
+            monitors = (StepVerifierGame24Monitor(
+                name="game24_verifier",
+                answer_start_token="</think>",
+                original_numbers=nums,
+                max_corrections=args.max_corrections,
             ),)
         else:
             monitors = ()
 
-        logger.info(f"---- length of monitors {len(monitors)} ----")
         logger.info(f"---- Example {idx+1} ----")
         logger.info(f"Numbers: {nums}")
 
-        system_prompt = (
-            "You are Phi, a language model trained by Microsoft to help users. "
-            "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
-            "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
-            "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
-            "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
-            "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
-            "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
-            "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
-            "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
-            "explorations, and reflections from the Thought section, systematically present the final solution that you "
-            "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
-            "to reach the conclusion. Now, try to solve the following question through the above guidelines."
-        )
+        full_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
 
         try:
             answer = asyncio.run(stream_completion(
-                f"<|im_start|>system<|im_sep|>\n{system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n",
+                full_prompt,
                 llm_server=llm_server,
                 monitors=monitors,
                 add_delay=False,
diff --git a/examples/TTSwithVerification/maze_stepverifier.py b/examples/TTSwithVerification/maze_stepverifier.py
index 34c88931..9884065d 100644
--- a/examples/TTSwithVerification/maze_stepverifier.py
+++ b/examples/TTSwithVerification/maze_stepverifier.py
@@ -15,7 +15,6 @@
 import os
 import re
 import numpy as np
-from pathlib import Path
 
 from datasets import load_dataset
 from transformers import AutoTokenizer
@@ -31,6 +30,14 @@
 MAIN_MODEL = "Qwen/QwQ-32B"
 # =================================================
 
+_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Walk up to the repo root (the directory containing pyproject.toml); outputs are written under its parent
+_dir = _SCRIPT_DIR
+while _dir != os.path.dirname(_dir) and not os.path.isfile(os.path.join(_dir, "pyproject.toml")):
+    _dir = os.path.dirname(_dir)
+_OUTPUT_ROOT = os.path.dirname(_dir)
+
 
 def get_model_short_name(model_name: str) -> str:
     """Extract a short, filesystem-safe name from the model path."""
@@ -39,8 +46,10 @@ def get_model_short_name(model_name: str) -> str:
     return short_name
 
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS_SANITY/MazeResults"):
+def get_output_dirs(main_model: str, base_dir: str = None):
     """Create and return output directory paths based on model name."""
+    if base_dir is None:
+        base_dir = os.path.join(_OUTPUT_ROOT, "Outputs_TTS", "MazeResults")
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     
@@ -92,22 +101,6 @@ def count_tokens(text: str, tokenizer) -> int:
     return len(tokens)
 
 
-# def get_question_type_from_index(idx: int) -> str:
-#     """Determine question type based on index range.
-    
-#     Dataset structure:
-#     - 3000-3499: right turns
-#     - 3500-3999: total turns
-#     - 4000-4500: relative position
-#     """
-#     if idx < 3500:
-#         return "right_turns"
-#     elif idx < 4000:
-#         return "total_turns"
-#     else:
-#         return "relative_position"
-
-
 def init_llm_server(modelname, max_tokens=32768, port=8000):
     url = f"http://localhost:{port}/v1/completions"
     payload = {
@@ -116,27 +109,18 @@ def init_llm_server(modelname, max_tokens=32768, port=8000):
         "top_k": 20,
         "top_p": 0.95,
         "min_p": 0.0,
-        "do_sample" : True,
+        "do_sample": True,
         "temperature": 0.6,
         "stream": True,
         "logprobs": 20,
         "use_beam_search": False,
         "prompt_cache": True,
-        "seed" : 42
+        "seed": 42
     }
     headers = {"Content-Type": "application/json"}
     return {"url": url, "payload": payload, "headers": headers}
 
 
-def save_output(idx: int, output: str, output_dir: str):
-    """Save output to file."""
-    os.makedirs(output_dir, exist_ok=True)
-    filepath = os.path.join(output_dir, f"output_{idx}.txt")
-    with open(filepath, 'w') as f:
-        f.write(output)
-    logger.info(f"Saved output to {filepath}")
-
-
 def save_prompt(idx, prompt_with_answer, reason_dir):
     """Save reasoning trace to file."""
     os.makedirs(reason_dir, exist_ok=True)
@@ -146,21 +130,16 @@ def save_prompt(idx, prompt_with_answer, reason_dir):
     logger.info(f"Saved reasoning trace to {filename}")
 
 
-def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS_SANITY/MazeResults") -> str:
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = None) -> str:
     """Generate log filename based on model name."""
+    if base_dir is None:
+        base_dir = os.path.join(_OUTPUT_ROOT, "Outputs_TTS", "MazeResults")
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     os.makedirs(output_base, exist_ok=True)
     return os.path.join(output_base, f"EAT_{num_examples}examples.log")
 
 
-def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_TTS_SANITY/MazeResults") -> str:
-    """Generate token CSV filename based on model name."""
-    model_short_name = get_model_short_name(main_model)
-    output_base = os.path.join(base_dir, model_short_name)
-    os.makedirs(output_base, exist_ok=True)
-    return os.path.join(output_base, f"EAT_{num_examples}examples.csv")
-
 def evaluate_mcq_answer(answer, options, ground_truth):
     sol = extract_solution_mcq(answer)
     gt_sol = str(ground_truth).strip()
@@ -195,9 +174,9 @@ def evaluate_mcq_answer(answer, options, ground_truth):
     parser.add_argument("--port", type=int, default=8000, help="vLLM server port")
     parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logging")
     parser.add_argument("--newline_threshold", type=int, default=20,
-                        help="Number of \\n\\n in thinking before triggering side verification")
+                        help="Number of \\n in thinking before triggering side verification")
     parser.add_argument("--warmup", type=int, default=0,
-                        help="Number of \\n\\n to skip before starting side-chain verification (warmup period)")
+                        help="Number of \\n to skip before starting side-chain verification (warmup period)")
     args = parser.parse_args()
 
     logger.info(f"Thinking-phase verification: always on")
@@ -251,23 +230,6 @@ def evaluate_mcq_answer(answer, options, ground_truth):
         pattern = rf'\b({keys})\.\s*([A-Za-z0-9]+)\b'
         options = dict(re.findall(pattern, user_prompt))
         
-        # Build prompt with Phi-4-reasoning system prompt
-        # phi_system_prompt = (
-        #     "You are Phi, a language model trained by Microsoft to help users. "
-        #     "Your role as an assistant involves thoroughly exploring questions through a systematic thinking process "
-        #     "before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle "
-        #     "of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop "
-        #     "well-considered thinking process. Please structure your response into two main sections: Thought and Solution "
-        #     "using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, "
-        #     "detail your reasoning process in steps. Each step should include detailed considerations such as analysing "
-        #     "questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, "
-        #     "refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, "
-        #     "explorations, and reflections from the Thought section, systematically present the final solution that you "
-        #     "deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed "
-        #     "to reach the conclusion. Now, try to solve the following question through the above guidelines."
-        # )
-        # full_prompt = f"<|im_start|>system<|im_sep|>\n{phi_system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{user_prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n<think>\n"
-
         full_prompt = f"<|im_start|>system\n{pre_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
         
         # Parse maze from prompt
diff --git a/examples/TTSwithVerification/mazemeta.py b/examples/TTSwithVerification/mazemeta.py
index 3a74a260..f058c95a 100644
--- a/examples/TTSwithVerification/mazemeta.py
+++ b/examples/TTSwithVerification/mazemeta.py
@@ -11,7 +11,6 @@
 import os
 import re
 import numpy as np
-from pathlib import Path
 
 from datasets import load_dataset
 from transformers import AutoTokenizer
@@ -24,9 +23,17 @@
 logger = logging.getLogger(__name__)
 
 # ============== MODEL CONFIGURATION ==============
-MAIN_MODEL = "microsoft/Phi-4-reasoning"
+MAIN_MODEL = "Qwen/QwQ-32B"
 # =================================================
 
+_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Walk up to the repo root (the directory containing pyproject.toml); outputs are written under its parent
+_dir = _SCRIPT_DIR
+while _dir != os.path.dirname(_dir) and not os.path.isfile(os.path.join(_dir, "pyproject.toml")):
+    _dir = os.path.dirname(_dir)
+_OUTPUT_ROOT = os.path.dirname(_dir)
+
 
 def get_model_short_name(model_name: str) -> str:
     """Extract a short, filesystem-safe name from the model path."""
@@ -35,8 +42,10 @@ def get_model_short_name(model_name: str) -> str:
     return short_name
 
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/MazeResults/metaPrompt"):
+def get_output_dirs(main_model: str, base_dir: str = None):
     """Create and return output directory paths based on model name."""
+    if base_dir is None:
+        base_dir = os.path.join(_OUTPUT_ROOT, "Outputs_TTS", "MazeResults", "metaPrompt")
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     
@@ -211,15 +220,17 @@ def get_question_type_from_index(idx: int) -> str:
         return "relative_position"
 
 
-def init_llm_server(model_name, max_tokens=22000, port=8000):
+def init_llm_server(model_name, max_tokens=32768, port=8000):
     """Initialize LLM server configuration."""
     url = f"http://localhost:{port}/v1/completions"
     payload = {
         "model": model_name,
         "max_tokens": max_tokens,
-        "top_k": 50,
+        "top_k": 20,
         "top_p": 0.95,
-        "temperature": 0.8,
+        "min_p": 0.0,
+        "do_sample": True,
+        "temperature": 0.6,
         "stream": True,
         "logprobs": 20,
         "use_beam_search": False,
@@ -229,15 +240,6 @@ def init_llm_server(model_name, max_tokens=22000, port=8000):
     headers = {"Content-Type": "application/json"}
     return {"url": url, "payload": payload, "headers": headers}
 
-
-def save_output(idx: int, output: str, output_dir: str):
-    """Save output to file."""
-    os.makedirs(output_dir, exist_ok=True)
-    filepath = os.path.join(output_dir, f"output_{idx}.txt")
-    with open(filepath, 'w') as f:
-        f.write(output)
-    logger.info(f"Saved output to {filepath}")
-
 def evaluate_maze_answer(answer, options, ground_truth):
     """
     Evaluate a Maze MCQ answer and return (is_correct, extracted_answer, message).
@@ -340,8 +342,8 @@ def evaluate_maze_answer(answer, options, ground_truth):
         pattern = rf'\b({keys})\.\s*([A-Za-z0-9]+)\b'
         options = dict(re.findall(pattern, user_prompt))
         
-        # Build full prompt with Phi-4-reasoning ChatML format
-        full_prompt = f"<|im_start|>system<|im_sep|>\n{system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{user_prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n<think>\n"
+        # Build full prompt with ChatML format
+        full_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
         
         # Parse maze from prompt
         grid, start_pos, exit_pos = parse_maze_from_prompt(user_prompt)
diff --git a/examples/TTSwithVerification/spatialmap_stepverifier.py b/examples/TTSwithVerification/spatialmap_stepverifier.py
index 87ace705..59d8b8aa 100644
--- a/examples/TTSwithVerification/spatialmap_stepverifier.py
+++ b/examples/TTSwithVerification/spatialmap_stepverifier.py
@@ -15,7 +15,6 @@
 import os
 import re
 import numpy as np
-from pathlib import Path
 
 from datasets import load_dataset
 from transformers import AutoTokenizer
@@ -30,6 +29,14 @@
 MAIN_MODEL = "Qwen/QwQ-32B"
 # =================================================
 
+_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Walk up to the repo root (the directory containing pyproject.toml); outputs are written under its parent
+_dir = _SCRIPT_DIR
+while _dir != os.path.dirname(_dir) and not os.path.isfile(os.path.join(_dir, "pyproject.toml")):
+    _dir = os.path.dirname(_dir)
+_OUTPUT_ROOT = os.path.dirname(_dir)
+
 
 def get_model_short_name(model_name: str) -> str:
     """Extract a short, filesystem-safe name from the model path."""
@@ -38,8 +45,10 @@ def get_model_short_name(model_name: str) -> str:
     return short_name
 
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS_SANITY/SpatialMapResults"):
+def get_output_dirs(main_model: str, base_dir: str = None):
     """Create and return output directory paths based on model name."""
+    if base_dir is None:
+        base_dir = os.path.join(_OUTPUT_ROOT, "Outputs_TTS", "SpatialMapResults")
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     
@@ -130,15 +139,6 @@ def init_llm_server(model_name, max_tokens=20480, port=8000):
     return {"url": url, "payload": payload, "headers": headers}
 
 
-def save_output(idx: int, output: str, output_dir: str):
-    """Save output to file."""
-    os.makedirs(output_dir, exist_ok=True)
-    filepath = os.path.join(output_dir, f"output_{idx}.txt")
-    with open(filepath, 'w') as f:
-        f.write(output)
-    logger.info(f"Saved output to {filepath}")
-
-
 def save_prompt(idx, prompt_with_answer, reason_dir):
     """Save reasoning trace to file."""
     os.makedirs(reason_dir, exist_ok=True)
@@ -194,9 +194,9 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
     parser.add_argument("--port", type=int, default=8000, help="vLLM server port")
     parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logging")
     parser.add_argument("--newline_threshold", type=int, default=20,
-                        help="Number of \\\\n\\\\n in thinking before triggering side verification")
+                        help="Number of \\n in thinking before triggering side verification")
     parser.add_argument("--warmup", type=int, default=0,
-                        help="Number of \\\\n\\\\n to skip before starting side-chain verification (warmup period)")
+                        help="Number of \\n to skip before starting side-chain verification (warmup period)")
     args = parser.parse_args()
 
     logger.info(f"Thinking-phase verification: always on")
diff --git a/examples/TTSwithVerification/spatialmeta.py b/examples/TTSwithVerification/spatialmeta.py
index 0cfc7672..78e8e30d 100644
--- a/examples/TTSwithVerification/spatialmeta.py
+++ b/examples/TTSwithVerification/spatialmeta.py
@@ -11,7 +11,6 @@
 import os
 import re
 import numpy as np
-from pathlib import Path
 
 from datasets import load_dataset
 from transformers import AutoTokenizer
@@ -23,9 +22,17 @@
 logger = logging.getLogger(__name__)
 
 # ============== MODEL CONFIGURATION ==============
-MAIN_MODEL = "microsoft/Phi-4-reasoning"
+MAIN_MODEL = "Qwen/QwQ-32B"
 # =================================================
 
+_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Walk up to the repo root (the directory containing pyproject.toml); outputs are written under its parent
+_dir = _SCRIPT_DIR
+while _dir != os.path.dirname(_dir) and not os.path.isfile(os.path.join(_dir, "pyproject.toml")):
+    _dir = os.path.dirname(_dir)
+_OUTPUT_ROOT = os.path.dirname(_dir)
+
 
 def get_model_short_name(model_name: str) -> str:
     """Extract a short, filesystem-safe name from the model path."""
@@ -34,8 +41,10 @@ def get_model_short_name(model_name: str) -> str:
     return short_name
 
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_TTS/SpatialMapResults/metaPrompt"):
+def get_output_dirs(main_model: str, base_dir: str = None):
     """Create and return output directory paths based on model name."""
+    if base_dir is None:
+        base_dir = os.path.join(_OUTPUT_ROOT, "Outputs_TTS", "SpatialMapResults", "metaPrompt")
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     
@@ -55,6 +64,14 @@ def remove_last_paragraph(s: str) -> str:
     return s[:-143] if len(s) > 143 else s
 
 
+def save_prompt(idx, prompt_with_answer, reason_dir):
+    """Save reasoning trace to a text file."""
+    os.makedirs(reason_dir, exist_ok=True)
+    filename = os.path.join(reason_dir, f"reason_{idx}.txt")
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write(prompt_with_answer)
+
+
 def get_question_type(idx: int) -> str:
     """Determine question type based on index range.
     
@@ -265,15 +282,17 @@ def count_tokens(text: str, tokenizer) -> int:
     return len(tokens)
 
 
-def init_llm_server(model_name, max_tokens=22000, port=8001):
+def init_llm_server(model_name, max_tokens=32768, port=8000):
     """Initialize LLM server configuration."""
     url = f"http://localhost:{port}/v1/completions"
     payload = {
         "model": model_name,
         "max_tokens": max_tokens,
-        "top_k": 50,
+        "top_k": 20,
         "top_p": 0.95,
-        "temperature": 0.8,
+        "min_p": 0.0,
+        "do_sample": True,
+        "temperature": 0.6,
         "stream": True,
         "logprobs": 20,
         "use_beam_search": False,
@@ -283,15 +302,6 @@ def init_llm_server(model_name, max_tokens=22000, port=8001):
     headers = {"Content-Type": "application/json"}
     return {"url": url, "payload": payload, "headers": headers}
 
-
-def save_output(idx: int, output: str, output_dir: str):
-    """Save output to file."""
-    os.makedirs(output_dir, exist_ok=True)
-    filepath = os.path.join(output_dir, f"output_{idx}.txt")
-    with open(filepath, 'w') as f:
-        f.write(output)
-    logger.info(f"Saved output to {filepath}")
-
 def evaluate_spatialmap_answer(answer, options, ground_truth):
     """
     Evaluate a SpatialMap MCQ answer and return (is_correct, extracted_answer, message).
@@ -408,8 +418,8 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
         # Determine question type
         question_type = get_question_type(idx)
         
-        # Build full prompt with Phi-4-reasoning ChatML format
-        full_prompt = f"<|im_start|>system<|im_sep|>\n{system_prompt}<|im_end|>\n<|im_start|>user<|im_sep|>\n{user_prompt}<|im_end|>\n<|im_start|>assistant<|im_sep|>\n<think>\n"
+        # Build full prompt with ChatML format
+        full_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
         
         logger.info(f"\n{'='*60}")
         logger.info(f"Example {idx} ({question_type})")
@@ -459,7 +469,7 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
         total_examples += 1
         stats_by_type[question_type]["total"] += 1
         # Save output
-        save_output(idx, answer, reason_dir)
+        save_prompt(idx, answer, reason_dir)
         
         # Log result
         result = {
diff --git a/interwhen/monitors/__init__.py b/interwhen/monitors/__init__.py
index 2a366822..73b222b1 100644
--- a/interwhen/monitors/__init__.py
+++ b/interwhen/monitors/__init__.py
@@ -2,4 +2,6 @@
 from .k_stable import KstableAnswerMCQMonitor, KstableAnswerGame24Monitor
 from .stepVerifier import StepVerifierGame24Monitor, StepVerifierMazeMonitor, StepVerifierSpatialMapMonitor
 from .earlyStopping import EATMonitor, DEERMonitor
-from .thinkingPhaseVerifier import ThinkingPhaseStepVerifierGame24Monitor, ThinkingPhaseStepVerifierMazeMonitor, ThinkingPhaseStepVerifierSpatialMapMonitor
\ No newline at end of file
+from .thinkingPhaseVerifierGame24 import ThinkingPhaseStepVerifierGame24Monitor
+from .thinkingPhaseVerifierMaze import ThinkingPhaseStepVerifierMazeMonitor
+from .thinkingPhaseVerifierSpatialMap import ThinkingPhaseStepVerifierSpatialMapMonitor
\ No newline at end of file
diff --git a/interwhen/monitors/_common.py b/interwhen/monitors/_common.py
new file mode 100644
index 00000000..1c047fde
--- /dev/null
+++ b/interwhen/monitors/_common.py
@@ -0,0 +1,51 @@
+"""
+Shared utilities for thinking-phase verifier monitors.
+"""
+
+import re
+from typing import Optional
+
+
+def find_complete_boxed(text: str) -> Optional[object]:
+    """Find a complete \\boxed{...} in text, handling nested braces.
+
+    Unlike ``re.search(r'\\boxed\\{[^}]+\\}', text)`` this correctly
+    handles LaTeX like ``\\boxed{12\\frac{1}{2}}`` where the naive
+    ``[^}]+`` pattern would stop at the first ``}``.
+
+    Returns a match-like object with ``.start()``, ``.end()`` and ``.group()``
+    spanning the first complete, non-empty ``\\boxed{...}`` (outer braces
+    included), or ``None`` if no such expression is found.
+    """
+    idx = 0
+    while idx < len(text):
+        pos = text.find(r'\boxed{', idx)
+        if pos == -1:
+            return None
+        # Start counting braces from after '\boxed{'
+        brace_start = pos + len(r'\boxed{')
+        depth = 1
+        i = brace_start
+        while i < len(text) and depth > 0:
+            if text[i] == '{':
+                depth += 1
+            elif text[i] == '}':
+                depth -= 1
+            i += 1
+        if depth == 0:
+            match_start = pos
+            match_end = i  # i is right after the closing '}'
+            content = text[brace_start:i - 1].strip()
+            if content:
+                class _BoxedMatch:
+                    def __init__(self, s, e):
+                        self._start, self._end = s, e
+                    def start(self):
+                        return self._start
+                    def end(self):
+                        return self._end
+                    def group(self, n=0):
+                        return text[self._start:self._end]
+                return _BoxedMatch(match_start, match_end)
+        idx = pos + 1
+    return None
diff --git a/interwhen/monitors/stepVerifier.py b/interwhen/monitors/stepVerifier.py
index be73b80f..a5f12624 100644
--- a/interwhen/monitors/stepVerifier.py
+++ b/interwhen/monitors/stepVerifier.py
@@ -1,52 +1,25 @@
 import re
+import logging
 from typing import List, Tuple, Optional, Set, Dict
 from .base import VerifyMonitor
+from ._common import find_complete_boxed
 from ..utils.game24_verifier import parse_step, verify_step, format_feedback
 from ..utils.maze_verifier import (
     Direction, parse_direction, parse_maze_step, verify_maze_step,
     verify_locate_section, format_maze_feedback, format_locate_feedback,
-    parse_maze_from_prompt
+    parse_maze_from_prompt, compute_relative_direction,
 )
 from ..utils.spatialmap_verifier import (
     SpatialMapZ3Solver, parse_directional_claims_from_text,
-    extract_step2_claims, verify_spatialmap_step, format_spatialmap_feedback
+    extract_step2_claims, verify_spatialmap_step, format_spatialmap_feedback,
+    parse_counting_question, parse_model_count_from_answer,
+    parse_direction_question, parse_object_question,
+    parse_model_boxed_answer,
+    get_possible_directions, get_consistent_object_options,
+    get_possible_count_range,
 )
 
-
-def _find_complete_boxed(text: str):
-    """Find a complete \\boxed{...} in text, handling nested braces.
-
-    Returns a match-like object with .start() and .end(), or None.
-    """
-    idx = 0
-    while idx < len(text):
-        pos = text.find(r'\boxed{', idx)
-        if pos == -1:
-            return None
-        brace_start = pos + len(r'\boxed{')
-        depth = 1
-        i = brace_start
-        while i < len(text) and depth > 0:
-            if text[i] == '{':
-                depth += 1
-            elif text[i] == '}':
-                depth -= 1
-            i += 1
-        if depth == 0:
-            content = text[brace_start:i - 1].strip()
-            if content:
-                class _BoxedMatch:
-                    def __init__(self, s, e):
-                        self._start, self._end = s, e
-                    def start(self):
-                        return self._start
-                    def end(self):
-                        return self._end
-                    def group(self, n=0):
-                        return text[self._start:self._end]
-                return _BoxedMatch(pos, i)
-        idx = pos + 1
-    return None
+logger = logging.getLogger(__name__)
 
 
 class StepVerifierGame24Monitor(VerifyMonitor):
@@ -331,7 +304,8 @@ def __init__(
         exit_pos: Tuple[int, int],
         max_corrections: int = 5,
         question_type: str = "right_turns",  # "right_turns", "total_turns", "relative_position"
-        async_execution: bool = True
+        async_execution: bool = True,
+        prompt: str = None,
     ):
         super().__init__(name)
         self.async_execution = async_execution
@@ -341,6 +315,7 @@ def __init__(
         self.exit_pos = exit_pos
         self.max_corrections = max_corrections
         self.question_type = question_type
+        self.prompt = prompt
 
     @staticmethod
     def detect_question_type(prompt: str) -> str:
@@ -411,7 +386,8 @@ def from_prompt(
             exit_pos=exit_pos,
             max_corrections=max_corrections,
             question_type=question_type,
-            async_execution=async_execution
+            async_execution=async_execution,
+            prompt=prompt,
         )
 
     def _count_feedback_blocks(self, text: str) -> int:
@@ -639,18 +615,72 @@ async def verify(self, chunk: str, token_index: int, event, event_info: dict):
         
         return chunk, feedback
 
+    def _verify_relative_position_answer(self, boxed_answer: str) -> Tuple[bool, Optional[str]]:
+        """Verify a relative-position boxed answer (A=Yes / B=No).
+
+        Parses the question from ``self.prompt`` to determine the asked
+        direction, computes the true relative direction of E from S,
+        and checks whether the model's Yes/No answer is correct.
+
+        Returns ``(is_correct, feedback_or_None)``.  Anything that cannot be
+        checked (no prompt, unknown letter or phrasing) is accepted as-is.
+        """
+        if self.prompt is None:
+            return True, None
+
+        model_yn = {"A": "Yes", "B": "No"}.get(boxed_answer.strip().upper())
+        if model_yn is None:
+            return True, None
+
+        m = re.search(
+            r'Is the exit \(E\)\s+(.*?)\s+(?:of\s+)?the starting point \(S\)',
+            self.prompt, re.IGNORECASE,
+        )
+        if not m:
+            return True, None
+
+        # Keep only the direction phrase: drop anything after a comma.
+        asked_raw = re.sub(r',.*', '', m.group(1).strip().lower()).strip()
+
+        direction_keywords = {
+            "directly to the left":   {"west"},
+            "directly to the right":  {"east"},
+            "directly above":         {"north"},
+            "directly below":         {"south"},
+            "to the top left":        {"northwest"},
+            "to the top right":       {"northeast"},
+            "to the bottom left":     {"southwest"},
+            "to the bottom right":    {"southeast"},
+        }
+        expected_dirs = direction_keywords.get(asked_raw)
+        if expected_dirs is None:
+            return True, None
+
+        actual = compute_relative_direction(self.start_pos, self.exit_pos)
+        expected_yn = "Yes" if actual in expected_dirs else "No"
+        if model_yn == expected_yn:
+            return True, None
+
+        # Quote the direction the QUESTION asks about; ``actual`` differs from it
+        # whenever the model wrongly answered "Yes" and would misstate the question.
+        asked_dir = "/".join(sorted(expected_dirs))
+        feedback = (
+            f"\n\n[VERIFIER FEEDBACK for relative position:\n"
+            f"  ✗ Your answer {boxed_answer} ({model_yn}) is incorrect.\n"
+            f"  IMPORTANT: In this task, \"{asked_raw}\" means the GENERAL "
+            f"COMPASS DIRECTION, NOT immediate adjacency. It asks whether E "
+            f"is in the {asked_dir} direction from S, regardless of distance or "
+            f"walls between them.]\n\n"
+        )
+        return False, feedback
+
     async def _verify_relative_position(self, chunk: str, token_index: int, event, event_info: dict):
         """
         Verify relative position answer.
         
-        For relative_position questions (Yes/No format), we only verify:
+        For relative_position questions (Yes/No format), we verify:
         1. The LOCATE section (S and E positions are correctly identified)
-        
-        We do NOT verify the final Yes/No answer because:
-        - The question asks "Is E to the [direction] of S?" 
-        - We don't have the question text here to know what direction was asked
-        - The comparison logic (row/col arithmetic) is straightforward
-        - If LOCATE is correct, the model should get the answer right
+        2. The boxed answer (A=Yes / B=No) against the computed direction
         """
         # Check LOCATE section for correct S and E positions
         locate_valid, locate_errors, locate_found = self._check_locate_section(chunk)
@@ -665,8 +695,27 @@ async def _verify_relative_position(self, chunk: str, token_index: int, event, e
                 event.set()
             return chunk, feedback
         
-        # For relative_position, we don't verify the final Yes/No answer
-        # Just let it complete once LOCATE is verified
+        # Check for boxed answer and verify it
+        if '</think>' in chunk:
+            text_after_think = chunk.split("</think>")[-1]
+            boxed_match = find_complete_boxed(text_after_think)
+            if boxed_match:
+                boxed_text = text_after_think[boxed_match.start():boxed_match.end()]
+                # Extract the letter from \boxed{X}
+                inner = re.search(r'\\boxed\{([^}]*)\}', boxed_text)
+                if inner:
+                    boxed_answer = inner.group(1).strip()
+                    is_correct, rp_feedback = self._verify_relative_position_answer(boxed_answer)
+                    if not is_correct and rp_feedback:
+                        if not event.is_set():
+                            event_info["generated_text"] = chunk
+                            event_info["feedback"] = rp_feedback
+                            event_info["correction_index"] = token_index
+                            event_info["errors"] = [f"Wrong relative position answer: {boxed_answer}"]
+                            event_info["failed_step"] = None
+                            event.set()
+                        return chunk, rp_feedback
+
         return chunk, None
 
     async def fix(self, generated_text: str, event_info: dict, fix_method=None) -> str:
@@ -775,7 +824,7 @@ def _step_extractor_relative_position(
         3. LOCATE section is complete and analysis has started (verify LOCATE)
         """
         # Check for boxed answer first (highest priority)
-        boxed_match = _find_complete_boxed(text)
+        boxed_match = find_complete_boxed(text)
         if boxed_match:
             # Found answer, verify it (include full text up to boxed answer)
             end_pos = text_start_in_generated + boxed_match.end()
@@ -847,6 +896,42 @@ def __init__(
         
         # Track verified claims to avoid re-checking
         self.verified_claims: Set[Tuple[str, str, str]] = set()
+
+        # ---- question-type detection (consistent with ThinkingPhaseVerifier) ----
+        self._counting_question = parse_counting_question(problem_text)
+        self._counting_options: Dict[str, str] = {}
+        _opts_text = re.split(r'\nFirst,', problem_text, maxsplit=1)[0]
+        if self._counting_question:
+            raw_opts = re.findall(
+                r'([A-D])\.\s*(.+?)\s*(?=[A-D]\.|$)',
+                _opts_text, flags=re.DOTALL,
+            )
+            self._counting_options = {
+                k: v.strip().rstrip(".") for k, v in raw_opts
+            }
+
+        self._direction_question = parse_direction_question(problem_text)
+        self._object_question = parse_object_question(problem_text)
+
+        # Generic MCQ options (for direction & object Qs too)
+        if not self._counting_options:
+            raw_opts = re.findall(
+                r'([A-D])\.\s*(.+?)\s*(?=[A-D]\.|$)',
+                _opts_text, flags=re.DOTALL,
+            )
+            self._mcq_options: Dict[str, str] = {
+                k: v.strip().rstrip(".") for k, v in raw_opts
+            }
+        else:
+            self._mcq_options = dict(self._counting_options)
+
+        # Retry limits for final-answer verification
+        self._max_final_answer_retries = 3
+        self._direction_feedback_count = 0
+        self._object_feedback_count = 0
+        self._diag_count_feedback_count = 0
+        self._count_feedback_given = False
+        self._count_feedback_blocks_count = 0
     
     @classmethod
     def from_prompt(
@@ -897,8 +982,13 @@ def _extract_new_claims(self, chunk: str) -> List[Dict]:
         # Only look at text after the last feedback
         text_to_check = text_after_think[last_feedback_end:]
         
+        # Get full entity names from Z3 solver for abbreviation resolution
+        entity_names = list({
+            k[:-2] for k in self.z3_solver.entities if k.endswith('_x')
+        })
+        
         # Extract claims from STEP 2 in the latest attempt only
-        all_claims = extract_step2_claims(text_to_check)
+        all_claims = extract_step2_claims(text_to_check, entity_names=entity_names)
         
         # Filter to only new claims (not yet verified)
         new_claims = []
@@ -965,7 +1055,282 @@ async def verify(self, chunk: str, token_index: int, event, event_info: dict):
                 
                 return chunk, feedback
         
-        # All claims valid
+        # All claims valid — check for boxed answer (final answer verification)
+        if '</think>' in chunk:
+            text_after_think = chunk.split("</think>")[-1]
+            feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
+            last_feedback_end = 0
+            for match in feedback_pattern.finditer(text_after_think):
+                last_feedback_end = match.end()
+            recent_text = text_after_think[last_feedback_end:]
+
+            boxed_match = find_complete_boxed(recent_text)
+            if boxed_match:
+                # --- Direction-question verification ---
+                if (
+                    self._direction_question
+                    and num_corrections < self.max_corrections
+                    and self._direction_feedback_count < self._max_final_answer_retries
+                ):
+                    model_dir_text = parse_model_boxed_answer(
+                        recent_text, self._mcq_options
+                    )
+                    if model_dir_text:
+                        possible = get_possible_directions(
+                            self.z3_solver,
+                            self._direction_question["entity_a"],
+                            self._direction_question["entity_b"],
+                        )
+                        if model_dir_text not in possible:
+                            self._direction_feedback_count += 1
+                            valid_options = [
+                                letter for letter, val in self._mcq_options.items()
+                                if val.strip().lower().rstrip(".") in possible
+                            ]
+                            if len(valid_options) == 1:
+                                feedback = (
+                                    f"\n\n[VERIFIER FEEDBACK: Direction error!\n"
+                                    f"  '{model_dir_text.title()}' is "
+                                    f"impossible for "
+                                    f"{self._direction_question['entity_a']} "
+                                    f"relative to "
+                                    f"{self._direction_question['entity_b']} "
+                                    f"based on the given constraints.]\n\n"
+                                    f">>> STEP 3: ANSWER\n"
+                                )
+                            else:
+                                feedback = (
+                                    f"\n\n[VERIFIER FEEDBACK: Direction error!\n"
+                                    f"  '{model_dir_text.title()}' is "
+                                    f"impossible for "
+                                    f"{self._direction_question['entity_a']} "
+                                    f"relative to "
+                                    f"{self._direction_question['entity_b']} "
+                                    f"based on the given constraints.\n"
+                                    f"  Please reconsider and choose the "
+                                    f"correct option.]\n\n"
+                                    f">>> STEP 3: ANSWER\n"
+                                )
+                            if not event.is_set():
+                                event_info["generated_text"] = chunk
+                                event_info["feedback"] = feedback
+                                event_info["correction_index"] = token_index
+                                event_info["errors"] = [
+                                    f"Direction '{model_dir_text}' impossible; "
+                                    f"possible: {possible}"
+                                ]
+                                event_info["failed_step"] = None
+                                event.set()
+                            return chunk, feedback
+
+                # --- Object-question verification ---
+                if (
+                    self._object_question
+                    and num_corrections < self.max_corrections
+                    and self._object_feedback_count < self._max_final_answer_retries
+                ):
+                    model_obj_text = parse_model_boxed_answer(
+                        recent_text, self._mcq_options
+                    )
+                    boxed_raw = re.findall(
+                        r'\\boxed\{([^}]*)\}', recent_text
+                    )
+                    model_letter = (
+                        boxed_raw[-1].strip().upper() if boxed_raw else None
+                    )
+
+                    if model_letter:
+                        consistent = get_consistent_object_options(
+                            self.z3_solver,
+                            self._object_question["direction"],
+                            self._object_question["reference"],
+                            self._mcq_options,
+                        )
+                        if model_letter not in consistent:
+                            self._object_feedback_count += 1
+                            odir = self._object_question["direction"]
+                            oref = self._object_question["reference"]
+                            if len(consistent) == 1:
+                                correct_name = self._mcq_options.get(
+                                    consistent[0], consistent[0]
+                                )
+                                feedback = (
+                                    f"\n\n[VERIFIER FEEDBACK: Object error!\n"
+                                    f"  '{model_obj_text}' cannot be "
+                                    f"{odir} of {oref} based on the "
+                                    f"given constraints.\n"
+                                    f"  The only consistent option is "
+                                    f"{consistent[0]}. {correct_name}.\n"
+                                    f"  Please select option "
+                                    f"{consistent[0]}.]\n\n"
+                                    f">>> STEP 3: ANSWER\n"
+                                )
+                            else:
+                                valid_names = [
+                                    f"{l}. {self._mcq_options.get(l, l)}"
+                                    for l in consistent
+                                ]
+                                feedback = (
+                                    f"\n\n[VERIFIER FEEDBACK: Object error!\n"
+                                    f"  '{model_obj_text}' cannot be "
+                                    f"{odir} of {oref} based on the "
+                                    f"given constraints.\n"
+                                    f"  The consistent options are: "
+                                    f"{', '.join(valid_names)}.\n"
+                                    f"  Please reconsider and choose the "
+                                    f"correct option.]\n\n"
+                                    f">>> STEP 3: ANSWER\n"
+                                )
+                            if not event.is_set():
+                                event_info["generated_text"] = chunk
+                                event_info["feedback"] = feedback
+                                event_info["correction_index"] = token_index
+                                event_info["errors"] = [
+                                    f"Object '{model_obj_text}' impossible "
+                                    f"in {odir} of {oref}; "
+                                    f"consistent: {consistent}"
+                                ]
+                                event_info["failed_step"] = None
+                                event.set()
+                            return chunk, feedback
+
+                # --- Counting-question verification ---
+                if (
+                    self._counting_question
+                    and num_corrections < self.max_corrections
+                ):
+                    direction = self._counting_question["direction"]
+                    reference = self._counting_question["reference"]
+                    is_cardinal = direction in (
+                        "north", "south", "east", "west"
+                    )
+
+                    if is_cardinal:
+                        model_count = parse_model_count_from_answer(
+                            recent_text, self._counting_options
+                        )
+                        z3_count = 0
+
+                        if (
+                            model_count is not None
+                            and model_count != z3_count
+                        ):
+                            self._count_feedback_given = True
+                            self._count_feedback_blocks_count += 1
+
+                            # Diagonals adjacent to each cardinal direction.
+                            diag_examples = {
+                                "north": "northeast or northwest",
+                                "south": "southeast or southwest",
+                                "west": "northwest or southwest",
+                            }.get(direction, "northeast or southeast")
+
+                            feedback = (
+                                f"\n\n[VERIFIER FEEDBACK: Count mismatch!\n"
+                                f"  You answered {model_count} objects "
+                                f"'{direction}' of {reference}, but this "
+                                f"count is incorrect.\n"
+                                f"  IMPORTANT: '{direction}' is a strict "
+                                f"cardinal direction — it means ONLY "
+                                f"exactly {direction}, NOT {diag_examples}."
+                                f"\n"
+                                f"  An object that is {diag_examples.split(' or ')[0]} of "
+                                f"{reference} is NOT {direction} of "
+                                f"{reference}.\n"
+                                f"  Re-examine each object: is it described "
+                                f"as being strictly '{direction} of' "
+                                f"{reference}, or is the relationship "
+                                f"actually a diagonal direction like "
+                                f"{diag_examples}? Only count objects that "
+                                f"are strictly {direction}.]\n\n"
+                                f">>> STEP 3: ANSWER\n"
+                            )
+
+                            if not event.is_set():
+                                event_info["generated_text"] = chunk
+                                event_info["feedback"] = feedback
+                                event_info["correction_index"] = token_index
+                                event_info["errors"] = [
+                                    f"Cardinal count mismatch: expected 0, "
+                                    f"got {model_count}"
+                                ]
+                                event_info["failed_step"] = None
+                                event.set()
+                            return chunk, feedback
+
+                    else:
+                        if self._diag_count_feedback_count < self._max_final_answer_retries:
+                            model_count = parse_model_count_from_answer(
+                                recent_text, self._counting_options
+                            )
+                            count_range = get_possible_count_range(
+                                self.z3_solver, reference, direction
+                            )
+
+                            if (
+                                model_count is not None
+                                and count_range is not None
+                            ):
+                                min_c, max_c = count_range
+
+                                if not (min_c <= model_count <= max_c):
+                                    self._diag_count_feedback_count += 1
+                                    valid_opts = []
+                                    for opt, val in (
+                                        self._counting_options.items()
+                                    ):
+                                        try:
+                                            v = int(val)
+                                            if min_c <= v <= max_c:
+                                                valid_opts.append(
+                                                    (opt, v)
+                                                )
+                                        except (ValueError, TypeError):
+                                            pass
+
+                                    if len(valid_opts) == 1:
+                                        feedback = (
+                                            f"\n\n[VERIFIER FEEDBACK: "
+                                            f"Count error!\n"
+                                            f"  {model_count} objects "
+                                            f"'{direction}' of {reference}"
+                                            f" is impossible.\n"
+                                            f"  The valid count is "
+                                            f"{valid_opts[0][1]}.\n"
+                                            f"  Please select option "
+                                            f"{valid_opts[0][0]}.]\n\n"
+                                            f">>> STEP 3: ANSWER\n"
+                                        )
+                                    else:
+                                        feedback = (
+                                            f"\n\n[VERIFIER FEEDBACK: "
+                                            f"Count error!\n"
+                                            f"  {model_count} objects "
+                                            f"'{direction}' of {reference}"
+                                            f" is impossible.\n"
+                                            f"  The possible count range "
+                                            f"is [{min_c}, {max_c}].\n"
+                                            f"  Please reconsider and "
+                                            f"choose the correct "
+                                            f"option.]\n\n"
+                                            f">>> STEP 3: ANSWER\n"
+                                        )
+
+                                    if not event.is_set():
+                                        event_info["generated_text"] = chunk
+                                        event_info["feedback"] = feedback
+                                        event_info["correction_index"] = (
+                                            token_index
+                                        )
+                                        event_info["errors"] = [
+                                            f"Diagonal count "
+                                            f"{model_count} outside "
+                                            f"range [{min_c}, {max_c}]"
+                                        ]
+                                        event_info["failed_step"] = None
+                                        event.set()
+                                    return chunk, feedback
+
         return chunk, None
 
     async def fix(self, generated_text: str, event_info: dict, fix_method=None) -> str:
@@ -1025,12 +1390,16 @@ def step_extractor(self, chunk: str, generated_text: str) -> Tuple[bool, Optiona
                     return True, generated_text[:end_pos]
         
         # Check for boxed answer (trigger final verification)
-        boxed_match = _find_complete_boxed(text)
+        boxed_match = find_complete_boxed(text)
         if boxed_match:
             # Verify any remaining claims before final answer
             new_claims = self._extract_new_claims(generated_text)
             if new_claims:
                 end_pos = text_start_in_generated + boxed_match.end()
                 return True, generated_text[:end_pos]
+            # Even if no new claims, boxed answer signals completion —
+            # trigger to allow final answer verification (direction/object/counting)
+            end_pos = text_start_in_generated + boxed_match.end()
+            return True, generated_text[:end_pos]
         
         return False, None
\ No newline at end of file
diff --git a/interwhen/monitors/thinkingPhaseVerifier.py b/interwhen/monitors/thinkingPhaseVerifier.py
deleted file mode 100644
index 76592f86..00000000
--- a/interwhen/monitors/thinkingPhaseVerifier.py
+++ /dev/null
@@ -1,2658 +0,0 @@
-"""
-Thinking Phase Verifiers for Game of 24 and Maze.
-
-These monitors verify solutions by forking a side-stream during the thinking
-phase to ask the model about its current progress.
-
-Game of 24 Workflow
--------------------
-A) **DURING the thinking phase** (inside ``<think>...</think>``):
-   After a warmup period, every *N* double-newlines in the thinking trace:
-   1. Inject ``</think> The expression that I found till now is {`` and
-      stream ~20 tokens to extract the expression the model outputs.
-   2. Verify the expression:
-      a. Extract numbers used in the expression.
-      b. Check each number appears in the original numbers (at most once).
-      c. If ALL four numbers are used: evaluate and check == 24.
-      d. If partial: evaluate the sub-expression, collect unused original
-         numbers, check ``can_reach_24([result] + unused)``.
-   3. If **wrong** -> strip the injected text, append
-      ``Wait, <error description>.`` inside the thinking trace and let
-      the model keep thinking.
-   4. If **correct AND complete** (all 4 numbers, equals 24) -> inject
-      ``Wait, current expression that I am able to generate seems to be
-      passed by the verifier, so let me stop and give the answer.
-      </think>`` and then let the model output the final answer.
-   5. If **correct AND partial** -> no feedback, let the model keep
-      thinking undisturbed.
-
-B) **AFTER a natural ``</think>``**:
-   Inject the same expression extraction prompt so the model outputs its
-   answer expression, then verify in the same way.  Give feedback on
-   errors so the model retries.
-
-Maze Workflow
--------------
-A) **DURING the thinking phase** (inside ``<think>...</think>``):
-   After a warmup period, every *N* double-newlines in the thinking trace:
-   1. Inject a first-person prompt in the LLM's own voice:
-      ``Let me output the current steps I have traced so far through
-      the maze in the following format:`` + ``<format>...</format>``
-      + ``>>> LOCATE START AND EXIT:``.  Stream ~300 tokens to
-      extract the model's current traced path steps.
-   2. Parse the structured steps and verify each against the maze grid:
-      a. Is the move direction correct (delta matches)?
-      b. Is from_pos the expected position?
-      c. Is to_pos walkable (not a wall)?
-      d. Is the turn type correct?
-      e. Are running counts correct?
-   3. If **errors found** -> strip the injected text, append
-      ``Wait, <error description>.`` and let the model keep thinking.
-   4. If **path reaches E with all steps correct** -> inject early-stop
-      message + ``</think>`` followed by the structured format prompt
-      so the model gives the final answer in the specified format.
-   5. If **partial but correct so far** -> no feedback, keep thinking.
-
-B) **AFTER ``</think>`` (natural or early-stop)**:
-   Phase 2a: Inject the same structured step format template (in the
-   LLM's own voice: ``Let me trace the step by step solution...`` +
-   ``<format>...</format>`` + ``>>> LOCATE START AND EXIT:``) so the
-   model fills it in.
-
-   Phase 2b: Verify each step as the model fills in the template.
-   Once ``\\boxed{}`` appears, stop generation.
-"""
-
-import re
-import json
-import logging
-from typing import Dict, List, Set, Tuple, Optional
-from copy import deepcopy
-
-import httpx
-
-from .base import VerifyMonitor
-from ..utils.game24_verifier import (
-    can_reach_24, is_close, format_number, safe_eval,
-)
-from ..utils.maze_verifier import (
-    Direction, parse_direction, get_expected_turn_type,
-    parse_maze_from_prompt, parse_maze_step, verify_maze_step,
-    verify_locate_section, format_maze_feedback, format_locate_feedback,
-    DIRECTION_DELTAS, compute_relative_direction,
-)
-from ..utils.spatialmap_verifier import (
-    SpatialMapZ3Solver, extract_step2_claims,
-    parse_directional_claims_from_text,
-    parse_counting_question, parse_model_count_from_answer,
-    parse_direction_question, parse_object_question,
-    parse_model_boxed_answer,
-    get_possible_directions, get_consistent_object_options,
-    get_possible_count_range,
-    verify_spatialmap_step, format_spatialmap_feedback,
-)
-
-logger = logging.getLogger(__name__)
-
-# ---------------------------------------------------------------------------
-#  Prompts injected to elicit an expression from the model.
-# ---------------------------------------------------------------------------
-
-# Injected during the thinking phase (after </think>)
-THINKING_PHASE_EXPRESSION_PROMPT = (
-    "</think>\nThe expression that I found till now is {"
-)
-
-# Injected after a natural </think> to force the model to emit \boxed{expr}
-FINAL_EXPRESSION_PROMPT = (
-    "\nThe final expression is \\boxed"
-)
-
-
-def _find_complete_boxed(text: str) -> Optional[re.Match]:
-    """Find a complete \\boxed{...} in text, handling nested braces.
-
-    Unlike ``re.search(r'\\boxed\{[^}]+\}', text)`` this correctly
-    handles LaTeX like ``\\boxed{12\\frac{1}{2}}`` where the naive
-    ``[^}]+`` pattern would stop at the first ``}``.
-
-    Returns an ``re.Match``-like object with ``.start()`` and ``.end()``
-    spanning the full ``\\boxed{...}`` (including the outer braces),
-    or ``None`` if no complete boxed expression is found.
-    """
-    idx = 0
-    while idx < len(text):
-        pos = text.find(r'\boxed{', idx)
-        if pos == -1:
-            return None
-        # Start counting braces from after '\boxed{'
-        brace_start = pos + len(r'\boxed{')
-        depth = 1
-        i = brace_start
-        while i < len(text) and depth > 0:
-            if text[i] == '{':
-                depth += 1
-            elif text[i] == '}':
-                depth -= 1
-            i += 1
-        if depth == 0:
-            # Build a simple match-like object
-            match_start = pos
-            match_end = i  # i is right after the closing '}'
-            # Check the content is non-empty
-            content = text[brace_start:i - 1].strip()
-            if content:
-                class _BoxedMatch:
-                    def __init__(self, s, e):
-                        self._start, self._end = s, e
-                    def start(self):
-                        return self._start
-                    def end(self):
-                        return self._end
-                    def group(self, n=0):
-                        return text[self._start:self._end]
-                return _BoxedMatch(match_start, match_end)
-        # Couldn't close braces from this position, try next occurrence
-        idx = pos + 1
-    return None
-
-
-def _extract_numbers_from_expr(expr: str) -> List[float]:
-    """Extract all numbers (integers and decimals) from an expression string."""
-    numbers = re.findall(r'\d+\.?\d*', expr)
-    return [int(float(n)) if float(n) == int(float(n)) else float(n) for n in numbers]
-
-
-def _normalize_number(n) -> float:
-    """Normalize a number for comparison."""
-    return float(n)
-
-
-def verify_expression(expr_str: str, original_numbers: List[float]) -> Tuple[str, bool, List[str], Optional[List[float]]]:
-    """
-    Verify an expression against the Game of 24 rules.
-
-    Args:
-        expr_str: The arithmetic expression string (e.g. "1*2", "(3+5)*7/11")
-        original_numbers: The four original numbers.
-
-    Returns:
-        (status, is_valid, errors, unused_numbers_or_None)
-        - status: "complete" | "partial" | "error"
-        - is_valid: True if the expression is valid (no errors)
-        - errors: List of error messages
-        - unused_numbers: Numbers from original not used in expr (None if errors)
-    """
-    errors = []
-    fmt = format_number
-
-    # 1. Extract numbers used in the expression
-    used_numbers = _extract_numbers_from_expr(expr_str)
-    if not used_numbers:
-        errors.append(f"No numbers found in expression: {expr_str}")
-        return "error", False, errors, None
-
-    # 2. Check each used number appears in original (at most once)
-    original_copy = [_normalize_number(n) for n in original_numbers]
-    matched_indices = []
-    for used_n in used_numbers:
-        used_norm = _normalize_number(used_n)
-        found = False
-        for i, orig_n in enumerate(original_copy):
-            if i not in matched_indices and is_close(used_norm, orig_n):
-                matched_indices.append(i)
-                found = True
-                break
-        if not found:
-            errors.append(
-                f"Number {fmt(used_norm)} in expression is not available in "
-                f"original numbers {[fmt(n) for n in original_numbers]} "
-                f"(or was already used)"
-            )
-
-    if errors:
-        return "error", False, errors, None
-
-    # 3. Compute unused original numbers
-    unused = [original_copy[i] for i in range(len(original_copy)) if i not in matched_indices]
-
-    # 4. Evaluate the expression
-    try:
-        value = eval(expr_str, {"__builtins__": None}, {})
-        value = float(value)
-    except Exception as e:
-        errors.append(f"Cannot evaluate expression '{expr_str}': {e}")
-        return "error", False, errors, None
-
-    # 5. Check based on whether all numbers are used
-    all_used = len(unused) == 0
-
-    if all_used:
-        # Full expression: must equal 24
-        if not is_close(value, 24):
-            errors.append(
-                f"Expression '{expr_str}' evaluates to {fmt(value)}, not 24."
-            )
-            return "error", False, errors, None
-        # Valid complete solution!
-        return "complete", True, [], []
-    else:
-        # Partial expression: check if remaining numbers + result can reach 24
-        remaining = [value] + unused
-        can_reach, example = can_reach_24(remaining)
-        if not can_reach:
-            remaining_str = [fmt(n) for n in remaining]
-            errors.append(
-                f"Expression '{expr_str}' evaluates to {fmt(value)}. "
-                f"Remaining numbers (including result) are {remaining_str}. "
-                f"Cannot reach 24 from these numbers. This is a dead end."
-            )
-            return "error", False, errors, None
-        # Partial but reachable -- valid
-        return "partial", True, [], unused
-
-
-class ThinkingPhaseStepVerifierGame24Monitor(VerifyMonitor):
-    """
-    Monitor that verifies Game-of-24 expressions during and after thinking.
-
-    During thinking: every N double-newlines (after warmup) -> fork a
-        side-stream asking for the current expression, verify it, and
-        give appropriate feedback.
-
-    After natural ``</think>``: inject expression prompt, verify the
-        final answer.
-    """
-
-    def __init__(
-        self,
-        name: str,
-        original_numbers: List[int],
-        llm_server: dict,
-        prompt: str,
-        newline_threshold: int = 15,
-        max_corrections: int = 5,
-        answer_start_token: str = "</think>",
-        async_execution: bool = True,
-        warmup_newlines: int = 0,
-    ):
-        super().__init__(name)
-        self.original_numbers = [float(x) for x in original_numbers]
-        self.llm_server = llm_server
-        self.prompt = prompt
-        self.newline_threshold = newline_threshold
-        self.max_corrections = max_corrections
-        self.answer_start_token = answer_start_token
-        self.async_execution = async_execution
-        self.warmup_newlines = warmup_newlines
-
-        # ---- state ----
-        self._think_phase_corrections = 0
-        self._verified_expression = None  # set by Phase 1 early-stop
-
-    # ------------------------------------------------------------------
-    #  helpers
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _fmt(n: float) -> str:
-        if abs(n - round(n)) < 1e-9:
-            return str(int(round(n)))
-        return f"{n:.4f}".rstrip('0').rstrip('.')
-
-    def _count_feedback_blocks(self, text: str) -> int:
-        return len(re.findall(r'\[VERIFIER FEEDBACK[^\]]*\]', text))
-
-    def _is_in_thinking_phase(self, generated_text: str) -> bool:
-        return self.answer_start_token not in generated_text
-
-    @staticmethod
-    def _extract_braced_expression(text: str) -> Optional[str]:
-        """Extract the first expression wrapped in { } from *text*.
-
-        Handles nested braces so that e.g. ``{(3+5)*7}`` is extracted correctly.
-        """
-        start = text.find('{')
-        if start == -1:
-            return None
-        brace_count = 0
-        end = start
-        while end < len(text):
-            if text[end] == '{':
-                brace_count += 1
-            elif text[end] == '}':
-                brace_count -= 1
-                if brace_count == 0:
-                    break
-            end += 1
-        if brace_count != 0:
-            return None
-        expr = text[start + 1:end].strip()
-        if not expr:
-            return None
-        # Basic cleanup: remove LaTeX
-        expr = expr.replace(r'\times', '*').replace(r'\cdot', '*').replace(r'\div', '/')
-        expr = expr.replace(r'\,', '').replace(r'\ ', '')
-        expr = expr.replace(r'\left', '').replace(r'\right', '')
-        # Replace Unicode math operators (QwQ frequently uses these)
-        expr = expr.replace('\u00d7', '*').replace('\u00f7', '/').replace('\u2212', '-')
-        expr = expr.replace('\u2013', '-').replace('\u2014', '-')  # en-dash, em-dash
-        frac_pattern = r"\\frac\{([^{}]+)\}\{([^{}]+)\}"
-        while re.search(frac_pattern, expr):
-            expr = re.sub(frac_pattern, r"(\1/\2)", expr)
-        # Handle implicit multiplication
-        expr = re.sub(r'\)\s*\(', ')*(', expr)
-        expr = re.sub(r'\)\s*(\d)', r')*\1', expr)
-        expr = re.sub(r'(\d)\s*\(', r'\1*(', expr)
-        return expr
-
-    @staticmethod
-    def _extract_boxed_expression(text: str) -> Optional[str]:
-        """Extract expression from \\boxed{...} in text."""
-        boxed_pattern = r"\\boxed\{"
-        matches = list(re.finditer(boxed_pattern, text))
-        if not matches:
-            return None
-        last_match = matches[-1]
-        start = last_match.end()
-        brace_count = 1
-        end = start
-        while end < len(text) and brace_count > 0:
-            if text[end] == '{':
-                brace_count += 1
-            elif text[end] == '}':
-                brace_count -= 1
-            end += 1
-        expr = text[start:end - 1].strip()
-        expr = expr.replace(r'\times', '*').replace(r'\cdot', '*').replace(r'\div', '/')
-        expr = expr.replace(r'\,', '').replace(r'\ ', '')
-        expr = expr.replace(r'\left', '').replace(r'\right', '')
-        # Replace Unicode math operators (QwQ frequently uses these)
-        expr = expr.replace('\u00d7', '*').replace('\u00f7', '/').replace('\u2212', '-')
-        expr = expr.replace('\u2013', '-').replace('\u2014', '-')  # en-dash, em-dash
-        frac_pattern = r"\\frac\{([^{}]+)\}\{([^{}]+)\}"
-        while re.search(frac_pattern, expr):
-            expr = re.sub(frac_pattern, r"(\1/\2)", expr)
-        expr = re.sub(r'\)\s*\(', ')*(', expr)
-        expr = re.sub(r'\)\s*(\d)', r')*\1', expr)
-        expr = re.sub(r'(\d)\s*\(', r'\1*(', expr)
-        return expr
-
-    # ------------------------------------------------------------------
-    #  _side_stream_expression  (streams ~20 tokens to get {expr})
-    # ------------------------------------------------------------------
-    async def _side_stream_expression(self, text_so_far: str, max_new_tokens: int = 20) -> Optional[str]:
-        """
-        Send ``prompt + text_so_far`` to vLLM, stream at most
-        *max_new_tokens* tokens, and try to extract an expression from
-        the output that appears inside ``{ }``.
-
-        ``text_so_far`` is expected to end with something like
-        ``</think>\\nThe expression that I found till now is {``
-        so the model just needs to output the expression body and ``}``.
-
-        Returns the extracted expression string, or None.
-        """
-        fmt = self._fmt
-        nums_str = ", ".join(fmt(n) for n in self.original_numbers)
-        logger.info(
-            f"[Side-stream] Starting expression extraction\n"
-            f"  Original numbers : [{nums_str}]\n"
-            f"  Max new tokens   : {max_new_tokens}"
-        )
-
-        payload = deepcopy(self.llm_server["payload"])
-        payload["prompt"] = self.prompt + text_so_far
-        payload["max_tokens"] = max_new_tokens
-        # We don't need logprobs for the side-stream
-        payload.pop("logprobs", None)
-
-        generated = ""
-
-        async with httpx.AsyncClient(timeout=None) as client:
-            async with client.stream(
-                "POST",
-                self.llm_server["url"],
-                headers=self.llm_server["headers"],
-                json=payload,
-            ) as response:
-                async for line in response.aiter_lines():
-                    if line.startswith("data: "):
-                        data = line[len("data: "):].strip()
-                        if data == "[DONE]":
-                            break
-                        chunk = json.loads(data)["choices"][0]["text"]
-                        generated += chunk
-                        logger.debug(f"[Side-stream] chunk: {chunk!r}")
-
-                        # As soon as we see '}', we have the expression
-                        if '}' in generated:
-                            break
-
-        # The model was prompted with "{ " so its output completes the brace.
-        # We wrap it back so _extract_braced_expression can parse it.
-        full_text = "{" + generated
-        expr = self._extract_braced_expression(full_text)
-        if expr:
-            logger.info(f"[Side-stream] Extracted expression: {expr}")
-        else:
-            logger.info(
-                f"[Side-stream] No expression found in side-stream "
-                f"(generated {len(generated)} chars: {generated!r})"
-            )
-        return expr
-
-    # ------------------------------------------------------------------
-    #  step_extractor -- decides WHEN to trigger verification
-    # ------------------------------------------------------------------
-    def step_extractor(self, chunk: str, generated_text: str):
-        """
-        Phase 1 (thinking): trigger when total double-newlines cross the
-            next multiple of ``newline_threshold`` (after warmup).
-        Phase 2 (after </think>): trigger to inject the expression prompt,
-            or when a ``{expression}`` or ``\\boxed{expression}`` appears.
-        """
-        # ===== PHASE 1: still inside <think> =====
-        if self._is_in_thinking_phase(generated_text):
-            if self._think_phase_corrections >= self.max_corrections:
-                return False, None
-
-            total_double_newlines = generated_text.count('\n\n')
-
-            # Skip until warmup period is reached
-            if total_double_newlines < self.warmup_newlines:
-                return False, None
-
-            # After warmup, trigger at every newline_threshold multiple
-            past_warmup = total_double_newlines - self.warmup_newlines
-            if (generated_text.endswith('\n\n')
-                    and past_warmup >= 0
-                    and past_warmup % self.newline_threshold == 0):
-                logger.info(
-                    f"[step_extractor] Phase 1 trigger: \\n\\n count={total_double_newlines} "
-                    f"(warmup={self.warmup_newlines}, past_warmup={past_warmup}, "
-                    f"threshold={self.newline_threshold})"
-                )
-                return True, generated_text
-
-            return False, None
-
-        # ===== PHASE 2: after </think> =====
-
-        # 2a: </think> present but we haven't injected the expression prompt yet
-        if FINAL_EXPRESSION_PROMPT.strip() not in generated_text:
-            logger.info(
-                "[step_extractor] Phase 2a: </think> detected, "
-                "expression prompt not yet injected."
-            )
-            return True, generated_text
-
-        # 2b: expression prompt was injected (ends with "\boxed").
-        #     The model should complete it with "{expression}".
-        #     Trigger once we see a complete \boxed{...} (with closing brace).
-        think_end_pos = generated_text.find(self.answer_start_token) + len(self.answer_start_token)
-        text_after_think = generated_text[think_end_pos:]
-
-        # Look past any previous feedback blocks
-        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
-        last_feedback_end = 0
-        for match in feedback_pattern.finditer(text_after_think):
-            last_feedback_end = match.end()
-        text = text_after_think[last_feedback_end:]
-
-        has_boxed = _find_complete_boxed(text)
-        if has_boxed:
-            return True, generated_text
-
-        return False, None
-
-    # ------------------------------------------------------------------
-    #  verify
-    # ------------------------------------------------------------------
-    async def verify(self, step: str, token_index: int, event, event_info):
-        """
-        Case 1 -- still in thinking (no </think> in step):
-            Inject ``</think> The expression that I found till now is {``,
-            stream ~20 tokens, extract the expression, verify it.
-            - Error -> feedback ``Wait, <error>.``
-            - Correct & complete -> inject early-stop message
-            - Correct & partial -> do nothing, let model keep thinking
-
-        Case 2a -- natural </think> just appeared, expression prompt not
-            yet injected:
-            Signal fix() to append the expression prompt.
-
-        Case 2b -- after </think> + expression prompt already injected:
-            Verify the expression from the model's output.
-        """
-
-        # ==================================================================
-        # CASE 1: Thinking phase -- side-stream expression verification
-        # ==================================================================
-        if self.answer_start_token not in step:
-            total_dn = step.count('\n\n')
-            logger.info(
-                f"[Phase 1] Thinking-phase verification triggered\n"
-                f"  \\n\\n count  : {total_dn}\n"
-                f"  Thinking len : {len(step)} chars"
-            )
-
-            # Build text with injected prompt for expression extraction
-            text_with_prompt = step + "\n" + THINKING_PHASE_EXPRESSION_PROMPT
-
-            # Side-stream: get expression from the model (~20 tokens)
-            expr_str = await self._side_stream_expression(text_with_prompt, max_new_tokens=20)
-
-            if expr_str is None:
-                # Model didn't produce a parseable expression -- let it keep thinking
-                logger.info(
-                    "[Phase 1] No expression extracted from side-stream. "
-                    "Letting model continue thinking."
-                )
-                return step, None
-
-            # Verify the extracted expression
-            status, is_valid, errors, unused = verify_expression(
-                expr_str, self.original_numbers
-            )
-
-            if not is_valid:
-                # ---- WRONG: inject error feedback into thinking trace ----
-                error_summary = "; ".join(errors)
-                self._think_phase_corrections += 1
-                logger.info(
-                    f"[Phase 1] INVALID expression '{expr_str}'\n"
-                    f"  Error(s) : {error_summary}\n"
-                    f"  Action   : Inject feedback into thinking trace\n"
-                    f"  Corrections: {self._think_phase_corrections}/{self.max_corrections}"
-                )
-                thinking_feedback = (
-                    f"\n\nWait, the expression {expr_str} does not work. "
-                    f"{error_summary} "
-                    f"I must NOT reuse {expr_str} or any expression I have already tried. "
-                    f"Let me try a completely different combination of "
-                    f"operations and grouping of numbers.\n"
-                )
-                if not event.is_set():
-                    event_info["generated_text"] = step
-                    event_info["feedback"] = thinking_feedback
-                    event_info["correction_index"] = token_index
-                    event_info["errors"] = errors
-                    event_info["phase"] = "rollback_to_thinking"
-                    event.set()
-                return step, thinking_feedback
-
-            elif status == "complete":
-                # ---- CORRECT & COMPLETE: early-stop, push to answer ----
-                self._verified_expression = expr_str
-                logger.info(
-                    f"[Phase 1] VALID COMPLETE expression '{expr_str}' == 24\n"
-                    f"  Action: Inject early-stop message and transition to answer."
-                )
-                early_stop_msg = (
-                    f"\n\nWait, the expression {expr_str} has been verified "
-                    f"to equal 24 using all the given numbers. This will be "
-                    f"my final answer.\n{self.answer_start_token}\n"
-                )
-                if not event.is_set():
-                    event_info["generated_text"] = step
-                    event_info["feedback"] = early_stop_msg
-                    event_info["correction_index"] = token_index
-                    event_info["phase"] = "early_stop_answer"
-                    event_info["verified_expression"] = expr_str
-                    event.set()
-                return step, early_stop_msg
-
-            else:
-                # ---- CORRECT & PARTIAL: let model keep thinking ----
-                unused_str = (
-                    "[" + ", ".join(self._fmt(n) for n in unused) + "]"
-                    if unused else "[]"
-                )
-                logger.info(
-                    f"[Phase 1] VALID PARTIAL expression '{expr_str}'\n"
-                    f"  Unused numbers: {unused_str}\n"
-                    f"  Action: No error, let model keep thinking."
-                )
-                return step, None
-
-        # ==================================================================
-        # CASE 2a: </think> present but expression prompt not yet injected
-        # ==================================================================
-        if FINAL_EXPRESSION_PROMPT.strip() not in step:
-            logger.info(
-                "[Phase 2a] Natural </think> detected. "
-                "Injecting expression extraction prompt."
-            )
-            prompt_text = FINAL_EXPRESSION_PROMPT
-            if not event.is_set():
-                event_info["generated_text"] = step
-                event_info["feedback"] = prompt_text
-                event_info["correction_index"] = token_index
-                event_info["phase"] = "inject_expression_prompt"
-                event.set()
-            return step, prompt_text
-
-        # ==================================================================
-        # CASE 2b: After </think> + expression prompt -- verify final answer
-        # ==================================================================
-
-        # Max-corrections guard
-        num_corrections = self._count_feedback_blocks(step)
-        if num_corrections >= self.max_corrections:
-            fb = "\nthe answer is \\boxed{no solution}"
-            if not event.is_set():
-                event_info["generated_text"] = step
-                event_info["feedback"] = fb
-                event_info["correction_index"] = token_index
-                event_info["errors"] = ["Max corrections reached"]
-                event_info["phase"] = "standard_verify"
-                event.set()
-            return step, fb
-
-        # Extract expression from \boxed{...} — only look at text after
-        # the last feedback block to avoid re-extracting old expressions.
-        think_end_pos = step.find(self.answer_start_token) + len(self.answer_start_token)
-        text_after_think = step[think_end_pos:]
-        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
-        last_feedback_end = 0
-        for match in feedback_pattern.finditer(text_after_think):
-            last_feedback_end = match.end()
-        recent_text = text_after_think[last_feedback_end:]
-
-        expr_str = self._extract_boxed_expression(recent_text)
-        if expr_str is not None:
-            logger.info(f"[Phase 2b] Extracted expression from \\boxed: '{expr_str}'")
-
-        if expr_str is None:
-            return step, None
-
-        # Verify the final expression (must use all 4 numbers and equal 24)
-        status, is_valid, errors, unused = verify_expression(
-            expr_str, self.original_numbers
-        )
-
-        if is_valid and status == "complete":
-            logger.info(f"[Phase 2b] Final expression '{expr_str}' is correct (= 24)")
-            # Signal STOP so the model doesn't keep generating
-            if not event.is_set():
-                event_info["generated_text"] = step
-                event_info["feedback"] = ""  # nothing to append
-                event_info["correction_index"] = token_index
-                event_info["phase"] = "final_answer_correct"
-                event_info["verified_expression"] = expr_str
-                event.set()
-            return step, None
-
-        # Build error messages for partial/wrong answers in phase 2
-        if is_valid and status == "partial":
-            # In phase 2 (after </think>) we need ALL numbers used
-            used_numbers = _extract_numbers_from_expr(expr_str)
-            errors = [
-                f"Expression '{expr_str}' only uses {len(used_numbers)} of "
-                f"{len(self.original_numbers)} numbers. After </think>, "
-                f"a COMPLETE expression using ALL numbers is required."
-            ]
-
-        if not errors:
-            errors = [f"Expression '{expr_str}' is not a valid solution."]
-
-        error_summary = "; ".join(errors)
-        logger.info(f"[Phase 2b] Final expression FAILED: {error_summary}")
-
-        orig_display = [int(n) if n == int(n) else n for n in self.original_numbers]
-        nums_str = ", ".join(str(n) for n in orig_display)
-        feedback = (
-            f"\n[VERIFIER FEEDBACK:\n"
-            f"  The expression {expr_str} is incorrect. {error_summary}\n"
-            f"  Do NOT reuse {expr_str} or any previously tried expression.\n"
-            f"  Try a completely different approach. Use ALL four numbers "
-            f"{nums_str} exactly once, "
-            f"evaluating to 24. Wrap in \\boxed{{}}. ]\n"
-        )
-        if not event.is_set():
-            event_info["generated_text"] = step
-            event_info["feedback"] = feedback
-            event_info["correction_index"] = token_index
-            event_info["errors"] = errors
-            event_info["phase"] = "standard_verify"
-            event.set()
-        return step, feedback
-
-    # ------------------------------------------------------------------
-    #  fix
-    # ------------------------------------------------------------------
-    async def fix(self, generated_text: str, event_info: dict, fix_method=None):
-        """
-        Applies the appropriate fix depending on the phase:
-
-        - ``rollback_to_thinking``: Append error feedback into thinking trace.
-        - ``early_stop_answer``: Append early-stop message + </think> to
-          transition the model to answer generation.
-        - ``inject_expression_prompt``: Append expression prompt after </think>.
-        - ``standard_verify``: Append [VERIFIER FEEDBACK ...].
-        """
-        phase = event_info.get("phase", "standard_verify")
-
-        if phase == "rollback_to_thinking":
-            base_text = event_info["generated_text"]
-            result = base_text.rstrip() + event_info["feedback"]
-            logger.info(
-                f"[fix] Phase: rollback_to_thinking\n"
-                f"  -> Appended error feedback into <think> trace.\n"
-                f"  -> Think-phase corrections: {self._think_phase_corrections}/{self.max_corrections}"
-            )
-            return result
-
-        if phase == "early_stop_answer":
-            base_text = event_info["generated_text"]
-            result = base_text.rstrip() + event_info["feedback"]
-            logger.info(
-                f"[fix] Phase: early_stop_answer\n"
-                f"  -> Verified expression passed. Injecting early-stop + </think>.\n"
-                f"  -> Model will now generate the final answer."
-            )
-            return result
-
-        if phase == "final_answer_correct":
-            expr = event_info.get("verified_expression", "?")
-            logger.info(
-                f"[fix] Phase: final_answer_correct\n"
-                f"  -> Final expression '{expr}' verified correct. Stopping generation."
-            )
-            return event_info["generated_text"]
-
-        if phase == "inject_expression_prompt":
-            logger.info(
-                f"[fix] Phase: inject_expression_prompt\n"
-                f"  -> Natural </think> detected.\n"
-                f"  -> Appending expression extraction prompt."
-            )
-            return event_info["generated_text"] + event_info["feedback"]
-
-        # standard_verify
-        errors = event_info.get("errors", [])
-        error_summary = "; ".join(errors) if errors else "unknown"
-        logger.info(
-            f"[fix] Phase: standard_verify\n"
-            f"  -> Expression failed: {error_summary}\n"
-            f"  -> Appending [VERIFIER FEEDBACK] so model retries."
-        )
-        return event_info["generated_text"] + event_info["feedback"]
-
-
-# =====================================================================
-#  Maze Thinking-Phase Prompts
-# =====================================================================
-
-
-def _build_maze_format_block(question_type: str) -> str:
-    """
-    Build the <format>...</format> block that describes the structured
-    output template.  Re-used by both the side-stream (Phase 1) and
-    the post-</think> injection (Phase 2a).
-    """
-    if question_type == "relative_position":
-        return (
-            "<format>\n"
-            ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):\n"
-            "    S position: (row, col)\n"
-            "    E position: (row, col)\n"
-            "\n"
-            ">>> COMPARE POSITIONS:\n"
-            "    Row comparison: E row (r) vs S row (r) → E is ABOVE/BELOW S\n"
-            "    Col comparison: E col (c) vs S col (c) → E is LEFT/RIGHT of S\n"
-            "\n"
-            ">>> FINAL ANSWER:\n"
-            "    \\boxed{LETTER}\n"
-            "</format>"
-        )
-    else:
-        count_line = "    Running count: Right=0, Left=0"
-        if question_type == "total_turns":
-            count_line = "    Running count: Right=0, Left=0, Total=0"
-
-        return (
-            "<format>\n"
-            ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):\n"
-            "    S position: (row, col)\n"
-            "    E position: (row, col)\n"
-            "\n"
-            ">>> STEP 1: Move DOWN from (r1, c1) to (r2, c2)\n"
-            "    Current position: (r2, c2)\n"
-            "    Previous direction: —\n"
-            "    Current direction: DOWN\n"
-            "    Turn type: STRAIGHT\n"
-            f"{count_line}\n"
-            "\n"
-            "[... continue for all steps until reaching E ...]\n"
-            "\n"
-            ">>> FINAL ANSWER:\n"
-            "    \\boxed{LETTER}\n"
-            "</format>"
-        )
-
-
-def _build_maze_thinking_phase_prompt(question_type: str) -> str:
-    """
-    Build the side-stream prompt injected during the thinking phase.
-
-    Written in the LLM's own first-person thinking voice so it blends
-    naturally with the ``<think>`` trace.  Includes the ``<format>``
-    block and the starting marker so the model begins filling in.
-    """
-    format_block = _build_maze_format_block(question_type)
-    return (
-        "\n\nLet me output the current steps I have traced so far "
-        "through the maze in the following format:\n"
-        f"{format_block}\n"
-        ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):\n"
-    )
-
-
-def _build_maze_structured_prompt(question_type: str) -> str:
-    """
-    Build the structured format prompt injected after </think>.
-
-    This is analogous to Game24's step format injection — it gives the
-    model a template to fill in so we can parse and verify each step.
-    Written in the LLM's own voice so it reads naturally.
-    """
-    format_block = _build_maze_format_block(question_type)
-    return (
-        "\nLet me trace the step by step solution through the maze "
-        "in the following format:\n"
-        f"{format_block}\n"
-        ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):\n"
-    )
-
-
-# =====================================================================
-#  ThinkingPhaseStepVerifierMazeMonitor
-# =====================================================================
-
-class ThinkingPhaseStepVerifierMazeMonitor(VerifyMonitor):
-    """
-    Monitor that verifies maze path-tracing during and after thinking.
-
-    **No meta-prompt required** — works with a plain user prompt containing
-    just the maze and question.  Structure is injected by this monitor
-    after ``</think>`` (natural or early-stop), exactly like Game24
-    injects its step format.
-
-    Phase 1 – During ``<think>...</think>``:
-        Every N double-newlines (after warmup), fork a side-stream that
-        injects ``</think>`` + a structured step prompt, stream ~300
-        tokens, parse and verify each step against the maze grid.
-
-    Phase 2a – ``</think>`` detected, structured prompt not yet injected:
-        Inject the structured step-by-step format template so the model
-        fills it in (LOCATE → STEPs → FINAL ANSWER → ``\\boxed{}``).
-
-    Phase 2b – Structured prompt injected, model is generating:
-        Verify each completed step as it appears.  Once ``\\boxed{}``
-        appears, signal completion.
-    """
-
-    def __init__(
-        self,
-        name: str,
-        grid: list,
-        start_pos: tuple,
-        exit_pos: tuple,
-        llm_server: dict,
-        prompt: str,
-        question_type: str = "right_turns",
-        newline_threshold: int = 10,
-        max_corrections: int = 5,
-        answer_start_token: str = "</think>",
-        async_execution: bool = True,
-        warmup_newlines: int = 0,
-    ):
-        super().__init__(name)
-        self.grid = grid
-        self.start_pos = start_pos
-        self.exit_pos = exit_pos
-        self.llm_server = llm_server
-        self.prompt = prompt
-        self.question_type = question_type
-        self.newline_threshold = newline_threshold
-        self.max_corrections = max_corrections
-        self.answer_start_token = answer_start_token
-        self.async_execution = async_execution
-        self.warmup_newlines = warmup_newlines
-
-        # Build the structured prompt that will be injected after </think>
-        self._structured_prompt = _build_maze_structured_prompt(question_type)
-        # Build the thinking-phase side-stream prompt (in LLM's own voice)
-        self._thinking_phase_prompt = _build_maze_thinking_phase_prompt(question_type)
-        # A unique marker to detect whether we already injected it
-        self._structured_marker = ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):"
-
-        # ---- state ----
-        self._think_phase_corrections = 0
-        self._verified_path_complete = False  # True if path reaches E
-
-    # ------------------------------------------------------------------
-    #  helpers
-    # ------------------------------------------------------------------
-    def _count_feedback_blocks(self, text: str) -> int:
-        return len(re.findall(r'\[VERIFIER FEEDBACK[^\]]*\]', text))
-
-    def _is_in_thinking_phase(self, generated_text: str) -> bool:
-        return self.answer_start_token not in generated_text
-
-    def _structured_prompt_injected(self, generated_text: str) -> bool:
-        """Check if structured format was already injected after </think>."""
-        if self.answer_start_token not in generated_text:
-            return False
-        after_think = generated_text.split(self.answer_start_token, 1)[1]
-        return self._structured_marker in after_think
-
-    @staticmethod
-    def detect_question_type(prompt: str) -> str:
-        """Auto-detect question type from prompt text."""
-        prompt_lower = prompt.lower()
-        if "right turn" in prompt_lower or "right-turn" in prompt_lower:
-            return "right_turns"
-        if "left turn" in prompt_lower or "left-turn" in prompt_lower:
-            return "total_turns"
-        if "total" in prompt_lower and "turn" in prompt_lower:
-            return "total_turns"
-        if "turn" in prompt_lower:
-            return "right_turns"
-        return "relative_position"
-
-    def _verify_relative_position_answer(self, boxed_answer: str) -> Tuple[bool, Optional[str]]:
-        """Verify a relative-position boxed answer (A=Yes / B=No).
-
-        Parses the question from ``self.prompt`` to determine the asked
-        direction, computes the true relative direction of E from S,
-        and checks whether the model's Yes/No answer is correct.
-
-        Returns ``(is_correct, feedback_or_None)``.
-        """
-        # Map boxed letter → Yes / No
-        answer_map = {"A": "Yes", "B": "No"}
-        model_yn = answer_map.get(boxed_answer.strip().upper())
-        if model_yn is None:
-            # Not A or B – can't verify
-            return True, None
-
-        # --- Parse the asked direction from the prompt ---
-        # Patterns: "directly to the left of the starting point (S)"
-        #           "directly below the starting point (S)"
-        #           "to the top right of the starting point (S)"
-        m = re.search(
-            r'Is the exit \(E\)\s+(.*?)\s+(?:of\s+)?the starting point \(S\)',
-            self.prompt, re.IGNORECASE,
-        )
-        if not m:
-            return True, None  # can't parse question, skip verification
-
-        asked_raw = m.group(1).strip().lower()
-        # Remove trailing comma and extra clauses like ", with no ..."
-        asked_raw = re.sub(r',.*', '', asked_raw).strip()
-
-        # --- Compute actual relative direction ---
-        actual = compute_relative_direction(self.start_pos, self.exit_pos)
-
-        # --- Determine expected Yes / No ---
-        # "directly to the left … with no vertical displacement"
-        #  → same row, E col < S col  → actual in {"west"}
-        # "directly below … with no horizontal displacement"
-        #  → same col, E row > S row  → actual in {"south"}
-        # "to the top right" → E north-east of S → actual == "northeast"
-        direction_keywords = {
-            "directly to the left":   {"west"},
-            "directly to the right":  {"east"},
-            "directly above":         {"north"},
-            "directly below":         {"south"},
-            "to the top left":        {"northwest"},
-            "to the top right":       {"northeast"},
-            "to the bottom left":     {"southwest"},
-            "to the bottom right":    {"southeast"},
-        }
-
-        expected_dirs = direction_keywords.get(asked_raw)
-        if expected_dirs is None:
-            return True, None  # unrecognised pattern, skip
-
-        expected_yn = "Yes" if actual in expected_dirs else "No"
-
-        if model_yn == expected_yn:
-            return True, None
-
-        # --- Build feedback ---
-        sr, sc = self.start_pos
-        er, ec = self.exit_pos
-        correct_letter = 'A' if expected_yn == 'Yes' else 'B'
-        feedback = (
-            f"\n\n[VERIFIER FEEDBACK for relative position:\n"
-            f"  ✗ Your answer {boxed_answer} ({model_yn}) is incorrect.\n"
-            f"  IMPORTANT: In this task, \"{asked_raw}\" means the GENERAL "
-            f"COMPASS DIRECTION, NOT immediate adjacency. It asks whether E "
-            f"is in the {actual} direction from S, regardless of distance or "
-            f"walls between them.]\n\n"
-        )
-        return False, feedback
-
-    # ------------------------------------------------------------------
-    #  _parse_steps_from_text – parse structured steps from side-stream
-    # ------------------------------------------------------------------
-    def _parse_steps_from_text(self, text: str):
-        """
-        Parse all structured maze steps from text.
-
-        Returns list of parsed step dicts.
-        """
-        steps = []
-
-        step_pattern = re.compile(
-            r'>>>\s*STEP\s+(\d+):\s*Move\s+\w+\s+from\s+\([^)]+\)\s+to\s+\([^)]+\).*?'
-            r'Running count:\s*Right\s*=\s*\d+\s*,\s*Left\s*=\s*\d+[^\n]*',
-            re.IGNORECASE | re.DOTALL
-        )
-
-        for match in step_pattern.finditer(text):
-            parsed = parse_maze_step(match.group(0))
-            if parsed:
-                steps.append(parsed)
-
-        return steps
-
-    def _verify_all_steps(self, steps):
-        """
-        Verify a sequence of parsed maze steps against the grid.
-
-        Returns:
-            (all_valid, first_error_step_num, errors, final_pos, final_dir,
-             right_count, left_count, total_count)
-        """
-        pos = self.start_pos
-        direction = Direction.NONE
-        right_count = 0
-        left_count = 0
-        total_count = 0
-
-        for step in steps:
-            is_valid, errors, state = verify_maze_step(
-                step=step,
-                grid=self.grid,
-                expected_from_pos=pos,
-                prev_direction=direction,
-                expected_right_count=right_count,
-                expected_left_count=left_count,
-                expected_total_count=total_count,
-            )
-
-            if not is_valid:
-                return (False, step.get('step_num', 0), errors,
-                        pos, direction, right_count, left_count, total_count)
-
-            pos = state['new_pos']
-            direction = state['new_direction']
-            right_count = state['new_right']
-            left_count = state['new_left']
-            total_count = state['new_total']
-
-        return (True, None, [], pos, direction,
-                right_count, left_count, total_count)
-
-    # ------------------------------------------------------------------
-    #  _side_stream_maze_steps – streams tokens to get traced path
-    # ------------------------------------------------------------------
-    async def _side_stream_maze_steps(self, text_so_far: str, max_new_tokens: int = 300) -> str:
-        """
-        Send ``prompt + text_so_far`` to vLLM, stream at most
-        *max_new_tokens* tokens, and return the generated text.
-
-        ``text_so_far`` is expected to end with the structured maze step
-        prompt so the model outputs its traced steps.
-        """
-        logger.info(
-            f"[Maze Side-stream] Starting path extraction\n"
-            f"  Maze: S={self.start_pos}, E={self.exit_pos}\n"
-            f"  Max new tokens: {max_new_tokens}"
-        )
-
-        payload = deepcopy(self.llm_server["payload"])
-        payload["prompt"] = self.prompt + text_so_far
-        payload["max_tokens"] = max_new_tokens
-        payload.pop("logprobs", None)
-
-        generated = ""
-
-        async with httpx.AsyncClient(timeout=None) as client:
-            async with client.stream(
-                "POST",
-                self.llm_server["url"],
-                headers=self.llm_server["headers"],
-                json=payload,
-            ) as response:
-                async for line in response.aiter_lines():
-                    if line.startswith("data: "):
-                        data = line[len("data: "):].strip()
-                        if data == "[DONE]":
-                            break
-                        chunk = json.loads(data)["choices"][0]["text"]
-                        generated += chunk
-                        logger.debug(f"[Maze Side-stream] chunk: {chunk!r}")
-
-                        # Stop if we see FINAL ANSWER or \boxed
-                        if '\\boxed' in generated or '>>> FINAL ANSWER' in generated:
-                            break
-
-        logger.info(
-            f"[Maze Side-stream] Generated {len(generated)} chars"
-        )
-        return generated
-
-    # ------------------------------------------------------------------
-    #  _extract_boxed_answer
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _extract_boxed_answer(text: str) -> Optional[str]:
-        """Extract the content of the last \\boxed{...} in text."""
-        matches = list(re.finditer(r'\\boxed\{', text))
-        if not matches:
-            return None
-        last_match = matches[-1]
-        start = last_match.end()
-        brace_count = 1
-        end = start
-        while end < len(text) and brace_count > 0:
-            if text[end] == '{':
-                brace_count += 1
-            elif text[end] == '}':
-                brace_count -= 1
-            end += 1
-        return text[start:end - 1].strip()
-
-    # ------------------------------------------------------------------
-    #  step_extractor
-    # ------------------------------------------------------------------
-    def step_extractor(self, chunk: str, generated_text: str):
-        """
-        Phase 1 (thinking): trigger at every newline_threshold multiple
-            (after warmup).
-        Phase 2 (after </think>): trigger on structured steps or boxed
-            answer.
-        """
-        # ===== PHASE 1: still inside <think> =====
-        if self._is_in_thinking_phase(generated_text):
-            if self._think_phase_corrections >= self.max_corrections:
-                return False, None
-
-            total_double_newlines = generated_text.count('\n\n')
-
-            if total_double_newlines < self.warmup_newlines:
-                return False, None
-
-            past_warmup = total_double_newlines - self.warmup_newlines
-            if (generated_text.endswith('\n\n')
-                    and past_warmup >= 0
-                    and past_warmup % self.newline_threshold == 0):
-                logger.info(
-                    f"[Maze step_extractor] Phase 1 trigger: \\n\\n count={total_double_newlines} "
-                    f"(warmup={self.warmup_newlines}, past_warmup={past_warmup}, "
-                    f"threshold={self.newline_threshold})"
-                )
-                return True, generated_text
-
-            return False, None
-
-        # ===== PHASE 2: after </think> =====
-
-        # 2a: structured prompt not yet injected → trigger immediately
-        if not self._structured_prompt_injected(generated_text):
-            logger.info(
-                "[Maze step_extractor] Phase 2a: </think> detected, "
-                "structured prompt not yet injected."
-            )
-            return True, generated_text
-
-        # 2b: structured prompt injected — verify steps / boxed answer
-        think_end_pos = generated_text.find(self.answer_start_token) + len(self.answer_start_token)
-        text_after_think = generated_text[think_end_pos:]
-
-        # Strip out the injected <format>...</format> template so we only
-        # look at actual model output (which starts after the last
-        # ">>> LOCATE START AND EXIT (0-indexed, ...):\n" line that ends the injected prompt).
-        last_marker_pos = text_after_think.rfind(self._structured_marker)
-        if last_marker_pos >= 0:
-            # Model output starts right after the marker line
-            model_output_start = last_marker_pos + len(self._structured_marker)
-            text_after_think = text_after_think[model_output_start:]
-            text_start_offset = think_end_pos + model_output_start
-        else:
-            text_start_offset = think_end_pos
-
-        # Skip past feedback blocks
-        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
-        last_feedback_end = 0
-        for match in feedback_pattern.finditer(text_after_think):
-            last_feedback_end = match.end()
-        text = text_after_think[last_feedback_end:]
-        text_start = text_start_offset + last_feedback_end
-
-        # For turn-counting questions, check for structured steps
-        if self.question_type in ("right_turns", "total_turns"):
-            # Check for complete step (with Running count including Right=N, Left=N)
-            step_pattern = re.compile(
-                r'(>>>\s*STEP\s+(\d+):\s*Move\s+\w+\s+from\s+\([^)]+\)\s+to\s+\([^)]+\).*?'
-                r'Running count:\s*Right\s*=\s*\d+\s*,\s*Left\s*=\s*\d+[^\n]*)',
-                re.IGNORECASE | re.DOTALL
-            )
-            all_steps = list(step_pattern.finditer(text))
-
-            if all_steps:
-                last_step = all_steps[-1]
-                # Check if next step started (current already verified)
-                text_after = text[last_step.end():]
-                next_step = re.search(r'>>>\s*STEP\s+\d+', text_after, re.IGNORECASE)
-                if not next_step:
-                    end_pos = text_start + last_step.end()
-                    return True, generated_text[:end_pos]
-                return False, None
-
-            # Check LOCATE section
-            locate_pattern = re.compile(
-                r'(LOCATE START AND EXIT.*?E position:\s*\([^)]+\))',
-                re.IGNORECASE | re.DOTALL
-            )
-            locate_match = locate_pattern.search(text)
-            if locate_match:
-                step1_start = re.search(r'>>>\s*STEP\s+1', text[locate_match.end():], re.IGNORECASE)
-                if step1_start:
-                    end_pos = text_start + locate_match.end()
-                    return True, generated_text[:end_pos]
-
-        # Check for boxed answer (any question type)
-        boxed = _find_complete_boxed(text)
-        if boxed:
-            end_pos = text_start + boxed.end()
-            return True, generated_text[:end_pos]
-
-        return False, None
-
-    # ------------------------------------------------------------------
-    #  verify
-    # ------------------------------------------------------------------
-    async def verify(self, step: str, token_index: int, event, event_info):
-        """
-        Case 1 -- still in thinking (no </think>):
-            Fork side-stream to get traced path steps, verify each.
-        Case 2 -- after </think>:
-            Verify structured steps and/or final answer.
-        """
-
-        # ==================================================================
-        # CASE 1: Thinking phase – side-stream path verification
-        # ==================================================================
-        if self.answer_start_token not in step:
-            total_dn = step.count('\n\n')
-            logger.info(
-                f"[Maze Phase 1] Thinking-phase verification triggered\n"
-                f"  \\n\\n count  : {total_dn}\n"
-                f"  Thinking len : {len(step)} chars"
-            )
-
-            # Build text with injected prompt for step extraction
-            # Uses the LLM's own voice: "Let me output the current steps..."
-            text_with_prompt = step + self._thinking_phase_prompt
-
-            # Side-stream: get path steps from the model
-            side_output = await self._side_stream_maze_steps(
-                text_with_prompt, max_new_tokens=300
-            )
-
-            if not side_output or len(side_output.strip()) < 20:
-                logger.info(
-                    "[Maze Phase 1] Insufficient output from side-stream. "
-                    "Letting model continue thinking."
-                )
-                return step, None
-
-            # Combine the prompt header with side output for parsing
-            full_side_text = (
-                ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):\n" + side_output
-            )
-
-            # First verify LOCATE section
-            locate_valid, locate_errors = verify_locate_section(
-                full_side_text, self.start_pos, self.exit_pos
-            )
-
-            if not locate_valid:
-                self._think_phase_corrections += 1
-                error_summary = "; ".join(locate_errors)
-                logger.info(
-                    f"[Maze Phase 1] LOCATE section errors: {error_summary}\n"
-                    f"  Action: Inject feedback into thinking trace\n"
-                    f"  Corrections: {self._think_phase_corrections}/{self.max_corrections}"
-                )
-                thinking_feedback = (
-                    f"\n\nWait, I think I have the wrong positions. "
-                    f"{error_summary}. "
-                    f"Let me re-examine the maze grid carefully to find S and E.\n"
-                )
-                if not event.is_set():
-                    event_info["generated_text"] = step
-                    event_info["feedback"] = thinking_feedback
-                    event_info["correction_index"] = token_index
-                    event_info["errors"] = locate_errors
-                    event_info["phase"] = "rollback_to_thinking"
-                    event.set()
-                return step, thinking_feedback
-
-            # Parse and verify steps
-            steps = self._parse_steps_from_text(full_side_text)
-
-            if not steps:
-                logger.info(
-                    "[Maze Phase 1] No structured steps found in side-stream. "
-                    "Letting model continue thinking."
-                )
-                return step, None
-
-            (all_valid, err_step_num, errors, final_pos,
-             final_dir, r_count, l_count, t_count) = self._verify_all_steps(steps)
-
-            if not all_valid:
-                error_summary = "; ".join(errors)
-                self._think_phase_corrections += 1
-                logger.info(
-                    f"[Maze Phase 1] INVALID step {err_step_num}\n"
-                    f"  Error(s) : {error_summary}\n"
-                    f"  Action   : Inject feedback into thinking trace\n"
-                    f"  Corrections: {self._think_phase_corrections}/{self.max_corrections}"
-                )
-                thinking_feedback = (
-                    f"\n\nWait, I made an error at Step {err_step_num}. "
-                    f"{error_summary}. "
-                    f"Let me re-trace the path more carefully from the correct position.\n"
-                )
-                if not event.is_set():
-                    event_info["generated_text"] = step
-                    event_info["feedback"] = thinking_feedback
-                    event_info["correction_index"] = token_index
-                    event_info["errors"] = errors
-                    event_info["phase"] = "rollback_to_thinking"
-                    event.set()
-                return step, thinking_feedback
-
-            # All steps valid — check if path is complete (reached E)
-            if final_pos == self.exit_pos:
-                self._verified_path_complete = True
-                logger.info(
-                    f"[Maze Phase 1] VALID COMPLETE path to E={self.exit_pos}\n"
-                    f"  Steps: {len(steps)}, Right={r_count}, Left={l_count}, Total={t_count}\n"
-                    f"  Action: Inject early-stop + </think> + structured format."
-                )
-                # Include the structured prompt directly after </think>
-                # so the model immediately starts filling in the answer format
-                # (skips the separate Phase 2a injection round-trip).
-                early_stop_msg = (
-                    f"\n\nWait, I have successfully traced the path from "
-                    f"S={self.start_pos} to E={self.exit_pos} with "
-                    f"{len(steps)} steps. "
-                    f"Right turns={r_count}, Left turns={l_count}, "
-                    f"Total turns={t_count}. "
-                    f"This path has been verified as correct. "
-                    f"Let me give the final answer.\n"
-                    f"{self.answer_start_token}"
-                    f"{self._structured_prompt}"
-                )
-                if not event.is_set():
-                    event_info["generated_text"] = step
-                    event_info["feedback"] = early_stop_msg
-                    event_info["correction_index"] = token_index
-                    event_info["phase"] = "early_stop_answer"
-                    event_info["verified_counts"] = {
-                        "right": r_count,
-                        "left": l_count,
-                        "total": t_count,
-                        "steps": len(steps),
-                    }
-                    event.set()
-                return step, early_stop_msg
-
-            else:
-                logger.info(
-                    f"[Maze Phase 1] VALID PARTIAL path\n"
-                    f"  Current pos: {final_pos}, Target: {self.exit_pos}\n"
-                    f"  Steps so far: {len(steps)}\n"
-                    f"  Action: No error, let model keep thinking."
-                )
-                return step, None
-
-        # ==================================================================
-        # CASE 2a: </think> present but structured prompt not yet injected
-        # ==================================================================
-        if not self._structured_prompt_injected(step):
-            logger.info(
-                "[Maze Phase 2a] </think> detected. "
-                "Injecting structured step format."
-            )
-            if not event.is_set():
-                event_info["generated_text"] = step
-                event_info["feedback"] = self._structured_prompt
-                event_info["correction_index"] = token_index
-                event_info["phase"] = "inject_structured_prompt"
-                event.set()
-            return step, self._structured_prompt
-
-        # ==================================================================
-        # CASE 2b: Structured prompt injected — verify output
-        # ==================================================================
-
-        num_corrections = self._count_feedback_blocks(step)
-        if num_corrections >= self.max_corrections:
-            fb = "\nthe answer is \\boxed{no solution}"
-            if not event.is_set():
-                event_info["generated_text"] = step
-                event_info["feedback"] = fb
-                event_info["correction_index"] = token_index
-                event_info["errors"] = ["Max corrections reached"]
-                event_info["phase"] = "standard_verify"
-                event.set()
-            return step, fb
-
-        think_end_pos = step.find(self.answer_start_token) + len(self.answer_start_token)
-        text_after_think = step[think_end_pos:]
-
-        # Strip the injected <format>...</format> template — only look at
-        # actual model output starting from the last ">>> LOCATE START AND EXIT (0-indexed, ...)" marker.
-        last_marker_pos = text_after_think.rfind(self._structured_marker)
-        if last_marker_pos >= 0:
-            text_after_think = text_after_think[last_marker_pos:]
-
-        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
-        last_feedback_end = 0
-        for match in feedback_pattern.finditer(text_after_think):
-            last_feedback_end = match.end()
-        recent_text = text_after_think[last_feedback_end:]
-
-        # --- Verify LOCATE section ---
-        locate_match = re.search(r'LOCATE START AND EXIT', recent_text, re.IGNORECASE)
-        if locate_match:
-            step1_start = re.search(r'>>>\s*STEP\s+1', recent_text, re.IGNORECASE)
-            if step1_start or '\\boxed' in recent_text:
-                if step1_start:
-                    locate_text = recent_text[locate_match.start():step1_start.start()]
-                else:
-                    locate_text = recent_text[locate_match.start():]
-                is_valid, loc_errors = verify_locate_section(
-                    locate_text, self.start_pos, self.exit_pos
-                )
-                if not is_valid:
-                    feedback = format_locate_feedback(loc_errors)
-                    if not event.is_set():
-                        event_info["generated_text"] = step
-                        event_info["feedback"] = feedback
-                        event_info["correction_index"] = token_index
-                        event_info["errors"] = loc_errors
-                        event_info["phase"] = "standard_verify"
-                        event.set()
-                    return step, feedback
-
-        # --- Verify structured steps ---
-        if self.question_type in ("right_turns", "total_turns"):
-            step_pattern = re.compile(
-                r'(>>>\s*STEP\s+(\d+):\s*Move\s+\w+\s+from\s+\([^)]+\)\s+to\s+\([^)]+\).*?'
-                r'Running count:[^\n]+)',
-                re.IGNORECASE | re.DOTALL
-            )
-            # Find steps in recent_text (after last feedback) to know what to verify
-            recent_step_matches = list(step_pattern.finditer(recent_text))
-
-            if recent_step_matches:
-                last_match = recent_step_matches[-1]
-                last_step_text = last_match.group(0)
-                last_step_num = int(last_match.group(2))
-                parsed = parse_maze_step(last_step_text)
-
-                if parsed:
-                    # For state reconstruction, gather ALL steps from the
-                    # full text (not just recent_text).  When a step number
-                    # appears multiple times (original + corrections), only
-                    # the LAST occurrence before the target step is used.
-                    all_full_matches = list(step_pattern.finditer(text_after_think))
-                    state = self._get_state_before_step_phase2(
-                        text_after_think, last_step_num, all_full_matches
-                    )
-
-                    is_valid, errors, new_state = verify_maze_step(
-                        step=parsed,
-                        grid=self.grid,
-                        expected_from_pos=state['position'],
-                        prev_direction=state['direction'],
-                        expected_right_count=state['right_count'],
-                        expected_left_count=state['left_count'],
-                        expected_total_count=state['total_count'],
-                    )
-
-                    if not is_valid:
-                        feedback = format_maze_feedback(errors, last_step_num)
-                        if not event.is_set():
-                            event_info["generated_text"] = step
-                            event_info["feedback"] = feedback
-                            event_info["correction_index"] = token_index
-                            event_info["errors"] = errors
-                            event_info["phase"] = "standard_verify"
-                            event.set()
-                        return step, feedback
-
-        # --- Check for boxed answer ---
-        boxed_answer = self._extract_boxed_answer(recent_text)
-        if boxed_answer is not None:
-            logger.info(f"[Maze Phase 2b] Extracted boxed answer: {boxed_answer}")
-
-            # For relative_position questions, verify the Yes/No answer
-            if self.question_type == "relative_position":
-                is_correct, rp_feedback = self._verify_relative_position_answer(boxed_answer)
-                if not is_correct and rp_feedback:
-                    logger.info(
-                        f"[Maze Phase 2b] Relative position answer '{boxed_answer}' is INCORRECT."
-                    )
-                    if not event.is_set():
-                        event_info["generated_text"] = step
-                        event_info["feedback"] = rp_feedback
-                        event_info["correction_index"] = token_index
-                        event_info["errors"] = [f"Wrong relative position answer: {boxed_answer}"]
-                        event_info["phase"] = "standard_verify"
-                        event.set()
-                    return step, rp_feedback
-
-            if not event.is_set():
-                event_info["generated_text"] = step
-                event_info["feedback"] = ""
-                event_info["correction_index"] = token_index
-                event_info["phase"] = "final_answer_correct"
-                event.set()
-            return step, None
-
-        return step, None
-
-    # ------------------------------------------------------------------
-    #  _get_state_before_step_phase2 – reconstruct state for Phase 2
-    # ------------------------------------------------------------------
-    def _get_state_before_step_phase2(self, text: str, target_step_num: int,
-                                       all_step_matches: list) -> dict:
-        """Reconstruct state before a given step from Phase 2 structured output.
-        
-        When a step number appears multiple times (original + corrections after
-        verifier feedback), only the LAST occurrence of each step number is used,
-        so that corrected steps override earlier invalid ones.
-        """
-        state = {
-            'position': self.start_pos,
-            'direction': Direction.NONE,
-            'right_count': 0,
-            'left_count': 0,
-            'total_count': 0,
-        }
-
-        # Collect the last occurrence of each step number before the target
-        last_by_num = {}
-        for match in all_step_matches:
-            step_num = int(match.group(2))
-            if step_num >= target_step_num:
-                continue
-            last_by_num[step_num] = match  # later occurrences overwrite earlier
-
-        # Replay in step-number order
-        for step_num in sorted(last_by_num.keys()):
-            parsed = parse_maze_step(last_by_num[step_num].group(0))
-            if not parsed:
-                continue
-
-            direction = parsed['direction']
-            to_pos = parsed['to_pos']
-
-            turn_type = get_expected_turn_type(state['direction'], direction)
-            if turn_type == 'RIGHT_TURN':
-                state['right_count'] += 1
-                state['total_count'] += 1
-            elif turn_type == 'LEFT_TURN':
-                state['left_count'] += 1
-                state['total_count'] += 1
-
-            state['position'] = to_pos
-            state['direction'] = direction
-
-        return state
-
-    # ------------------------------------------------------------------
-    #  fix
-    # ------------------------------------------------------------------
-    async def fix(self, generated_text: str, event_info: dict, fix_method=None):
-        """Apply the appropriate fix depending on the phase."""
-        phase = event_info.get("phase", "standard_verify")
-
-        if phase == "rollback_to_thinking":
-            base_text = event_info["generated_text"]
-            result = base_text.rstrip() + event_info["feedback"]
-            logger.info(
-                f"[Maze fix] Phase: rollback_to_thinking\n"
-                f"  -> Appended error feedback into <think> trace.\n"
-                f"  -> Think-phase corrections: {self._think_phase_corrections}/{self.max_corrections}"
-            )
-            return result
-
-        if phase == "early_stop_answer":
-            base_text = event_info["generated_text"]
-            result = base_text.rstrip() + event_info["feedback"]
-            counts = event_info.get("verified_counts", {})
-            logger.info(
-                f"[Maze fix] Phase: early_stop_answer\n"
-                f"  -> Path verified: {counts.get('steps', '?')} steps, "
-                f"R={counts.get('right', '?')}, L={counts.get('left', '?')}, "
-                f"T={counts.get('total', '?')}\n"
-                f"  -> Injecting early-stop + </think> + structured format."
-            )
-            return result
-
-        if phase == "inject_structured_prompt":
-            logger.info(
-                "[Maze fix] Phase: inject_structured_prompt\n"
-                "  -> Appending structured step format after </think>."
-            )
-            return event_info["generated_text"] + event_info["feedback"]
-
-        if phase == "final_answer_correct":
-            logger.info(
-                f"[Maze fix] Phase: final_answer_correct\n"
-                f"  -> Stopping generation."
-            )
-            return event_info["generated_text"]
-
-        # standard_verify
-        errors = event_info.get("errors", [])
-        error_summary = "; ".join(errors) if errors else "unknown"
-        logger.info(
-            f"[Maze fix] Phase: standard_verify\n"
-            f"  -> Error: {error_summary}\n"
-            f"  -> Appending [VERIFIER FEEDBACK] so model retries."
-        )
-        return event_info["generated_text"] + event_info["feedback"]
-
-
-# =====================================================================
-#  SpatialMap Thinking-Phase Prompts
-# =====================================================================
-
-
-def _build_spatialmap_format_block() -> str:
-    """
-    Build the ``<format>...</format>`` block that describes the structured
-    output template for SpatialMap tasks.
-
-    Re-used by both the side-stream (Phase 1) and the post-``</think>``
-    injection (Phase 2a).
-    """
-    return (
-        "<format>\n"
-        ">>> STEP 1: PARSE RELATIONSHIPS\n"
-        "    - [Full Name A] is to the [direction] of [Full Name B]\n"
-        "    - [Full Name C] is to the [direction] of [Full Name D]\n"
-        "    [... list ALL given relationships using FULL names exactly as in the question ...]\n"
-        "    (NO abbreviations, NO short forms, NO parenthetical aliases like 'Police Supply Store (PSS)')\n"
-        "\n"
-        ">>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS\n"
-        "    - Looking for: [target relationship / direction / count]\n"
-        "    - [Full Name A] is to the [direction] of [Full Name B]\n"
-        "    - [Full Name C] is to the [direction] of [Full Name D]\n"
-        "    [... list each derived relationship as a structured claim using FULL names ...]\n"
-        "    (Each claim MUST be in the form: '[Full Name] is to the [direction] of [Full Name]')\n"
-        "\n"
-        ">>> STEP 3: ANSWER\n"
-        "    - [state conclusion]\n"
-        "\n"
-        ">>> FINAL ANSWER: [answer text]\n"
-        "    \\boxed{LETTER}\n"
-        "</format>"
-    )
-
-
-def _build_spatialmap_thinking_phase_prompt(
-    parsed_relations: List[Dict],
-) -> str:
-    """
-    Build the side-stream prompt injected during the thinking phase.
-
-    Pre-fills STEP 1 with the known parsed relations (from the Z3 solver)
-    so the model jumps directly to STEP 2 analysis, maximising the chance
-    of producing verifiable directional claims within the token budget.
-
-    Written in the LLM's own first-person thinking voice so it blends
-    naturally with the ``<think>`` trace.
-    """
-    # Pre-fill STEP 1 from the ground-truth parsed relations
-    step1_lines = []
-    for rel in parsed_relations:
-        step1_lines.append(
-            f"    - {rel['A']} is to the {rel['direction']} of {rel['B']}"
-        )
-    step1_body = "\n".join(step1_lines) if step1_lines else "    (none)"
-
-    return (
-        "\n\nLet me organize what I have so far. I will list the given "
-        "relationships in STEP 1, then in STEP 2 I will state every "
-        "spatial claim I have derived.\n"
-        "IMPORTANT: I must use the FULL object names exactly as given in the question "
-        "(no abbreviations, no short forms, no aliases, no partial names, no parenthetical aliases like 'Store (S)').\n"
-        "Every claim must be in the form: '[Full Name] is to the [direction] of [Full Name]'\n"
-        "For direction I will use the full word: northeast, northwest, southeast, southwest, north, south, east, or west.\n\n"
-        ">>> STEP 1: PARSE RELATIONSHIPS (given)\n"
-        f"{step1_body}\n\n"
-        ">>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS (derived)\n"
-        "Based on my analysis so far, the derived relationships are:\n"
-    )
-
-
-def _build_spatialmap_structured_prompt() -> str:
-    """
-    Build the structured format prompt injected after ``</think>``.
-
-    Analogous to the maze's structured format injection — gives the
-    model a template to fill in so we can parse and verify each step.
-    """
-    format_block = _build_spatialmap_format_block()
-    return (
-        "\nLet me solve this step by step using the structured format.\n"
-        "IMPORTANT: I must use the FULL names of all objects exactly as they appear in the question. "
-        "NO abbreviations, NO short forms, NO parenthetical aliases.\n"
-        "Every relationship must be stated as: '[Full Name] is to the [direction] of [Full Name]'\n\n"
-        f"{format_block}\n"
-        ">>> STEP 1: PARSE RELATIONSHIPS\n"
-    )
-
-
-# =====================================================================
-#  ThinkingPhaseStepVerifierSpatialMapMonitor
-# =====================================================================
-
-
-class ThinkingPhaseStepVerifierSpatialMapMonitor(VerifyMonitor):
-    """
-    Monitor that verifies spatial-map directional claims during and after
-    thinking.
-
-    **No meta-prompt required** — works with a plain user prompt containing
-    just the map description and question.  Structure is injected by this
-    monitor after ``</think>`` (natural or early-stop), exactly like the
-    Maze monitor injects its step format.
-
-    Phase 1 – During ``<think>...</think>``:
-        Every N double-newlines (after warmup), fork a side-stream that
-        injects a structured step prompt, stream tokens, parse directional
-        claims from STEP 2, and verify them against Z3.
-
-    Phase 2a – ``</think>`` detected, structured prompt not yet injected:
-        Inject the structured step-by-step format template so the model
-        fills it in (STEP 1 → STEP 2 → STEP 3 → FINAL ANSWER → ``\\boxed{}``).
-
-    Phase 2b – Structured prompt injected, model is generating:
-        Verify directional claims in STEP 2 as they appear.  Once
-        ``\\boxed{}`` appears, signal completion.
-    """
-
-    def __init__(
-        self,
-        name: str,
-        problem_text: str,
-        llm_server: dict,
-        prompt: str,
-        newline_threshold: int = 15,
-        max_corrections: int = 5,
-        answer_start_token: str = "</think>",
-        async_execution: bool = True,
-        warmup_newlines: int = 0,
-    ):
-        super().__init__(name)
-        self.problem_text = problem_text
-        self.llm_server = llm_server
-        self.prompt = prompt
-        self.newline_threshold = newline_threshold
-        self.max_corrections = max_corrections
-        self.answer_start_token = answer_start_token
-        self.async_execution = async_execution
-        self.warmup_newlines = warmup_newlines
-
-        # Initialize Z3 solver with problem constraints
-        self.z3_solver = SpatialMapZ3Solver(problem_text)
-
-        # Build prompts for injection
-        self._structured_prompt = _build_spatialmap_structured_prompt()
-        self._thinking_phase_prompt = _build_spatialmap_thinking_phase_prompt(
-            self.z3_solver.parsed_relations,
-        )
-        # Marker to detect if structured prompt was already injected
-        self._structured_marker = ">>> STEP 1: PARSE RELATIONSHIPS"
-
-        # ---- state ----
-        self._think_phase_corrections = 0
-        self.verified_claims: Set[Tuple[str, str, str]] = set()
-
-        # ---- counting-question verification ----
-        self._counting_question = parse_counting_question(problem_text)
-        self._counting_options: Dict[str, str] = {}
-        # Strip trailing instruction paragraph for clean option parsing
-        _opts_text = re.split(r'\nFirst,', problem_text, maxsplit=1)[0]
-        if self._counting_question:
-            # Parse MCQ options from problem text (e.g., "A. 5\nB. 3\nC. 0\nD. 1")
-            raw_opts = re.findall(
-                r'([A-D])\.\s*(.+?)\s*(?=[A-D]\.|$)',
-                _opts_text, flags=re.DOTALL,
-            )
-            self._counting_options = {
-                k: v.strip().rstrip(".") for k, v in raw_opts
-            }
-            logger.info(
-                f"[SpatialMap] Counting question detected: "
-                f"direction={self._counting_question['direction']}, "
-                f"reference={self._counting_question['reference']}, "
-                f"options={self._counting_options}"
-            )
-        self._count_feedback_given = False
-        self._count_feedback_blocks_count = 0  # tracks cardinal count retry attempts
-
-        # ---- direction-question verification ----
-        self._direction_question = parse_direction_question(problem_text)
-        if self._direction_question:
-            logger.info(
-                f"[SpatialMap] Direction question detected: "
-                f"entity_a={self._direction_question['entity_a']}, "
-                f"entity_b={self._direction_question['entity_b']}"
-            )
-
-        # ---- object-question verification ----
-        self._object_question = parse_object_question(problem_text)
-        if self._object_question:
-            logger.info(
-                f"[SpatialMap] Object question detected: "
-                f"direction={self._object_question['direction']}, "
-                f"reference={self._object_question['reference']}"
-            )
-
-        # ---- Generic MCQ options (for direction & object Qs too) ----
-        if not self._counting_options:
-            raw_opts = re.findall(
-                r'([A-D])\.\s*(.+?)\s*(?=[A-D]\.|$)',
-                _opts_text, flags=re.DOTALL,
-            )
-            self._mcq_options: Dict[str, str] = {
-                k: v.strip().rstrip(".") for k, v in raw_opts
-            }
-        else:
-            self._mcq_options = dict(self._counting_options)
-
-        # Allow multiple retries for final-answer verification
-        self._max_final_answer_retries = 3
-        self._direction_feedback_count = 0
-        self._object_feedback_count = 0
-        self._diag_count_feedback_count = 0
-
-    @classmethod
-    def from_prompt(
-        cls,
-        problem_text: str,
-        llm_server: dict,
-        prompt: str,
-        newline_threshold: int = 15,
-        max_corrections: int = 5,
-        warmup_newlines: int = 0,
-        name: str = "spatialmap_thinking_verifier",
-    ) -> "ThinkingPhaseStepVerifierSpatialMapMonitor":
-        """
-        Convenience factory method.
-        """
-        return cls(
-            name=name,
-            problem_text=problem_text,
-            llm_server=llm_server,
-            prompt=prompt,
-            newline_threshold=newline_threshold,
-            max_corrections=max_corrections,
-            warmup_newlines=warmup_newlines,
-        )
-
-    # ------------------------------------------------------------------
-    #  helpers
-    # ------------------------------------------------------------------
-    def _count_feedback_blocks(self, text: str) -> int:
-        return len(re.findall(r'\[VERIFIER FEEDBACK[^\]]*\]', text))
-
-    def _is_in_thinking_phase(self, generated_text: str) -> bool:
-        return self.answer_start_token not in generated_text
-
-    def _structured_prompt_injected(self, generated_text: str) -> bool:
-        """Check if structured format was already injected after </think>."""
-        if self.answer_start_token not in generated_text:
-            return False
-        after_think = generated_text.split(self.answer_start_token, 1)[1]
-        return self._structured_marker in after_think
-
-    def _extract_new_claims(self, text: str) -> List[Dict]:
-        """
-        Extract new (not yet verified) directional claims from STEP 2 of
-        the most recent attempt (after last feedback block).
-        """
-        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]', re.DOTALL)
-        last_feedback_end = 0
-        for match in feedback_pattern.finditer(text):
-            last_feedback_end = match.end()
-
-        text_to_check = text[last_feedback_end:]
-
-        # Get full entity names from Z3 solver for abbreviation resolution
-        entity_names = list({
-            k[:-2] for k in self.z3_solver.entities if k.endswith('_x')
-        })
-
-        all_claims = extract_step2_claims(text_to_check, entity_names=entity_names)
-
-        new_claims = []
-        for claim in all_claims:
-            claim_key = (claim['A'], claim['direction'], claim['B'])
-            if claim_key not in self.verified_claims:
-                new_claims.append(claim)
-
-        return new_claims
-
-    # ------------------------------------------------------------------
-    #  _side_stream_spatialmap – streams tokens to get analysis
-    # ------------------------------------------------------------------
-    async def _side_stream_spatialmap(self, text_so_far: str, max_new_tokens: int = 400) -> str:
-        """
-        Send ``prompt + text_so_far`` to vLLM, stream at most
-        *max_new_tokens* tokens, and return the generated text.
-
-        ``text_so_far`` is expected to end with the structured spatial map
-        prompt so the model outputs its analysis steps.
-        """
-        logger.info(
-            f"[SpatialMap Side-stream] Starting analysis extraction\n"
-            f"  Relations: {len(self.z3_solver.parsed_relations)}\n"
-            f"  Max new tokens: {max_new_tokens}"
-        )
-
-        payload = deepcopy(self.llm_server["payload"])
-        payload["prompt"] = self.prompt + text_so_far
-        payload["max_tokens"] = max_new_tokens
-        payload.pop("logprobs", None)
-
-        generated = ""
-
-        async with httpx.AsyncClient(timeout=None) as client:
-            async with client.stream(
-                "POST",
-                self.llm_server["url"],
-                headers=self.llm_server["headers"],
-                json=payload,
-            ) as response:
-                async for line in response.aiter_lines():
-                    if line.startswith("data: "):
-                        data = line[len("data: "):].strip()
-                        if data == "[DONE]":
-                            break
-                        chunk = json.loads(data)["choices"][0]["text"]
-                        generated += chunk
-                        logger.debug(f"[SpatialMap Side-stream] chunk: {chunk!r}")
-
-                        # Stop if we see FINAL ANSWER or \boxed
-                        if '\\boxed' in generated or '>>> FINAL ANSWER' in generated:
-                            break
-
-        logger.info(
-            f"[SpatialMap Side-stream] Generated {len(generated)} chars"
-        )
-        return generated
-
-    # ------------------------------------------------------------------
-    #  step_extractor
-    # ------------------------------------------------------------------
-    def step_extractor(self, chunk: str, generated_text: str):
-        """
-        Phase 1 (thinking): trigger at every newline_threshold multiple
-            (after warmup).
-        Phase 2 (after </think>): trigger on structured steps or boxed
-            answer.
-        """
-        # ===== PHASE 1: still inside <think> =====
-        if self._is_in_thinking_phase(generated_text):
-            if self._think_phase_corrections >= self.max_corrections:
-                return False, None
-
-            total_double_newlines = generated_text.count('\n\n')
-
-            if total_double_newlines < self.warmup_newlines:
-                return False, None
-
-            past_warmup = total_double_newlines - self.warmup_newlines
-            if (generated_text.endswith('\n\n')
-                    and past_warmup >= 0
-                    and past_warmup % self.newline_threshold == 0):
-                logger.info(
-                    f"[SpatialMap step_extractor] Phase 1 trigger: \\n\\n count={total_double_newlines} "
-                    f"(warmup={self.warmup_newlines}, past_warmup={past_warmup}, "
-                    f"threshold={self.newline_threshold})"
-                )
-                return True, generated_text
-
-            return False, None
-
-        # ===== PHASE 2: after </think> =====
-
-        # 2a: structured prompt not yet injected → trigger immediately
-        if not self._structured_prompt_injected(generated_text):
-            logger.info(
-                "[SpatialMap step_extractor] Phase 2a: </think> detected, "
-                "structured prompt not yet injected."
-            )
-            return True, generated_text
-
-        # 2b: structured prompt injected — verify STEP 2 claims / boxed answer
-        think_end_pos = generated_text.find(self.answer_start_token) + len(self.answer_start_token)
-        text_after_think = generated_text[think_end_pos:]
-
-        # Strip out the injected <format>...</format> template so we only
-        # look at actual model output (which starts after the last marker).
-        last_marker_pos = text_after_think.rfind(self._structured_marker)
-        if last_marker_pos >= 0:
-            model_output_start = last_marker_pos + len(self._structured_marker)
-            text_after_think = text_after_think[model_output_start:]
-            text_start_offset = think_end_pos + model_output_start
-        else:
-            text_start_offset = think_end_pos
-
-        # Skip past feedback blocks
-        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
-        last_feedback_end = 0
-        for match in feedback_pattern.finditer(text_after_think):
-            last_feedback_end = match.end()
-        text = text_after_think[last_feedback_end:]
-        text_start = text_start_offset + last_feedback_end
-
-        # Check for STEP 2 section with claims
-        step2_pattern = re.compile(
-            r'>>>\s*STEP\s*2[:\s].*?(?=>>>\s*STEP\s*3|>>>\s*FINAL|\\boxed|$)',
-            re.DOTALL | re.IGNORECASE
-        )
-        step2_match = step2_pattern.search(text)
-
-        if step2_match:
-            # Check if STEP 3 or FINAL has started (STEP 2 is complete)
-            text_after_step2 = text[step2_match.end():]
-            step3_or_final = re.search(
-                r'>>>\s*(STEP\s*3|FINAL)',
-                text_after_step2,
-                re.IGNORECASE
-            )
-
-            if step3_or_final:
-                new_claims = self._extract_new_claims(text)
-                if new_claims:
-                    end_pos = text_start + step2_match.end()
-                    return True, generated_text[:end_pos]
-
-        # Check for boxed answer (trigger final verification)
-        boxed_match = _find_complete_boxed(text)
-        if boxed_match:
-            new_claims = self._extract_new_claims(text)
-            if new_claims:
-                end_pos = text_start + boxed_match.end()
-                return True, generated_text[:end_pos]
-            # Even if no new claims, boxed answer signals completion
-            end_pos = text_start + boxed_match.end()
-            return True, generated_text[:end_pos]
-
-        return False, None
-
-    # ------------------------------------------------------------------
-    #  verify
-    # ------------------------------------------------------------------
-    async def verify(self, step: str, token_index: int, event, event_info):
-        """
-        Case 1 -- still in thinking (no </think>):
-            Fork side-stream, parse claims, verify with Z3.
-        Case 2 -- after </think>:
-            2a: Inject structured prompt.
-            2b: Verify STEP 2 claims and/or final answer.
-        """
-
-        # ==================================================================
-        # CASE 1: Thinking phase – side-stream verification
-        # ==================================================================
-        if self.answer_start_token not in step:
-            total_dn = step.count('\n\n')
-            logger.info(
-                f"[SpatialMap Phase 1] Thinking-phase verification triggered\n"
-                f"  \\n\\n count  : {total_dn}\n"
-                f"  Thinking len : {len(step)} chars"
-            )
-
-            # Build text with injected prompt for analysis extraction
-            text_with_prompt = step + self._thinking_phase_prompt
-
-            # Side-stream: get analysis from the model
-            side_output = await self._side_stream_spatialmap(
-                text_with_prompt, max_new_tokens=800
-            )
-
-            if not side_output or len(side_output.strip()) < 20:
-                logger.info(
-                    "[SpatialMap Phase 1] Insufficient output from side-stream. "
-                    "Letting model continue thinking."
-                )
-                return step, None
-
-            # Parse directional claims directly from the side-stream output.
-            # The prompt pre-fills STEP 1 and ends at ">>> STEP 2:", so the
-            # model's output is already STEP 2 content — no header to search for.
-            entity_names = list({
-                k[:-2] for k in self.z3_solver.entities if k.endswith('_x')
-            })
-            claims = parse_directional_claims_from_text(
-                side_output, entity_names=entity_names
-            )
-
-            logger.info(
-                f"[SpatialMap Phase 1] Parsed {len(claims)} claims from side-stream.\n"
-                f"  Side-stream output (first 500 chars): {side_output[:500]!r}"
-            )
-
-            if not claims:
-                logger.info(
-                    "[SpatialMap Phase 1] No directional claims found in side-stream. "
-                    "Letting model continue thinking."
-                )
-                return step, None
-
-            # Verify each claim against Z3
-            for claim in claims:
-                claim_key = (claim['A'], claim['direction'], claim['B'])
-                if claim_key in self.verified_claims:
-                    continue
-
-                is_valid, errors = verify_spatialmap_step(
-                    claim=claim,
-                    z3_solver=self.z3_solver,
-                    add_if_valid=True,
-                )
-                self.verified_claims.add(claim_key)
-
-                if not is_valid:
-                    self._think_phase_corrections += 1
-                    error_summary = "; ".join(errors)
-                    logger.info(
-                        f"[SpatialMap Phase 1] INVALID claim: "
-                        f"{claim['A']} is {claim['direction']} of {claim['B']}\n"
-                        f"  Error(s) : {error_summary}\n"
-                        f"  Corrections: {self._think_phase_corrections}/{self.max_corrections}"
-                    )
-                    thinking_feedback = (
-                        f"\n\nWait, I think I made an error in my spatial reasoning. "
-                        f"{error_summary}. "
-                        f"Let me re-examine the relationships more carefully.\n"
-                    )
-                    if not event.is_set():
-                        event_info["generated_text"] = step
-                        event_info["feedback"] = thinking_feedback
-                        event_info["correction_index"] = token_index
-                        event_info["errors"] = errors
-                        event_info["phase"] = "rollback_to_thinking"
-                        event.set()
-                    return step, thinking_feedback
-
-            # All claims valid
-            logger.info(
-                f"[SpatialMap Phase 1] All {len(claims)} claims valid. "
-                f"Letting model continue thinking."
-            )
-            return step, None
-
-        # ==================================================================
-        # CASE 2a: </think> present but structured prompt not yet injected
-        # ==================================================================
-        if not self._structured_prompt_injected(step):
-            logger.info(
-                "[SpatialMap Phase 2a] </think> detected. "
-                "Injecting structured step format."
-            )
-            if not event.is_set():
-                event_info["generated_text"] = step
-                event_info["feedback"] = self._structured_prompt
-                event_info["correction_index"] = token_index
-                event_info["phase"] = "inject_structured_prompt"
-                event.set()
-            return step, self._structured_prompt
-
-        # ==================================================================
-        # CASE 2b: Structured prompt injected — verify output
-        # ==================================================================
-        num_corrections = self._count_feedback_blocks(step)
-        if num_corrections >= self.max_corrections:
-            fb = "\nthe answer is \\boxed{no solution}"
-            if not event.is_set():
-                event_info["generated_text"] = step
-                event_info["feedback"] = fb
-                event_info["correction_index"] = token_index
-                event_info["errors"] = ["Max corrections reached"]
-                event_info["phase"] = "standard_verify"
-                event.set()
-            return step, fb
-
-        think_end_pos = step.find(self.answer_start_token) + len(self.answer_start_token)
-        text_after_think = step[think_end_pos:]
-
-        # Strip the injected template — only look at model output after marker
-        last_marker_pos = text_after_think.rfind(self._structured_marker)
-        if last_marker_pos >= 0:
-            text_after_think = text_after_think[last_marker_pos:]
-
-        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
-        last_feedback_end = 0
-        for match in feedback_pattern.finditer(text_after_think):
-            last_feedback_end = match.end()
-        recent_text = text_after_think[last_feedback_end:]
-
-        # --- Verify STEP 2 claims ---
-        new_claims = self._extract_new_claims(recent_text)
-
-        for claim in new_claims:
-            claim_key = (claim['A'], claim['direction'], claim['B'])
-
-            is_valid, errors = verify_spatialmap_step(
-                claim=claim,
-                z3_solver=self.z3_solver,
-                add_if_valid=True,
-            )
-            self.verified_claims.add(claim_key)
-
-            if not is_valid:
-                feedback = format_spatialmap_feedback(errors, claim)
-                if not event.is_set():
-                    event_info["generated_text"] = step
-                    event_info["feedback"] = feedback
-                    event_info["correction_index"] = token_index
-                    event_info["errors"] = errors
-                    event_info["failed_step"] = claim
-                    event_info["phase"] = "standard_verify"
-                    event.set()
-                return step, feedback
-
-        # --- Check for boxed answer ---
-        boxed_match = _find_complete_boxed(recent_text)
-        if boxed_match:
-
-            # ==========================================================
-            # Direction-question verification
-            # ==========================================================
-            if (
-                self._direction_question
-                and num_corrections < self.max_corrections
-                and self._direction_feedback_count < self._max_final_answer_retries
-            ):
-                model_dir_text = parse_model_boxed_answer(
-                    recent_text, self._mcq_options
-                )
-                if model_dir_text:
-                    possible = get_possible_directions(
-                        self.z3_solver,
-                        self._direction_question["entity_a"],
-                        self._direction_question["entity_b"],
-                    )
-                    logger.info(
-                        f"[SpatialMap Phase 2b] Direction check: "
-                        f"model={model_dir_text}, possible={possible}"
-                    )
-                    if model_dir_text not in possible:
-                        self._direction_feedback_count += 1
-                        # Find which MCQ options are consistent
-                        valid_options = [
-                            letter for letter, val in self._mcq_options.items()
-                            if val.strip().lower().rstrip(".") in possible
-                        ]
-                        if len(valid_options) == 1:
-                            # Force correct answer
-                            feedback = (
-                                f"\n\n[VERIFIER FEEDBACK: Direction error!\n"
-                                f"  '{model_dir_text.title()}' is "
-                                f"impossible for "
-                                f"{self._direction_question['entity_a']} "
-                                f"relative to "
-                                f"{self._direction_question['entity_b']} "
-                                f"based on the given constraints.]\n\n"
-                                f">>> STEP 3: ANSWER\n"
-                            )
-                        else:
-                            possible_str = ", ".join(
-                                d.title() for d in possible
-                            )
-                            feedback = (
-                                f"\n\n[VERIFIER FEEDBACK: Direction error!\n"
-                                f"  '{model_dir_text.title()}' is "
-                                f"impossible for "
-                                f"{self._direction_question['entity_a']} "
-                                f"relative to "
-                                f"{self._direction_question['entity_b']} "
-                                f"based on the given constraints.\n"
-                                f"  Please reconsider and choose the "
-                                f"correct option.]\n\n"
-                                f">>> STEP 3: ANSWER\n"
-                            )
-                        if not event.is_set():
-                            event_info["generated_text"] = step
-                            event_info["feedback"] = feedback
-                            event_info["correction_index"] = token_index
-                            event_info["errors"] = [
-                                f"Direction '{model_dir_text}' impossible; "
-                                f"possible: {possible}"
-                            ]
-                            event_info["phase"] = "standard_verify"
-                            event.set()
-                        return step, feedback
-
-            # ==========================================================
-            # Object-question verification
-            # ==========================================================
-            if (
-                self._object_question
-                and num_corrections < self.max_corrections
-                and self._object_feedback_count < self._max_final_answer_retries
-            ):
-                model_obj_text = parse_model_boxed_answer(
-                    recent_text, self._mcq_options
-                )
-                boxed_raw = re.findall(
-                    r'\\boxed\{([^}]*)\}', recent_text
-                )
-                model_letter = (
-                    boxed_raw[-1].strip().upper() if boxed_raw else None
-                )
-
-                if model_letter:
-                    consistent = get_consistent_object_options(
-                        self.z3_solver,
-                        self._object_question["direction"],
-                        self._object_question["reference"],
-                        self._mcq_options,
-                    )
-                    logger.info(
-                        f"[SpatialMap Phase 2b] Object check: "
-                        f"model={model_letter}, "
-                        f"consistent_options={consistent}"
-                    )
-                    if model_letter not in consistent:
-                        self._object_feedback_count += 1
-                        odir = self._object_question["direction"]
-                        oref = self._object_question["reference"]
-                        if len(consistent) == 1:
-                            correct_name = self._mcq_options.get(
-                                consistent[0], consistent[0]
-                            )
-                            feedback = (
-                                f"\n\n[VERIFIER FEEDBACK: Object error!\n"
-                                f"  '{model_obj_text}' cannot be "
-                                f"{odir} of {oref} based on the "
-                                f"given constraints.\n"
-                                f"  The only consistent option is "
-                                f"{consistent[0]}. {correct_name}.\n"
-                                f"  Please select option "
-                                f"{consistent[0]}.]\n\n"
-                                f">>> STEP 3: ANSWER\n"
-                            )
-                        else:
-                            valid_names = [
-                                f"{l}. {self._mcq_options.get(l, l)}"
-                                for l in consistent
-                            ]
-                            feedback = (
-                                f"\n\n[VERIFIER FEEDBACK: Object error!\n"
-                                f"  '{model_obj_text}' cannot be "
-                                f"{odir} of {oref} based on the "
-                                f"given constraints.\n"
-                                f"  The consistent options are: "
-                                f"{', '.join(valid_names)}.\n"
-                                f"  Please reconsider and choose the "
-                                f"correct option.]\n\n"
-                                f">>> STEP 3: ANSWER\n"
-                            )
-                        if not event.is_set():
-                            event_info["generated_text"] = step
-                            event_info["feedback"] = feedback
-                            event_info["correction_index"] = token_index
-                            event_info["errors"] = [
-                                f"Object '{model_obj_text}' impossible "
-                                f"in {odir} of {oref}; "
-                                f"consistent: {consistent}"
-                            ]
-                            event_info["phase"] = "standard_verify"
-                            event.set()
-                        return step, feedback
-
-            # ==========================================================
-            # Counting-question verification (cardinal + diagonal)
-            # ==========================================================
-            if (
-                self._counting_question
-                and num_corrections < self.max_corrections
-            ):
-                direction = self._counting_question["direction"]
-                reference = self._counting_question["reference"]
-                is_cardinal = direction in (
-                    "north", "south", "east", "west"
-                )
-
-                if is_cardinal:
-                    # --- Cardinal: GT is always 0 ---
-                    # All spatial constraints in this dataset are diagonal
-                    # (NE, NW, SE, SW), so no object can be strictly
-                    # north/south/east/west of another. The answer is
-                    # always 0.
-                    model_count = parse_model_count_from_answer(
-                        recent_text, self._counting_options
-                    )
-                    z3_count = 0
-
-                    logger.info(
-                        f"[SpatialMap Phase 2b] Cardinal count check: "
-                        f"model={model_count}, expected={z3_count}, "
-                        f"direction={direction}, reference={reference}"
-                    )
-
-                    if (
-                        model_count is not None
-                        and model_count != z3_count
-                    ):
-                        self._count_feedback_given = True
-                        count_corrections = self._count_feedback_blocks_count
-                        self._count_feedback_blocks_count = count_corrections + 1
-
-                        # Build direction-specific examples of what does NOT count
-                        if direction in ("north", "south"):
-                            diag_examples = "northeast or northwest"
-                        elif direction == "west":
-                            diag_examples = "northwest or southwest"
-                        else:  # east
-                            diag_examples = "northeast or southeast"
-
-                        feedback = (
-                            f"\n\n[VERIFIER FEEDBACK: Count mismatch!\n"
-                            f"  You answered {model_count} objects "
-                            f"'{direction}' of {reference}, but this "
-                            f"count is incorrect.\n"
-                            f"  IMPORTANT: '{direction}' is a strict "
-                            f"cardinal direction — it means ONLY "
-                            f"exactly {direction}, NOT {diag_examples}."
-                            f"\n"
-                            f"  An object that is {diag_examples.split(' or ')[0]} of "
-                            f"{reference} is NOT {direction} of "
-                            f"{reference}.\n"
-                            f"  Re-examine each object: is it described "
-                            f"as being strictly '{direction} of' "
-                            f"{reference}, or is the relationship "
-                            f"actually a diagonal direction like "
-                            f"{diag_examples}? Only count objects that "
-                            f"are strictly {direction}.]\n\n"
-                            f">>> STEP 3: ANSWER\n"
-                        )
-
-                        logger.info(
-                            f"[SpatialMap Phase 2b] Cardinal count "
-                            f"mismatch: model={model_count}, "
-                            f"expected=0. Injecting feedback "
-                            f"(attempt={'1st' if not self._count_feedback_given else '2nd'})."
-                        )
-                        if not event.is_set():
-                            event_info["generated_text"] = step
-                            event_info["feedback"] = feedback
-                            event_info["correction_index"] = token_index
-                            event_info["errors"] = [
-                                f"Cardinal count mismatch: expected 0, "
-                                f"got {model_count}"
-                            ]
-                            event_info["phase"] = "standard_verify"
-                            event.set()
-                        return step, feedback
-
-                else:
-                    # --- Diagonal: use Z3 range check ---
-                    if self._diag_count_feedback_count < self._max_final_answer_retries:
-                        model_count = parse_model_count_from_answer(
-                            recent_text, self._counting_options
-                        )
-                        count_range = get_possible_count_range(
-                            self.z3_solver, reference, direction
-                        )
-
-                        if (
-                            model_count is not None
-                            and count_range is not None
-                        ):
-                            min_c, max_c = count_range
-                            logger.info(
-                                f"[SpatialMap Phase 2b] Diagonal count "
-                                f"check: model={model_count}, "
-                                f"range=[{min_c}, {max_c}], "
-                                f"direction={direction}, "
-                                f"reference={reference}"
-                            )
-
-                            if not (min_c <= model_count <= max_c):
-                                self._diag_count_feedback_count += 1
-                                # Find valid MCQ options
-                                valid_opts = []
-                                for opt, val in (
-                                    self._counting_options.items()
-                                ):
-                                    try:
-                                        v = int(val)
-                                        if min_c <= v <= max_c:
-                                            valid_opts.append(
-                                                (opt, v)
-                                            )
-                                    except (ValueError, TypeError):
-                                        pass
-
-                                if len(valid_opts) == 1:
-                                    feedback = (
-                                        f"\n\n[VERIFIER FEEDBACK: "
-                                        f"Count error!\n"
-                                        f"  {model_count} objects "
-                                        f"'{direction}' of {reference}"
-                                        f" is impossible.\n"
-                                        f"  The valid count is "
-                                        f"{valid_opts[0][1]}.\n"
-                                        f"  Please select option "
-                                        f"{valid_opts[0][0]}.]\n\n"
-                                        f">>> STEP 3: ANSWER\n"
-                                    )
-                                else:
-                                    feedback = (
-                                        f"\n\n[VERIFIER FEEDBACK: "
-                                        f"Count error!\n"
-                                        f"  {model_count} objects "
-                                        f"'{direction}' of {reference}"
-                                        f" is impossible.\n"
-                                        f"  The possible count range "
-                                        f"is [{min_c}, {max_c}].\n"
-                                        f"  Please reconsider and "
-                                        f"choose the correct "
-                                        f"option.]\n\n"
-                                        f">>> STEP 3: ANSWER\n"
-                                    )
-
-                                if not event.is_set():
-                                    event_info["generated_text"] = step
-                                    event_info["feedback"] = feedback
-                                    event_info["correction_index"] = (
-                                        token_index
-                                    )
-                                    event_info["errors"] = [
-                                        f"Diagonal count "
-                                        f"{model_count} outside "
-                                        f"range [{min_c}, {max_c}]"
-                                    ]
-                                    event_info["phase"] = (
-                                        "standard_verify"
-                                    )
-                                    event.set()
-                                return step, feedback
-
-            logger.info(
-                f"[SpatialMap Phase 2b] Boxed answer found. Stopping."
-            )
-            if not event.is_set():
-                event_info["generated_text"] = step
-                event_info["feedback"] = ""
-                event_info["correction_index"] = token_index
-                event_info["phase"] = "final_answer_correct"
-                event.set()
-            return step, None
-
-        # All claims valid, no boxed yet
-        return step, None
-
-    # ------------------------------------------------------------------
-    #  fix
-    # ------------------------------------------------------------------
-    async def fix(self, generated_text: str, event_info: dict, fix_method=None):
-        """Apply the appropriate fix depending on the phase."""
-        phase = event_info.get("phase", "standard_verify")
-
-        if phase == "rollback_to_thinking":
-            base_text = event_info["generated_text"]
-            result = base_text.rstrip() + event_info["feedback"]
-            logger.info(
-                f"[SpatialMap fix] Phase: rollback_to_thinking\n"
-                f"  -> Appended error feedback into <think> trace.\n"
-                f"  -> Think-phase corrections: {self._think_phase_corrections}/{self.max_corrections}"
-            )
-            return result
-
-        if phase == "inject_structured_prompt":
-            logger.info(
-                "[SpatialMap fix] Phase: inject_structured_prompt\n"
-                "  -> Appending structured step format after </think>."
-            )
-            return event_info["generated_text"] + event_info["feedback"]
-
-        if phase == "final_answer_correct":
-            logger.info(
-                "[SpatialMap fix] Phase: final_answer_correct\n"
-                "  -> Stopping generation."
-            )
-            return event_info["generated_text"]
-
-        # standard_verify
-        errors = event_info.get("errors", [])
-        error_summary = "; ".join(errors) if errors else "unknown"
-        logger.info(
-            f"[SpatialMap fix] Phase: standard_verify\n"
-            f"  -> Error: {error_summary}\n"
-            f"  -> Appending [VERIFIER FEEDBACK] so model retries."
-        )
-        return event_info["generated_text"] + event_info["feedback"]
diff --git a/interwhen/monitors/thinkingPhaseVerifierGame24.py b/interwhen/monitors/thinkingPhaseVerifierGame24.py
new file mode 100644
index 00000000..13b7af37
--- /dev/null
+++ b/interwhen/monitors/thinkingPhaseVerifierGame24.py
@@ -0,0 +1,608 @@
+"""
+Thinking-phase verifier for Game of 24.
+
+Verifies expressions by forking a side-stream during the thinking phase
+to ask the model about its current progress.
+
+Workflow
+--------
+A) **DURING the thinking phase** (inside ``<think>...</think>``):
+   After a warmup period, every *N* newlines in the thinking trace:
+   1. Inject ``</think> The expression that I found till now is {`` and
+      stream ~20 tokens to extract the expression the model outputs.
+   2. Verify the expression against Game-of-24 rules.
+   3. If **wrong** -> inject error feedback into thinking trace.
+   4. If **correct AND complete** -> inject early-stop message + ``</think>``.
+   5. If **correct AND partial** -> no feedback, let model keep thinking.
+
+B) **AFTER a natural ``</think>``**:
+   Inject the expression extraction prompt so the model outputs its
+   answer expression, then verify in the same way.
+"""
+
+import re
+import json
+import logging
+from typing import List, Tuple, Optional
+from copy import deepcopy
+
+import httpx
+
+from .base import VerifyMonitor
+from ._common import find_complete_boxed
+from ..utils.game24_verifier import (
+    can_reach_24, is_close, format_number, safe_eval,
+)
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+#  Prompts injected to elicit an expression from the model.
+# ---------------------------------------------------------------------------
+
+# Injected mid-thinking (side-stream): closes the think block and opens "{" for the expression
+THINKING_PHASE_EXPRESSION_PROMPT = (
+    "</think>\nThe expression that I found till now is {"
+)
+
+# Injected after a natural </think>; stops at "\boxed" so the model continues with "{expr}"
+FINAL_EXPRESSION_PROMPT = (
+    "\nThe final expression is \\boxed"
+)
+
+
+# ---------------------------------------------------------------------------
+#  Expression verification helpers
+# ---------------------------------------------------------------------------
+
+def _extract_numbers_from_expr(expr: str) -> List[float]:
+    """Return every numeric literal in *expr*, in order; integral values come back as ``int``."""
+    numbers = re.findall(r'\d+\.?\d*', expr)  # unsigned only; ".5" would be read as 5
+    return [int(float(n)) if float(n) == int(float(n)) else float(n) for n in numbers]
+
+
+def _normalize_number(n) -> float:
+    """Coerce *n* to ``float`` so ints and floats are compared on equal footing."""
+    return float(n)
+
+
+def verify_expression(expr_str: str, original_numbers: List[float]) -> Tuple[str, bool, List[str], Optional[List[float]]]:
+    """
+    Verify an expression against the Game of 24 rules.
+
+    Only digits, whitespace, ``.``, parentheses and ``+ - * /`` are accepted in *expr_str*.
+
+    Returns:
+        (status, is_valid, errors, unused_numbers_or_None)
+        - status: "complete" | "partial" | "error"
+        - is_valid: True if the expression is valid (no errors)
+        - errors: List of error messages
+        - unused_numbers: Numbers from original not used in expr (None if errors)
+    """
+    errors = []
+    fmt = format_number
+
+    used_numbers = _extract_numbers_from_expr(expr_str)
+    if not used_numbers:
+        errors.append(f"No numbers found in expression: {expr_str}")
+        return "error", False, errors, None
+
+    original_copy = [_normalize_number(n) for n in original_numbers]
+    matched_indices = set()
+    for used_n in used_numbers:
+        used_norm = _normalize_number(used_n)
+        for i, orig_n in enumerate(original_copy):
+            if i not in matched_indices and is_close(used_norm, orig_n):
+                matched_indices.add(i)
+                break
+        else:
+            errors.append(
+                f"Number {fmt(used_norm)} in expression is not available in "
+                f"original numbers {[fmt(n) for n in original_numbers]} "
+                f"(or was already used)"
+            )
+    if errors:
+        return "error", False, errors, None
+
+    unused = [n for i, n in enumerate(original_copy) if i not in matched_indices]
+
+    # NOTE: eval() runs on model-generated text, so whitelist the characters
+    # first; ``**`` (e.g. 2**3*3 == 24) would otherwise pass as a valid move.
+    try:
+        if re.search(r"[^\d\s.+\-*/()]|\*\*|//", expr_str):
+            raise ValueError("only + - * / and parentheses are allowed")
+        value = float(eval(expr_str, {"__builtins__": None}, {}))
+    except Exception as e:
+        errors.append(f"Cannot evaluate expression '{expr_str}': {e}")
+        return "error", False, errors, None
+
+    if not unused:
+        if not is_close(value, 24):
+            errors.append(
+                f"Expression '{expr_str}' evaluates to {fmt(value)}, not 24."
+            )
+            return "error", False, errors, None
+        return "complete", True, [], []
+
+    remaining = [value] + unused
+    can_reach, _ = can_reach_24(remaining)
+    if not can_reach:
+        remaining_str = [fmt(n) for n in remaining]
+        errors.append(
+            f"Expression '{expr_str}' evaluates to {fmt(value)}. "
+            f"Remaining numbers (including result) are {remaining_str}. "
+            f"Cannot reach 24 from these numbers. This is a dead end."
+        )
+        return "error", False, errors, None
+    return "partial", True, [], unused
+
+
+# ---------------------------------------------------------------------------
+#  Monitor
+# ---------------------------------------------------------------------------
+
+class ThinkingPhaseStepVerifierGame24Monitor(VerifyMonitor):
+    """
+    Monitor that verifies Game-of-24 expressions during and after thinking.
+
+    During thinking: every N newlines (after warmup) -> fork a
+        side-stream asking for the current expression, verify it, and
+        give appropriate feedback.
+
+    After natural ``</think>``: inject expression prompt, verify the
+        final answer.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        original_numbers: List[int],
+        llm_server: dict,
+        prompt: str,
+        newline_threshold: int = 15,
+        max_corrections: int = 5,
+        answer_start_token: str = "</think>",
+        async_execution: bool = True,
+        warmup_newlines: int = 0,
+    ) -> None:
+        super().__init__(name)
+        self.original_numbers = [float(x) for x in original_numbers]  # kept as floats
+        self.llm_server = llm_server
+        self.prompt = prompt
+        self.newline_threshold = newline_threshold  # presumably the "N" in "every N newlines" (class docstring)
+        self.max_corrections = max_corrections
+        self.answer_start_token = answer_start_token  # its presence marks the end of the thinking phase
+        self.async_execution = async_execution
+        self.warmup_newlines = warmup_newlines
+
+        # ---- state ----
+        self._think_phase_corrections: int = 0
+        self._verified_expression = None  # set by Phase 1 early-stop
+
+    # ------------------------------------------------------------------
+    #  helpers
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _fmt(n: float) -> str:
+        if abs(n - round(n)) < 1e-9:  # integral within tolerance: render without a decimal point
+            return str(int(round(n)))
+        return f"{n:.4f}".rstrip('0').rstrip('.')  # otherwise 4 decimals with trailing zeros trimmed
+
+    def _count_feedback_blocks(self, text: str) -> int:
+        return len(re.findall(r'\[VERIFIER FEEDBACK[^\]]*\]', text))  # one hit per "[VERIFIER FEEDBACK ...]" block
+
+    def _is_in_thinking_phase(self, generated_text: str) -> bool:
+        return self.answer_start_token not in generated_text  # True until the answer-start token shows up
+
+    @staticmethod
+    def _extract_braced_expression(text: str) -> Optional[str]:
+        """Return the cleaned content of the first balanced ``{...}`` group in *text*.
+
+        ``None`` if there is no ``{``, the braces never balance, or the group is empty.
+        """
+        start = text.find('{')
+        if start == -1:
+            return None
+        brace_count = 0
+        end = start
+        while end < len(text):  # scan forward, tracking nesting depth
+            if text[end] == '{':
+                brace_count += 1
+            elif text[end] == '}':
+                brace_count -= 1
+                if brace_count == 0:
+                    break
+            end += 1
+        if brace_count != 0:  # reached the end of text with braces still open
+            return None
+        expr = text[start + 1:end].strip()
+        if not expr:
+            return None
+        # Translate LaTeX operators to Python and drop spacing/sizing macros
+        expr = expr.replace(r'\times', '*').replace(r'\cdot', '*').replace(r'\div', '/')
+        expr = expr.replace(r'\,', '').replace(r'\ ', '')
+        expr = expr.replace(r'\left', '').replace(r'\right', '')
+        # Replace Unicode math operators (QwQ frequently uses these)
+        expr = expr.replace('\u00d7', '*').replace('\u00f7', '/').replace('\u2212', '-')
+        expr = expr.replace('\u2013', '-').replace('\u2014', '-')  # en-dash, em-dash
+        frac_pattern = r"\\frac\{([^{}]+)\}\{([^{}]+)\}"  # matches only brace-free (innermost) \frac{a}{b}
+        while re.search(frac_pattern, expr):  # repeat so nested fractions unwrap inside-out
+            expr = re.sub(frac_pattern, r"(\1/\2)", expr)
+        # Make implicit multiplication explicit: ")(" , ")3" and "3(" each gain a "*"
+        expr = re.sub(r'\)\s*\(', ')*(', expr)
+        expr = re.sub(r'\)\s*(\d)', r')*\1', expr)
+        expr = re.sub(r'(\d)\s*\(', r'\1*(', expr)
+        return expr
+
+    @staticmethod
+    def _extract_boxed_expression(text: str) -> Optional[str]:
+        """Extract and normalise the expression of the last \\boxed{...} in text.
+
+        Nested braces inside the box are matched.  The contents are
+        normalised into plain arithmetic: LaTeX and Unicode operators become
+        ``*``, ``/`` and ``-``, LaTeX spacing and ``\\left``/``\\right`` are
+        dropped, ``\\frac{a}{b}`` becomes ``((a)/(b))`` and implicit
+        multiplication gets an explicit ``*``.
+
+        Returns:
+            The normalised expression (an empty string for an empty box), or
+            ``None`` when *text* has no ``\\boxed{`` or the last one is never
+            closed.
+        """
+        openers = list(re.finditer(r"\\boxed\{", text))
+        if not openers:
+            return None
+
+        start = openers[-1].end()
+        depth = 1
+        end = start
+        while end < len(text) and depth > 0:
+            ch = text[end]
+            if ch == '{':
+                depth += 1
+            elif ch == '}':
+                depth -= 1
+            end += 1
+        if depth != 0:
+            # The box is still open (text ran out first).  Slicing here
+            # would drop the last character and yield a bogus expression.
+            return None
+
+        # `end` is one past the closing brace, so the contents stop at end - 1.
+        expr = text[start:end - 1].strip()
+        expr = expr.replace(r'\times', '*').replace(r'\cdot', '*').replace(r'\div', '/')
+        expr = expr.replace(r'\,', '').replace(r'\ ', '')
+        expr = expr.replace(r'\left', '').replace(r'\right', '')
+        expr = expr.replace('\u00d7', '*').replace('\u00f7', '/').replace('\u2212', '-')
+        expr = expr.replace('\u2013', '-').replace('\u2014', '-')
+
+        # Numerator and denominator are parenthesised individually:
+        # \frac{8}{3-1} must mean 8/(3-1), not 8/3-1.
+        frac_pattern = r"\\frac\{([^{}]+)\}\{([^{}]+)\}"
+        while re.search(frac_pattern, expr):
+            expr = re.sub(frac_pattern, r"((\1)/(\2))", expr)
+
+        # Make implicit multiplication explicit.
+        expr = re.sub(r'\)\s*\(', ')*(', expr)
+        expr = re.sub(r'\)\s*(\d)', r')*\1', expr)
+        expr = re.sub(r'(\d)\s*\(', r'\1*(', expr)
+        return expr
+
+    # ------------------------------------------------------------------
+    #  _side_stream_expression  (streams ~20 tokens to get {expr})
+    # ------------------------------------------------------------------
+    async def _side_stream_expression(self, text_so_far: str, max_new_tokens: int = 20) -> Optional[str]:
+        """
+        Send ``prompt + text_so_far`` to vLLM, stream at most
+        *max_new_tokens* tokens, and try to extract an expression from
+        the output that appears inside ``{ }``.
+
+        The request reuses a deep copy of ``self.llm_server["payload"]`` with
+        ``prompt`` and ``max_tokens`` overridden and ``logprobs`` removed.
+        The streamed text is treated as the *inside* of a brace group: a
+        ``{`` is prepended before extraction (presumably because the caller's
+        prompt ends with an opening brace -- confirm against
+        ``THINKING_PHASE_EXPRESSION_PROMPT``).
+
+        Streaming stops as soon as that outer group is closed, so nested
+        braces such as ``\\frac{1}{2}`` do not cut the expression short.
+
+        Returns:
+            The normalised expression, or ``None`` if no complete,
+            non-empty ``{...}`` group was produced.
+        """
+        fmt = self._fmt
+        nums_str = ", ".join(fmt(n) for n in self.original_numbers)
+        logger.info(
+            f"[Side-stream] Starting expression extraction\n"
+            f"  Original numbers : [{nums_str}]\n"
+            f"  Max new tokens   : {max_new_tokens}"
+        )
+
+        # Work on a copy so the shared server payload is never mutated.
+        payload = deepcopy(self.llm_server["payload"])
+        payload["prompt"] = self.prompt + text_so_far
+        payload["max_tokens"] = max_new_tokens
+        payload.pop("logprobs", None)
+
+        generated = ""
+        # Nesting depth relative to the implicit opening '{' that is
+        # re-added below; the outer group is closed when it reaches 0.
+        depth = 1
+
+        async with httpx.AsyncClient(timeout=None) as client:
+            async with client.stream(
+                "POST",
+                self.llm_server["url"],
+                headers=self.llm_server["headers"],
+                json=payload,
+            ) as response:
+                async for line in response.aiter_lines():
+                    if line.startswith("data: "):
+                        data = line[len("data: "):].strip()
+                        if data == "[DONE]":
+                            break
+                        chunk = json.loads(data)["choices"][0]["text"]
+                        generated += chunk
+                        logger.debug(f"[Side-stream] chunk: {chunk!r}")
+
+                        # Stopping at the first '}' would truncate any
+                        # expression containing nested braces, so track
+                        # the depth and stop only when the outer group ends.
+                        for ch in chunk:
+                            if ch == '{':
+                                depth += 1
+                            elif ch == '}':
+                                depth -= 1
+                                if depth == 0:
+                                    break
+                        if depth == 0:
+                            break
+
+        full_text = "{" + generated
+        expr = self._extract_braced_expression(full_text)
+        if expr:
+            logger.info(f"[Side-stream] Extracted expression: {expr}")
+        else:
+            logger.info(
+                f"[Side-stream] No expression found in side-stream "
+                f"(generated {len(generated)} chars: {generated!r})"
+            )
+        return expr
+
+    # ------------------------------------------------------------------
+    #  step_extractor
+    # ------------------------------------------------------------------
+    def step_extractor(self, chunk: str, generated_text: str):
+        """Decide whether the text generated so far should be verified now.
+
+        Phase 1 (no ``answer_start_token`` yet): triggers when the text ends
+        with a newline and the number of newlines past ``warmup_newlines``
+        is a multiple of ``newline_threshold``, as long as the thinking-phase
+        correction budget is not used up.
+
+        Phase 2 (``answer_start_token`` present): triggers immediately if
+        the stripped ``FINAL_EXPRESSION_PROMPT`` is not in the text yet;
+        otherwise triggers once ``find_complete_boxed`` reports a result in
+        the text that follows the last ``[VERIFIER FEEDBACK ...]`` block.
+
+        Returns:
+            ``(True, generated_text)`` to trigger verification, otherwise
+            ``(False, None)``.  *chunk* is not inspected.
+        """
+        # ===== PHASE 1: still inside <think> =====
+        if self._is_in_thinking_phase(generated_text):
+            if self._think_phase_corrections >= self.max_corrections:
+                return False, None
+
+            newline_count = generated_text.count('\n')
+            since_warmup = newline_count - self.warmup_newlines
+            if since_warmup < 0:
+                return False, None
+            if not generated_text.endswith('\n'):
+                return False, None
+            if since_warmup % self.newline_threshold != 0:
+                return False, None
+
+            logger.info(
+                f"[step_extractor] Phase 1 trigger: \\n count={newline_count} "
+                f"(warmup={self.warmup_newlines}, past_warmup={since_warmup}, "
+                f"threshold={self.newline_threshold})"
+            )
+            return True, generated_text
+
+        # ===== PHASE 2: after </think> =====
+
+        # 2a: </think> present but we haven't injected the expression prompt yet
+        prompt_injected = FINAL_EXPRESSION_PROMPT.strip() in generated_text
+        if not prompt_injected:
+            logger.info(
+                "[step_extractor] Phase 2a: </think> detected, "
+                "expression prompt not yet injected."
+            )
+            return True, generated_text
+
+        # 2b: trigger once we see a complete \boxed{...}
+        token = self.answer_start_token
+        answer_part = generated_text[generated_text.index(token) + len(token):]
+
+        # Only look at what follows the most recent feedback block.
+        feedback_hits = list(
+            re.finditer(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', answer_part, re.DOTALL)
+        )
+        tail_start = feedback_hits[-1].end() if feedback_hits else 0
+        tail = answer_part[tail_start:]
+
+        if find_complete_boxed(tail):
+            return True, generated_text
+        return False, None
+
+    # ------------------------------------------------------------------
+    #  verify
+    # ------------------------------------------------------------------
+    async def verify(self, step: str, token_index: int, event, event_info):
+        """Check the text produced so far and, if needed, signal an intervention.
+
+        Which check runs depends on what *step* contains:
+
+        * Case 1 - ``answer_start_token`` is absent: a side-stream asks the
+          model for its current expression, which is checked with
+          ``verify_expression``.
+        * Case 2a - ``answer_start_token`` is present but the stripped
+          ``FINAL_EXPRESSION_PROMPT`` is not: the prompt is queued for
+          injection.
+        * Case 2b - both are present: the last ``\\boxed{...}`` after the most
+          recent feedback block is verified as the final answer.
+
+        When an intervention is needed and *event* is not already set, the
+        details are written into *event_info* (``generated_text``,
+        ``feedback``, ``correction_index``, ``phase`` and, depending on the
+        case, ``errors`` / ``verified_expression``) and *event* is set.
+
+        Args:
+            step: Text to verify (``step_extractor`` returns the full
+                generated text when it triggers).
+            token_index: Stored as ``event_info["correction_index"]``.
+            event: Object exposing ``is_set()`` / ``set()``.
+            event_info: Dict that receives the intervention details.
+
+        Returns:
+            ``(step, feedback)`` where ``feedback`` is the text to inject, or
+            ``None`` when no text is to be injected.
+        """
+        # ==================================================================
+        # CASE 1: Thinking phase -- side-stream expression verification
+        # ==================================================================
+        if self.answer_start_token not in step:
+            total_dn = step.count('\n')
+            logger.info(
+                f"[Phase 1] Thinking-phase verification triggered\n"
+                f"  \\n count    : {total_dn}\n"
+                f"  Thinking len : {len(step)} chars"
+            )
+
+            text_with_prompt = step + "\n" + THINKING_PHASE_EXPRESSION_PROMPT
+
+            expr_str = await self._side_stream_expression(text_with_prompt, max_new_tokens=20)
+
+            if expr_str is None:
+                logger.info(
+                    "[Phase 1] No expression extracted from side-stream. "
+                    "Letting model continue thinking."
+                )
+                return step, None
+
+            status, is_valid, errors, unused = verify_expression(
+                expr_str, self.original_numbers
+            )
+
+            if not is_valid:
+                error_summary = "; ".join(errors)
+                # Counted even when the event is already set; step_extractor
+                # stops triggering Phase 1 once this reaches max_corrections.
+                self._think_phase_corrections += 1
+                logger.info(
+                    f"[Phase 1] INVALID expression '{expr_str}'\n"
+                    f"  Error(s) : {error_summary}\n"
+                    f"  Action   : Inject feedback into thinking trace\n"
+                    f"  Corrections: {self._think_phase_corrections}/{self.max_corrections}"
+                )
+                # Feedback is phrased in the first person so it reads as part
+                # of the model's own thinking trace.
+                thinking_feedback = (
+                    f"\n\nWait, the expression {expr_str} does not work. "
+                    f"{error_summary} "
+                    f"I must NOT reuse {expr_str} or any expression I have already tried. "
+                    f"Let me try a completely different combination of "
+                    f"operations and grouping of numbers.\n"
+                )
+                if not event.is_set():
+                    event_info["generated_text"] = step
+                    event_info["feedback"] = thinking_feedback
+                    event_info["correction_index"] = token_index
+                    event_info["errors"] = errors
+                    event_info["phase"] = "rollback_to_thinking"
+                    event.set()
+                return step, thinking_feedback
+
+            elif status == "complete":
+                self._verified_expression = expr_str
+                logger.info(
+                    f"[Phase 1] VALID COMPLETE expression '{expr_str}' == 24\n"
+                    f"  Action: Inject early-stop message and transition to answer."
+                )
+                # The message ends with answer_start_token, which closes the
+                # thinking phase for every later check on the text.
+                early_stop_msg = (
+                    f"\n\nWait, the expression {expr_str} has been verified "
+                    f"to equal 24 using all the given numbers. This will be "
+                    f"my final answer.\n{self.answer_start_token}\n"
+                )
+                if not event.is_set():
+                    event_info["generated_text"] = step
+                    event_info["feedback"] = early_stop_msg
+                    event_info["correction_index"] = token_index
+                    event_info["phase"] = "early_stop_answer"
+                    event_info["verified_expression"] = expr_str
+                    event.set()
+                return step, early_stop_msg
+
+            else:
+                # Valid but not "complete": nothing is injected.
+                unused_str = (
+                    "[" + ", ".join(self._fmt(n) for n in unused) + "]"
+                    if unused else "[]"
+                )
+                logger.info(
+                    f"[Phase 1] VALID PARTIAL expression '{expr_str}'\n"
+                    f"  Unused numbers: {unused_str}\n"
+                    f"  Action: No error, let model keep thinking."
+                )
+                return step, None
+
+        # ==================================================================
+        # CASE 2a: </think> present but expression prompt not yet injected
+        # ==================================================================
+        if FINAL_EXPRESSION_PROMPT.strip() not in step:
+            logger.info(
+                "[Phase 2a] Natural </think> detected. "
+                "Injecting expression extraction prompt."
+            )
+            prompt_text = FINAL_EXPRESSION_PROMPT
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = prompt_text
+                event_info["correction_index"] = token_index
+                event_info["phase"] = "inject_expression_prompt"
+                event.set()
+            return step, prompt_text
+
+        # ==================================================================
+        # CASE 2b: After </think> + expression prompt -- verify final answer
+        # ==================================================================
+
+        # The answer-phase budget is tracked by counting the feedback blocks
+        # already present in the text; once exhausted a fixed answer is forced.
+        num_corrections = self._count_feedback_blocks(step)
+        if num_corrections >= self.max_corrections:
+            fb = "\nthe answer is \\boxed{no solution}"
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = fb
+                event_info["correction_index"] = token_index
+                event_info["errors"] = ["Max corrections reached"]
+                event_info["phase"] = "standard_verify"
+                event.set()
+            return step, fb
+
+        # Only the text after the most recent feedback block is inspected
+        # (presumably so an already-rejected \boxed answer is not re-verified).
+        think_end_pos = step.find(self.answer_start_token) + len(self.answer_start_token)
+        text_after_think = step[think_end_pos:]
+        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
+        last_feedback_end = 0
+        for match in feedback_pattern.finditer(text_after_think):
+            last_feedback_end = match.end()
+        recent_text = text_after_think[last_feedback_end:]
+
+        expr_str = self._extract_boxed_expression(recent_text)
+        if expr_str is not None:
+            logger.info(f"[Phase 2b] Extracted expression from \\boxed: '{expr_str}'")
+
+        if expr_str is None:
+            return step, None
+
+        status, is_valid, errors, unused = verify_expression(
+            expr_str, self.original_numbers
+        )
+
+        if is_valid and status == "complete":
+            logger.info(f"[Phase 2b] Final expression '{expr_str}' is correct (= 24)")
+            # The event is set with empty feedback, but None is returned:
+            # there is no text to inject for a correct answer.
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = ""
+                event_info["correction_index"] = token_index
+                event_info["phase"] = "final_answer_correct"
+                event_info["verified_expression"] = expr_str
+                event.set()
+            return step, None
+
+        # A partial expression is acceptable while thinking but is treated
+        # as an error here, so the verifier's error list is replaced.
+        if is_valid and status == "partial":
+            used_numbers = _extract_numbers_from_expr(expr_str)
+            errors = [
+                f"Expression '{expr_str}' only uses {len(used_numbers)} of "
+                f"{len(self.original_numbers)} numbers. After </think>, "
+                f"a COMPLETE expression using ALL numbers is required."
+            ]
+
+        if not errors:
+            errors = [f"Expression '{expr_str}' is not a valid solution."]
+
+        error_summary = "; ".join(errors)
+        logger.info(f"[Phase 2b] Final expression FAILED: {error_summary}")
+
+        # Show whole-valued numbers without a trailing ".0".
+        orig_display = [int(n) if n == int(n) else n for n in self.original_numbers]
+        nums_str = ", ".join(str(n) for n in orig_display)
+        # This bracketed block is what _count_feedback_blocks counts above.
+        feedback = (
+            f"\n[VERIFIER FEEDBACK:\n"
+            f"  The expression {expr_str} is incorrect. {error_summary}\n"
+            f"  Do NOT reuse {expr_str} or any previously tried expression.\n"
+            f"  Try a completely different approach. Use ALL four numbers "
+            f"{nums_str} exactly once, "
+            f"evaluating to 24. Wrap in \\boxed{{}}. ]\n"
+        )
+        if not event.is_set():
+            event_info["generated_text"] = step
+            event_info["feedback"] = feedback
+            event_info["correction_index"] = token_index
+            event_info["errors"] = errors
+            event_info["phase"] = "standard_verify"
+            event.set()
+        return step, feedback
+
+    # ------------------------------------------------------------------
+    #  fix
+    # ------------------------------------------------------------------
+    async def fix(self, generated_text: str, event_info: dict, fix_method=None):
+        """Build the text that results from applying a recorded intervention.
+
+        Everything is read from *event_info*; the *generated_text* and
+        *fix_method* arguments are not consulted.  The outcome depends on
+        ``event_info["phase"]`` (default ``"standard_verify"``):
+
+        * ``rollback_to_thinking`` / ``early_stop_answer``: the recorded text
+          with trailing whitespace removed, followed by the feedback.
+        * ``final_answer_correct``: the recorded text unchanged.
+        * ``inject_expression_prompt`` and anything else: the recorded text
+          followed by the feedback.
+        """
+        phase = event_info.get("phase", "standard_verify")
+
+        if phase in ("rollback_to_thinking", "early_stop_answer"):
+            # Both thinking-phase interventions trim trailing whitespace
+            # before the injected message is appended.
+            patched = event_info["generated_text"].rstrip() + event_info["feedback"]
+            if phase == "rollback_to_thinking":
+                logger.info(
+                    f"[fix] Phase: rollback_to_thinking\n"
+                    f"  -> Appended error feedback into <think> trace.\n"
+                    f"  -> Think-phase corrections: {self._think_phase_corrections}/{self.max_corrections}"
+                )
+            else:
+                logger.info(
+                    f"[fix] Phase: early_stop_answer\n"
+                    f"  -> Verified expression passed. Injecting early-stop + </think>.\n"
+                    f"  -> Model will now generate the final answer."
+                )
+            return patched
+
+        if phase == "final_answer_correct":
+            verified = event_info.get("verified_expression", "?")
+            logger.info(
+                f"[fix] Phase: final_answer_correct\n"
+                f"  -> Final expression '{verified}' verified correct. Stopping generation."
+            )
+            return event_info["generated_text"]
+
+        if phase == "inject_expression_prompt":
+            logger.info(
+                f"[fix] Phase: inject_expression_prompt\n"
+                f"  -> Natural </think> detected.\n"
+                f"  -> Appending expression extraction prompt."
+            )
+        else:
+            # standard_verify (also the fallback for unknown phases)
+            failures = event_info.get("errors", [])
+            reason = "; ".join(failures) if failures else "unknown"
+            logger.info(
+                f"[fix] Phase: standard_verify\n"
+                f"  -> Expression failed: {reason}\n"
+                f"  -> Appending [VERIFIER FEEDBACK] so model retries."
+            )
+        return event_info["generated_text"] + event_info["feedback"]
diff --git a/interwhen/monitors/thinkingPhaseVerifierMaze.py b/interwhen/monitors/thinkingPhaseVerifierMaze.py
new file mode 100644
index 00000000..1664a273
--- /dev/null
+++ b/interwhen/monitors/thinkingPhaseVerifierMaze.py
@@ -0,0 +1,878 @@
+"""
+Thinking-phase verifier for Maze tasks.
+
+Verifies maze path-tracing by forking a side-stream during the
+thinking phase to ask the model about its current traced path.
+
+Workflow
+--------
+A) **DURING the thinking phase** (inside ``<think>...</think>``):
+   After a warmup period, every *N* newlines in the thinking trace:
+   1. Inject a first-person prompt to extract the traced path steps.
+   2. Parse and verify each step against the maze grid.
+   3. If **errors** -> inject feedback into thinking trace.
+   4. If **path reaches E** -> inject early-stop + ``</think>`` +
+      structured format.
+   5. If **partial but correct** -> no feedback, keep thinking.
+
+B) **AFTER ``</think>``**:
+   Phase 2a: Inject structured step format template.
+   Phase 2b: Verify each step as the model fills in the template.
+   Once ``\\boxed{}`` appears, stop generation.
+"""
+
+import re
+import json
+import logging
+from typing import Tuple, Optional
+from copy import deepcopy
+
+import httpx
+
+from .base import VerifyMonitor
+from ._common import find_complete_boxed
+from ..utils.maze_verifier import (
+    Direction, parse_direction, get_expected_turn_type,
+    parse_maze_from_prompt, parse_maze_step, verify_maze_step,
+    verify_locate_section, format_maze_feedback, format_locate_feedback,
+    DIRECTION_DELTAS, compute_relative_direction,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# =====================================================================
+#  Maze Thinking-Phase Prompts
+# =====================================================================
+
+
+def _build_maze_format_block(question_type: str) -> str:
+    """
+    Return the ``<format>...</format>`` template describing the structured
+    output for *question_type*.
+
+    ``"relative_position"`` gets a LOCATE / COMPARE / FINAL ANSWER layout.
+    Every other value gets a LOCATE / STEP / FINAL ANSWER layout whose
+    running-count line also carries ``Total=0`` when *question_type* is
+    ``"total_turns"``.  The returned text has no trailing newline.
+    """
+    locate_section = [
+        "<format>",
+        ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):",
+        "    S position: (row, col)",
+        "    E position: (row, col)",
+        "",
+    ]
+    answer_section = [
+        ">>> FINAL ANSWER:",
+        "    \\boxed{LETTER}",
+        "</format>",
+    ]
+
+    if question_type == "relative_position":
+        middle_section = [
+            ">>> COMPARE POSITIONS:",
+            "    Row comparison: E row (r) vs S row (r) → E is ABOVE/BELOW S",
+            "    Col comparison: E col (c) vs S col (c) → E is LEFT/RIGHT of S",
+            "",
+        ]
+    else:
+        running_count = "    Running count: Right=0, Left=0"
+        if question_type == "total_turns":
+            running_count += ", Total=0"
+        middle_section = [
+            ">>> STEP 1: Move DOWN from (r1, c1) to (r2, c2)",
+            "    Current position: (r2, c2)",
+            "    Previous direction: —",
+            "    Current direction: DOWN",
+            "    Turn type: STRAIGHT",
+            running_count,
+            "",
+            "[... continue for all steps until reaching E ...]",
+            "",
+        ]
+
+    return "\n".join(locate_section + middle_section + answer_section)
+
+
+def _build_maze_thinking_phase_prompt(question_type: str) -> str:
+    """
+    Build the side-stream prompt used during the thinking phase.
+
+    The text is phrased in the first person so it reads like part of the
+    model's own ``<think>`` trace.  It consists of a lead-in sentence, the
+    ``<format>`` block for *question_type*, and the LOCATE header line that
+    starts the template so the model continues by filling it in.
+    """
+    lead_in = (
+        "\n\nLet me output the current steps I have traced so far "
+        "through the maze in the following format:\n"
+    )
+    template = _build_maze_format_block(question_type)
+    first_header = ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):\n"
+    return "".join([lead_in, template, "\n", first_header])
+
+
+def _build_maze_structured_prompt(question_type: str) -> str:
+    """
+    Build the structured-format prompt that is injected after </think>.
+
+    Like the thinking-phase prompt it is written in the model's own voice:
+    a lead-in sentence, the ``<format>`` block for *question_type*, and the
+    LOCATE header line that opens the template, giving the model a layout
+    to fill in that can then be parsed and verified step by step.
+    """
+    lead_in = (
+        "\nLet me trace the step by step solution through the maze "
+        "in the following format:\n"
+    )
+    template = _build_maze_format_block(question_type)
+    first_header = ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):\n"
+    return "".join([lead_in, template, "\n", first_header])
+
+
+# =====================================================================
+#  ThinkingPhaseStepVerifierMazeMonitor
+# =====================================================================
+
+class ThinkingPhaseStepVerifierMazeMonitor(VerifyMonitor):
+    """
+    Monitor that verifies maze path-tracing during and after thinking.
+
+    **No meta-prompt required** — works with a plain user prompt containing
+    just the maze and question.  Structure is injected by this monitor
+    after ``</think>`` (natural or early-stop), exactly like Game24
+    injects its step format.
+
+    Phase 1 – During ``<think>...</think>``:
+        Every N newlines (after warmup), fork a side-stream that
+        injects ``</think>`` + a structured step prompt, stream ~300
+        tokens, parse and verify each step against the maze grid.
+
+    Phase 2a – ``</think>`` detected, structured prompt not yet injected:
+        Inject the structured step-by-step format template so the model
+        fills it in (LOCATE → STEPs → FINAL ANSWER → ``\\boxed{}``).
+
+    Phase 2b – Structured prompt injected, model is generating:
+        Verify each completed step as it appears.  Once ``\\boxed{}``
+        appears, signal completion.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        grid: list,
+        start_pos: tuple,
+        exit_pos: tuple,
+        llm_server: dict,
+        prompt: str,
+        question_type: str = "right_turns",
+        newline_threshold: int = 10,
+        max_corrections: int = 5,
+        answer_start_token: str = "</think>",
+        async_execution: bool = True,
+        warmup_newlines: int = 0,
+    ):
+        """Store the maze, endpoint and schedule settings and pre-build prompts.
+
+        Args:
+            name: Passed to the base-class constructor.
+            grid: Maze grid handed to ``verify_maze_step``.
+            start_pos: Position of S; path verification starts from it.
+            exit_pos: Position of E.
+            llm_server: Dict providing ``"url"``, ``"headers"`` and
+                ``"payload"`` for side-stream requests.
+            prompt: Original prompt; it is prefixed to side-stream requests
+                and searched for the relative-position question.
+            question_type: Selects the format template
+                (``"relative_position"``, ``"total_turns"``, or any other
+                value for the plain turn-count layout).
+            newline_threshold: Stored as given (presumably the newline
+                interval between thinking-phase checks -- confirm in
+                ``step_extractor``).
+            max_corrections: Stored as given (presumably the feedback
+                budget -- confirm in ``step_extractor`` / ``verify``).
+            answer_start_token: Token whose presence ends the thinking phase.
+            async_execution: Stored as given; not used in this constructor.
+            warmup_newlines: Stored as given (presumably newlines to skip
+                before the first check -- confirm in ``step_extractor``).
+        """
+        super().__init__(name)
+
+        # ---- maze definition ----
+        self.grid = grid
+        self.start_pos, self.exit_pos = start_pos, exit_pos
+        self.question_type = question_type
+
+        # ---- model endpoint ----
+        self.llm_server = llm_server
+        self.prompt = prompt
+        self.answer_start_token = answer_start_token
+
+        # ---- verification schedule ----
+        self.warmup_newlines = warmup_newlines
+        self.newline_threshold = newline_threshold
+        self.max_corrections = max_corrections
+        self.async_execution = async_execution
+
+        # ---- injected text ----
+        # Side-stream prompt for the thinking phase (in the LLM's own voice).
+        self._thinking_phase_prompt = _build_maze_thinking_phase_prompt(question_type)
+        # Structured template injected once </think> has appeared.
+        self._structured_prompt = _build_maze_structured_prompt(question_type)
+        # Marker line used to detect that the template was already injected.
+        self._structured_marker = ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):"
+
+        # ---- state ----
+        self._think_phase_corrections = 0
+        self._verified_path_complete = False  # True if path reaches E
+
+    # ------------------------------------------------------------------
+    #  helpers
+    # ------------------------------------------------------------------
+    def _count_feedback_blocks(self, text: str) -> int:
+        """Count the ``[VERIFIER FEEDBACK ...]`` blocks that occur in *text*."""
+        block_re = r'\[VERIFIER FEEDBACK[^\]]*\]'
+        return sum(1 for _ in re.finditer(block_re, text))
+
+    def _is_in_thinking_phase(self, generated_text: str) -> bool:
+        """Return True while ``answer_start_token`` has not appeared in the text."""
+        token_pos = generated_text.find(self.answer_start_token)
+        return token_pos == -1
+
+    def _structured_prompt_injected(self, generated_text: str) -> bool:
+        """Tell whether the structured marker occurs after the first </think>.
+
+        Returns False when ``answer_start_token`` is absent; otherwise
+        reports whether ``_structured_marker`` appears anywhere in the text
+        that follows the first occurrence of the token.
+        """
+        _, token, after_think = generated_text.partition(self.answer_start_token)
+        if not token:
+            return False
+        return self._structured_marker in after_think
+
+    @staticmethod
+    def detect_question_type(prompt: str) -> str:
+        """Guess the question type from the (case-insensitive) prompt text.
+
+        Rules, first match wins:
+
+        1. mentions "right turn" / "right-turn"  -> ``"right_turns"``
+        2. mentions "left turn" / "left-turn"    -> ``"total_turns"``
+        3. mentions "turn" together with "total" -> ``"total_turns"``
+        4. mentions "turn"                       -> ``"right_turns"``
+        5. otherwise                             -> ``"relative_position"``
+        """
+        text = prompt.lower()
+
+        def mentions(*needles: str) -> bool:
+            return any(needle in text for needle in needles)
+
+        if mentions("right turn", "right-turn"):
+            return "right_turns"
+        if mentions("left turn", "left-turn"):
+            return "total_turns"
+        if not mentions("turn"):
+            return "relative_position"
+        return "total_turns" if mentions("total") else "right_turns"
+
+    def _verify_relative_position_answer(self, boxed_answer: str) -> Tuple[bool, Optional[str]]:
+        """Verify a relative-position boxed answer (A=Yes / B=No).
+
+        Parses the question from ``self.prompt`` to determine the asked
+        direction, computes the true relative direction of E from S with
+        ``compute_relative_direction``, and checks whether the model's
+        Yes/No answer is correct.
+
+        The check is skipped (the answer is accepted) when *boxed_answer*
+        is neither ``A`` nor ``B``, when the question cannot be found in the
+        prompt, or when the asked direction is not one of the known phrases.
+
+        Returns ``(is_correct, feedback_or_None)``.
+        """
+        answer_map = {"A": "Yes", "B": "No"}
+        model_yn = answer_map.get(boxed_answer.strip().upper())
+        if model_yn is None:
+            return True, None
+
+        m = re.search(
+            r'Is the exit \(E\)\s+(.*?)\s+(?:of\s+)?the starting point \(S\)',
+            self.prompt, re.IGNORECASE,
+        )
+        if not m:
+            return True, None
+
+        # Keep only the direction phrase: drop anything after a comma.
+        asked_raw = m.group(1).strip().lower()
+        asked_raw = re.sub(r',.*', '', asked_raw).strip()
+
+        actual = compute_relative_direction(self.start_pos, self.exit_pos)
+
+        direction_keywords = {
+            "directly to the left":   {"west"},
+            "directly to the right":  {"east"},
+            "directly above":         {"north"},
+            "directly below":         {"south"},
+            "to the top left":        {"northwest"},
+            "to the top right":       {"northeast"},
+            "to the bottom left":     {"southwest"},
+            "to the bottom right":    {"southeast"},
+        }
+
+        expected_dirs = direction_keywords.get(asked_raw)
+        if expected_dirs is None:
+            return True, None
+
+        expected_yn = "Yes" if actual in expected_dirs else "No"
+
+        if model_yn == expected_yn:
+            return True, None
+
+        # Describe the direction the QUESTION asks about.  Using `actual`
+        # here is only right when the expected answer is "Yes"; otherwise it
+        # would tell the model the question asks about a different direction.
+        asked_direction = "/".join(sorted(expected_dirs))
+        feedback = (
+            f"\n\n[VERIFIER FEEDBACK for relative position:\n"
+            f"  ✗ Your answer {boxed_answer} ({model_yn}) is incorrect.\n"
+            f"  IMPORTANT: In this task, \"{asked_raw}\" means the GENERAL "
+            f"COMPASS DIRECTION, NOT immediate adjacency. It asks whether E "
+            f"is in the {asked_direction} direction from S, regardless of distance or "
+            f"walls between them.]\n\n"
+        )
+        return False, feedback
+
+    # ------------------------------------------------------------------
+    #  _parse_steps_from_text – parse structured steps from side-stream
+    # ------------------------------------------------------------------
+    def _parse_steps_from_text(self, text: str):
+        """
+        Find every structured maze step in *text* and parse it.
+
+        A step spans from its ``>>> STEP n: Move ... from (...) to (...)``
+        header through the end of its ``Running count: Right=.., Left=..``
+        line.  Each match is passed to ``parse_maze_step``; falsy parse
+        results are dropped.
+
+        Returns the parsed steps in order of appearance.
+        """
+        step_pattern = re.compile(
+            r'>>>\s*STEP\s+(\d+):\s*Move\s+\w+\s+from\s+\([^)]+\)\s+to\s+\([^)]+\).*?'
+            r'Running count:\s*Right\s*=\s*\d+\s*,\s*Left\s*=\s*\d+[^\n]*',
+            re.IGNORECASE | re.DOTALL
+        )
+        candidates = (
+            parse_maze_step(found.group(0))
+            for found in step_pattern.finditer(text)
+        )
+        return [parsed for parsed in candidates if parsed]
+
+    def _verify_all_steps(self, steps):
+        """
+        Replay parsed maze steps from the start position, checking each one
+        with ``verify_maze_step``.
+
+        Verification stops at the first invalid step.  The position,
+        direction and counts returned are those reached after the last step
+        that passed (the initial state if none did).
+
+        Returns:
+            (all_valid, first_error_step_num, errors, final_pos, final_dir,
+             right_count, left_count, total_count)
+        """
+        cur_pos = self.start_pos
+        cur_dir = Direction.NONE
+        rights = lefts = turns = 0
+
+        for step in steps:
+            ok, step_errors, state = verify_maze_step(
+                step=step,
+                grid=self.grid,
+                expected_from_pos=cur_pos,
+                prev_direction=cur_dir,
+                expected_right_count=rights,
+                expected_left_count=lefts,
+                expected_total_count=turns,
+            )
+
+            if ok:
+                # Advance the running state to what this step produced.
+                cur_pos = state['new_pos']
+                cur_dir = state['new_direction']
+                rights = state['new_right']
+                lefts = state['new_left']
+                turns = state['new_total']
+                continue
+
+            return (False, step.get('step_num', 0), step_errors,
+                    cur_pos, cur_dir, rights, lefts, turns)
+
+        return (True, None, [], cur_pos, cur_dir, rights, lefts, turns)
+
+    # ------------------------------------------------------------------
+    #  _side_stream_maze_steps – streams tokens to get traced path
+    # ------------------------------------------------------------------
+    async def _side_stream_maze_steps(self, text_so_far: str, max_new_tokens: int = 300) -> str:
+        """
+        Stream a short continuation of ``prompt + text_so_far`` from vLLM
+        and return the text that was generated.
+
+        The request reuses a deep copy of ``self.llm_server["payload"]`` with
+        ``prompt`` and ``max_tokens`` (*max_new_tokens*) overridden and
+        ``logprobs`` removed.  Reading stops at ``[DONE]`` or as soon as the
+        accumulated text contains ``\\boxed`` or ``>>> FINAL ANSWER``.
+        """
+        logger.info(
+            f"[Maze Side-stream] Starting path extraction\n"
+            f"  Maze: S={self.start_pos}, E={self.exit_pos}\n"
+            f"  Max new tokens: {max_new_tokens}"
+        )
+
+        # Work on a copy so the shared server payload is never mutated.
+        request_body = deepcopy(self.llm_server["payload"])
+        request_body["prompt"] = self.prompt + text_so_far
+        request_body["max_tokens"] = max_new_tokens
+        request_body.pop("logprobs", None)
+
+        sse_prefix = "data: "
+        stop_markers = ('\\boxed', '>>> FINAL ANSWER')
+        generated = ""
+
+        async with httpx.AsyncClient(timeout=None) as client:
+            async with client.stream(
+                "POST",
+                self.llm_server["url"],
+                headers=self.llm_server["headers"],
+                json=request_body,
+            ) as response:
+                async for line in response.aiter_lines():
+                    if not line.startswith(sse_prefix):
+                        continue
+                    data = line[len(sse_prefix):].strip()
+                    if data == "[DONE]":
+                        break
+                    chunk = json.loads(data)["choices"][0]["text"]
+                    generated += chunk
+                    logger.debug(f"[Maze Side-stream] chunk: {chunk!r}")
+
+                    # The final-answer section is not needed here.
+                    if any(marker in generated for marker in stop_markers):
+                        break
+
+        logger.info(
+            f"[Maze Side-stream] Generated {len(generated)} chars"
+        )
+        return generated
+
+    # ------------------------------------------------------------------
+    #  _extract_boxed_answer
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _extract_boxed_answer(text: str) -> Optional[str]:
+        """Return the stripped content of the last complete ``\\boxed{...}`` in *text*.
+
+        Returns ``None`` if there is no ``\\boxed{`` or its braces never balance.
+        """
+        start = text.rfind('\\boxed{')
+        if start < 0:
+            return None
+        start += len('\\boxed{')
+        depth = 1
+        for end, ch in enumerate(text[start:], start):
+            # Booleans count as 0/1: '{' opens a nesting level, '}' closes one.
+            depth += (ch == '{') - (ch == '}')
+            if depth == 0:
+                return text[start:end].strip()
+        return None
+
+    # ------------------------------------------------------------------
+    #  step_extractor
+    # ------------------------------------------------------------------
+    def step_extractor(self, chunk: str, generated_text: str):
+        """Return ``(True, text_prefix)`` to trigger verification of that prefix, else ``(False, None)``."""
+        if self._is_in_thinking_phase(generated_text):  # PHASE 1: still inside <think>
+            if self._think_phase_corrections >= self.max_corrections:
+                return False, None
+            # Newlines are counted over the whole generated text, not just this chunk.
+            total_newlines = generated_text.count('\n')
+            # Stay silent until the warmup number of newlines has been produced.
+            if total_newlines < self.warmup_newlines:
+                return False, None
+            # Trigger only right after a newline, when the count past warmup is a multiple of the threshold (0 included).
+            past_warmup = total_newlines - self.warmup_newlines
+            if (generated_text.endswith('\n')
+                    and past_warmup >= 0
+                    and past_warmup % self.newline_threshold == 0):
+                logger.info(
+                    f"[Maze step_extractor] Phase 1 trigger: \\n count={total_newlines} "
+                    f"(warmup={self.warmup_newlines}, past_warmup={past_warmup}, "
+                    f"threshold={self.newline_threshold})"
+                )
+                return True, generated_text
+
+            return False, None
+
+        # ===== PHASE 2: after </think> =====
+
+        # 2a: structured prompt not yet injected → trigger immediately
+        if not self._structured_prompt_injected(generated_text):
+            logger.info(
+                "[Maze step_extractor] Phase 2a: </think> detected, "
+                "structured prompt not yet injected."
+            )
+            return True, generated_text
+
+        # 2b: structured prompt injected — verify steps / boxed answer
+        think_end_pos = generated_text.find(self.answer_start_token) + len(self.answer_start_token)
+        text_after_think = generated_text[think_end_pos:]
+        # Keep only what follows the LAST structured marker; text_start_offset maps it back into generated_text.
+        last_marker_pos = text_after_think.rfind(self._structured_marker)
+        if last_marker_pos >= 0:
+            model_output_start = last_marker_pos + len(self._structured_marker)
+            text_after_think = text_after_think[model_output_start:]
+            text_start_offset = think_end_pos + model_output_start
+        else:
+            text_start_offset = think_end_pos
+        # Skip past the last [VERIFIER FEEDBACK ...] block so only the newest attempt is inspected.
+        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
+        last_feedback_end = 0
+        for match in feedback_pattern.finditer(text_after_think):
+            last_feedback_end = match.end()
+        text = text_after_think[last_feedback_end:]
+        text_start = text_start_offset + last_feedback_end
+        # Turn-counting questions: look for ">>> STEP n: Move ... Running count: Right=.., Left=.." blocks.
+        if self.question_type in ("right_turns", "total_turns"):
+            step_pattern = re.compile(
+                r'(>>>\s*STEP\s+(\d+):\s*Move\s+\w+\s+from\s+\([^)]+\)\s+to\s+\([^)]+\).*?'
+                r'Running count:\s*Right\s*=\s*\d+\s*,\s*Left\s*=\s*\d+[^\n]*)',
+                re.IGNORECASE | re.DOTALL
+            )
+            all_steps = list(step_pattern.finditer(text))
+
+            if all_steps:
+                last_step = all_steps[-1]
+                text_after = text[last_step.end():]
+                next_step = re.search(r'>>>\s*STEP\s+\d+', text_after, re.IGNORECASE)
+                if not next_step:  # trigger with the text cut right after the last complete step
+                    end_pos = text_start + last_step.end()
+                    return True, generated_text[:end_pos]
+                return False, None  # a further STEP header has started but is not complete yet
+            # No complete step yet: trigger on the LOCATE section once ">>> STEP 1" has started after it.
+            locate_pattern = re.compile(
+                r'(LOCATE START AND EXIT.*?E position:\s*\([^)]+\))',
+                re.IGNORECASE | re.DOTALL
+            )
+            locate_match = locate_pattern.search(text)
+            if locate_match:
+                step1_start = re.search(r'>>>\s*STEP\s+1', text[locate_match.end():], re.IGNORECASE)
+                if step1_start:
+                    end_pos = text_start + locate_match.end()
+                    return True, generated_text[:end_pos]
+        # Any question type: trigger on a boxed answer (presumably only a fully closed \boxed{...} — confirm in find_complete_boxed).
+        boxed = find_complete_boxed(text)
+        if boxed:
+            end_pos = text_start + boxed.end()
+            return True, generated_text[:end_pos]
+
+        return False, None
+
+    # ------------------------------------------------------------------
+    #  verify
+    # ------------------------------------------------------------------
+    async def verify(self, step: str, token_index: int, event, event_info):
+        """Verify the text in *step*; return ``(step, feedback)``, where ``feedback`` is ``None`` if nothing is injected."""
+        # CASE 1: Thinking phase (answer_start_token not in step) – side-stream path verification
+        # Whenever an action is needed, event_info is filled and event is set (skipped if event is already set).
+        if self.answer_start_token not in step:
+            total_dn = step.count('\n')
+            logger.info(
+                f"[Maze Phase 1] Thinking-phase verification triggered\n"
+                f"  \\n count    : {total_dn}\n"
+                f"  Thinking len : {len(step)} chars"
+            )
+            # Fork a side-stream: append the thinking-phase prompt and generate up to 300 more tokens.
+            text_with_prompt = step + self._thinking_phase_prompt
+
+            side_output = await self._side_stream_maze_steps(
+                text_with_prompt, max_new_tokens=300
+            )
+
+            if not side_output or len(side_output.strip()) < 20:
+                logger.info(
+                    "[Maze Phase 1] Insufficient output from side-stream. "
+                    "Letting model continue thinking."
+                )
+                return step, None
+            # The LOCATE header is prepended before parsing; NOTE(review): presumably the prompt ends with it — confirm.
+            full_side_text = (
+                ">>> LOCATE START AND EXIT (0-indexed, top-left is (0,0)):\n" + side_output
+            )
+
+            locate_valid, locate_errors = verify_locate_section(
+                full_side_text, self.start_pos, self.exit_pos
+            )
+
+            if not locate_valid:
+                self._think_phase_corrections += 1
+                error_summary = "; ".join(locate_errors)
+                logger.info(
+                    f"[Maze Phase 1] LOCATE section errors: {error_summary}\n"
+                    f"  Action: Inject feedback into thinking trace\n"
+                    f"  Corrections: {self._think_phase_corrections}/{self.max_corrections}"
+                )
+                thinking_feedback = (
+                    f"\n\nWait, I think I have the wrong positions. "
+                    f"{error_summary}. "
+                    f"Let me re-examine the maze grid carefully to find S and E.\n"
+                )
+                if not event.is_set():
+                    event_info["generated_text"] = step
+                    event_info["feedback"] = thinking_feedback
+                    event_info["correction_index"] = token_index
+                    event_info["errors"] = locate_errors
+                    event_info["phase"] = "rollback_to_thinking"
+                    event.set()
+                return step, thinking_feedback
+            # LOCATE section accepted: parse the traced steps and replay them via _verify_all_steps.
+            steps = self._parse_steps_from_text(full_side_text)
+
+            if not steps:
+                logger.info(
+                    "[Maze Phase 1] No structured steps found in side-stream. "
+                    "Letting model continue thinking."
+                )
+                return step, None
+
+            (all_valid, err_step_num, errors, final_pos,
+             final_dir, r_count, l_count, t_count) = self._verify_all_steps(steps)
+
+            if not all_valid:
+                error_summary = "; ".join(errors)
+                self._think_phase_corrections += 1
+                logger.info(
+                    f"[Maze Phase 1] INVALID step {err_step_num}\n"
+                    f"  Error(s) : {error_summary}\n"
+                    f"  Action   : Inject feedback into thinking trace\n"
+                    f"  Corrections: {self._think_phase_corrections}/{self.max_corrections}"
+                )
+                thinking_feedback = (
+                    f"\n\nWait, I made an error at Step {err_step_num}. "
+                    f"{error_summary}. "
+                    f"Let me re-trace the path more carefully from the correct position.\n"
+                )
+                if not event.is_set():
+                    event_info["generated_text"] = step
+                    event_info["feedback"] = thinking_feedback
+                    event_info["correction_index"] = token_index
+                    event_info["errors"] = errors
+                    event_info["phase"] = "rollback_to_thinking"
+                    event.set()
+                return step, thinking_feedback
+
+            # All steps valid — check if path is complete (reached E)
+            if final_pos == self.exit_pos:
+                self._verified_path_complete = True
+                logger.info(
+                    f"[Maze Phase 1] VALID COMPLETE path to E={self.exit_pos}\n"
+                    f"  Steps: {len(steps)}, Right={r_count}, Left={l_count}, Total={t_count}\n"
+                    f"  Action: Inject early-stop + </think> + structured format."
+                )
+                early_stop_msg = (
+                    f"\n\nWait, I have successfully traced the path from "
+                    f"S={self.start_pos} to E={self.exit_pos} with "
+                    f"{len(steps)} steps. "
+                    f"Right turns={r_count}, Left turns={l_count}, "
+                    f"Total turns={t_count}. "
+                    f"This path has been verified as correct. "
+                    f"Let me give the final answer.\n"
+                    f"{self.answer_start_token}"
+                    f"{self._structured_prompt}"
+                )
+                if not event.is_set():
+                    event_info["generated_text"] = step
+                    event_info["feedback"] = early_stop_msg
+                    event_info["correction_index"] = token_index
+                    event_info["phase"] = "early_stop_answer"
+                    event_info["verified_counts"] = {
+                        "right": r_count,
+                        "left": l_count,
+                        "total": t_count,
+                        "steps": len(steps),
+                    }
+                    event.set()
+                return step, early_stop_msg
+
+            else:
+                logger.info(
+                    f"[Maze Phase 1] VALID PARTIAL path\n"
+                    f"  Current pos: {final_pos}, Target: {self.exit_pos}\n"
+                    f"  Steps so far: {len(steps)}\n"
+                    f"  Action: No error, let model keep thinking."
+                )
+                return step, None
+
+        # ==================================================================
+        # CASE 2a: </think> present but structured prompt not yet injected
+        # ==================================================================
+        if not self._structured_prompt_injected(step):
+            logger.info(
+                "[Maze Phase 2a] </think> detected. "
+                "Injecting structured step format."
+            )
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = self._structured_prompt
+                event_info["correction_index"] = token_index
+                event_info["phase"] = "inject_structured_prompt"
+                event.set()
+            return step, self._structured_prompt
+
+        # ==================================================================
+        # CASE 2b: Structured prompt injected — verify output
+        # ==================================================================
+        # Budget check: once _count_feedback_blocks reaches max_corrections, force a "no solution" boxed answer.
+        num_corrections = self._count_feedback_blocks(step)
+        if num_corrections >= self.max_corrections:
+            fb = "\nthe answer is \\boxed{no solution}"
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = fb
+                event_info["correction_index"] = token_index
+                event_info["errors"] = ["Max corrections reached"]
+                event_info["phase"] = "standard_verify"
+                event.set()
+            return step, fb
+        # Restrict to the text from the last structured marker onwards (the marker itself is kept).
+        think_end_pos = step.find(self.answer_start_token) + len(self.answer_start_token)
+        text_after_think = step[think_end_pos:]
+
+        last_marker_pos = text_after_think.rfind(self._structured_marker)
+        if last_marker_pos >= 0:
+            text_after_think = text_after_think[last_marker_pos:]
+        # recent_text: whatever follows the last [VERIFIER FEEDBACK ...] block, i.e. the latest attempt.
+        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
+        last_feedback_end = 0
+        for match in feedback_pattern.finditer(text_after_think):
+            last_feedback_end = match.end()
+        recent_text = text_after_think[last_feedback_end:]
+
+        # --- Verify LOCATE section ---
+        locate_match = re.search(r'LOCATE START AND EXIT', recent_text, re.IGNORECASE)
+        if locate_match:
+            step1_start = re.search(r'>>>\s*STEP\s+1', recent_text, re.IGNORECASE)
+            if step1_start or '\\boxed' in recent_text:
+                if step1_start:
+                    locate_text = recent_text[locate_match.start():step1_start.start()]
+                else:
+                    locate_text = recent_text[locate_match.start():]
+                is_valid, loc_errors = verify_locate_section(
+                    locate_text, self.start_pos, self.exit_pos
+                )
+                if not is_valid:
+                    feedback = format_locate_feedback(loc_errors)
+                    if not event.is_set():
+                        event_info["generated_text"] = step
+                        event_info["feedback"] = feedback
+                        event_info["correction_index"] = token_index
+                        event_info["errors"] = loc_errors
+                        event_info["phase"] = "standard_verify"
+                        event.set()
+                    return step, feedback
+
+        # --- Verify structured steps ---
+        if self.question_type in ("right_turns", "total_turns"):
+            step_pattern = re.compile(
+                r'(>>>\s*STEP\s+(\d+):\s*Move\s+\w+\s+from\s+\([^)]+\)\s+to\s+\([^)]+\).*?'
+                r'Running count:[^\n]+)',
+                re.IGNORECASE | re.DOTALL
+            )
+            recent_step_matches = list(step_pattern.finditer(recent_text))
+            # Only the newest step is verified; its starting state is rebuilt from the earlier steps.
+            if recent_step_matches:
+                last_match = recent_step_matches[-1]
+                last_step_text = last_match.group(0)
+                last_step_num = int(last_match.group(2))
+                parsed = parse_maze_step(last_step_text)
+
+                if parsed:
+                    all_full_matches = list(step_pattern.finditer(text_after_think))
+                    state = self._get_state_before_step_phase2(
+                        text_after_think, last_step_num, all_full_matches
+                    )
+
+                    is_valid, errors, new_state = verify_maze_step(
+                        step=parsed,
+                        grid=self.grid,
+                        expected_from_pos=state['position'],
+                        prev_direction=state['direction'],
+                        expected_right_count=state['right_count'],
+                        expected_left_count=state['left_count'],
+                        expected_total_count=state['total_count'],
+                    )
+
+                    if not is_valid:
+                        feedback = format_maze_feedback(errors, last_step_num)
+                        if not event.is_set():
+                            event_info["generated_text"] = step
+                            event_info["feedback"] = feedback
+                            event_info["correction_index"] = token_index
+                            event_info["errors"] = errors
+                            event_info["phase"] = "standard_verify"
+                            event.set()
+                        return step, feedback
+
+        # --- Check for boxed answer ---
+        boxed_answer = self._extract_boxed_answer(recent_text)
+        if boxed_answer is not None:
+            logger.info(f"[Maze Phase 2b] Extracted boxed answer: {boxed_answer}")
+
+            if self.question_type == "relative_position":
+                is_correct, rp_feedback = self._verify_relative_position_answer(boxed_answer)
+                if not is_correct and rp_feedback:
+                    logger.info(
+                        f"[Maze Phase 2b] Relative position answer '{boxed_answer}' is INCORRECT."
+                    )
+                    if not event.is_set():
+                        event_info["generated_text"] = step
+                        event_info["feedback"] = rp_feedback
+                        event_info["correction_index"] = token_index
+                        event_info["errors"] = [f"Wrong relative position answer: {boxed_answer}"]
+                        event_info["phase"] = "standard_verify"
+                        event.set()
+                    return step, rp_feedback
+            # Boxed answer not rejected above: report phase "final_answer_correct" with empty feedback.
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = ""
+                event_info["correction_index"] = token_index
+                event_info["phase"] = "final_answer_correct"
+                event.set()
+            return step, None
+
+        return step, None
+
+    # ------------------------------------------------------------------
+    #  _get_state_before_step_phase2 – reconstruct state for Phase 2
+    # ------------------------------------------------------------------
+    def _get_state_before_step_phase2(self, text: str, target_step_num: int,
+                                       all_step_matches: list) -> dict:
+        """Replay the Phase 2 steps numbered below *target_step_num*.
+
+        A step number may occur more than once (a step rewritten after
+        verifier feedback); its last occurrence is the one replayed.  Steps
+        that ``parse_maze_step`` cannot parse are skipped.  *text* is not read.
+
+        Returns a dict with ``position``, ``direction``, ``right_count``,
+        ``left_count`` and ``total_count``, starting from ``self.start_pos``.
+        """
+        # Later matches overwrite earlier ones, so each number keeps its last match.
+        latest = {
+            int(m.group(2)): m
+            for m in all_step_matches
+            if int(m.group(2)) < target_step_num
+        }
+
+        position, heading = self.start_pos, Direction.NONE
+        rights = lefts = 0
+
+        # Replay in ascending step order, tracking the heading and turn counts.
+        for num in sorted(latest):
+            parsed = parse_maze_step(latest[num].group(0))
+            if not parsed:
+                continue
+            move_dir = parsed['direction']
+            to_pos = parsed['to_pos']
+            turn = get_expected_turn_type(heading, move_dir)
+            if turn == 'RIGHT_TURN':
+                rights += 1
+            elif turn == 'LEFT_TURN':
+                lefts += 1
+            position, heading = to_pos, move_dir
+
+        # Every counted turn is either right or left, so the total is their sum.
+        return {
+            'position': position,
+            'direction': heading,
+            'right_count': rights,
+            'left_count': lefts,
+            'total_count': rights + lefts,
+        }
+
+    # ------------------------------------------------------------------
+    #  fix
+    # ------------------------------------------------------------------
+    async def fix(self, generated_text: str, event_info: dict, fix_method=None):
+        """Return the text generation resumes from, chosen by ``event_info["phase"]``.
+
+        The base text is ``event_info["generated_text"]``; the *generated_text*
+        and *fix_method* arguments are not read.
+        """
+        phase = event_info.get("phase", "standard_verify")
+
+        if phase in ("rollback_to_thinking", "early_stop_answer"):
+            # Thinking-phase injections: strip trailing whitespace, then append.
+            resumed = event_info["generated_text"].rstrip() + event_info["feedback"]
+            if phase == "rollback_to_thinking":
+                logger.info(
+                    f"[Maze fix] Phase: rollback_to_thinking\n"
+                    f"  -> Appended error feedback into <think> trace.\n"
+                    f"  -> Think-phase corrections: {self._think_phase_corrections}/{self.max_corrections}"
+                )
+            else:
+                verified = event_info.get("verified_counts", {})
+                logger.info(
+                    f"[Maze fix] Phase: early_stop_answer\n"
+                    f"  -> Path verified: {verified.get('steps', '?')} steps, "
+                    f"R={verified.get('right', '?')}, L={verified.get('left', '?')}, "
+                    f"T={verified.get('total', '?')}\n"
+                    f"  -> Injecting early-stop + </think> + structured format."
+                )
+            return resumed
+
+        if phase == "final_answer_correct":
+            logger.info(
+                f"[Maze fix] Phase: final_answer_correct\n"
+                f"  -> Stopping generation."
+            )
+            return event_info["generated_text"]
+
+        if phase == "inject_structured_prompt":
+            logger.info(
+                "[Maze fix] Phase: inject_structured_prompt\n"
+                "  -> Appending structured step format after </think>."
+            )
+        else:
+            # Any other phase value is handled as standard_verify.
+            errors = event_info.get("errors", [])
+            error_summary = "; ".join(errors) if errors else "unknown"
+            logger.info(
+                f"[Maze fix] Phase: standard_verify\n"
+                f"  -> Error: {error_summary}\n"
+                f"  -> Appending [VERIFIER FEEDBACK] so model retries."
+            )
+        return event_info["generated_text"] + event_info["feedback"]
diff --git a/interwhen/monitors/thinkingPhaseVerifierSpatialMap.py b/interwhen/monitors/thinkingPhaseVerifierSpatialMap.py
new file mode 100644
index 00000000..2c4d0e75
--- /dev/null
+++ b/interwhen/monitors/thinkingPhaseVerifierSpatialMap.py
@@ -0,0 +1,1023 @@
+"""
+Thinking-phase verifier for SpatialMap tasks.
+
+Verifies spatial-map directional claims by forking a side-stream during
+the thinking phase.  Uses Z3 constraint solving to check whether
+directional claims (e.g. "A is northeast of B") are consistent with
+the stated problem constraints.
+
+Workflow
+--------
+A) **DURING the thinking phase** (inside ``<think>...</think>``):
+   After a warmup period, every *N* newlines in the thinking trace:
+   1. Inject a first-person prompt to extract parsed and derived
+      spatial relationships (STEP 1 pre-filled, STEP 2 generated).
+   2. Parse directional claims from STEP 2 output.
+   3. Verify each claim using a Z3 solver.
+   4. If **errors** -> inject feedback into thinking trace.
+   5. If **all valid** -> no feedback, keep thinking.
+
+B) **AFTER ``</think>``**:
+   Phase 2a: Inject structured step format template.
+   Phase 2b: Verify directional claims and final answer
+   (direction / object / counting questions) as model fills template.
+   Once ``\\boxed{}`` appears, stop generation.
+"""
+
+import re
+import json
+import logging
+from typing import Dict, List, Set, Tuple, Optional
+from copy import deepcopy
+
+import httpx
+
+from .base import VerifyMonitor
+from ._common import find_complete_boxed
+from ..utils.spatialmap_verifier import (
+    SpatialMapZ3Solver, extract_step2_claims,
+    parse_directional_claims_from_text,
+    parse_counting_question, parse_model_count_from_answer,
+    parse_direction_question, parse_object_question,
+    parse_model_boxed_answer,
+    get_possible_directions, get_consistent_object_options,
+    get_possible_count_range,
+    verify_spatialmap_step, format_spatialmap_feedback,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# =====================================================================
+#  SpatialMap Thinking-Phase Prompts
+# =====================================================================
+
+
+def _build_spatialmap_format_block() -> str:
+    """
+    Return the ``<format>...</format>`` template for structured SpatialMap
+    answers: STEP 1 (given relationships), STEP 2 (derived relationships),
+    STEP 3 (answer) and a final boxed letter.  The pieces are concatenated
+    as-is; there is no trailing newline after ``</format>``.
+    """
+    pieces = [
+        "<format>\n",
+        ">>> STEP 1: PARSE RELATIONSHIPS\n",
+        "    - [Full Name A] is to the [direction] of [Full Name B]\n",
+        "    - [Full Name C] is to the [direction] of [Full Name D]\n",
+        "    [... list ALL given relationships using FULL names exactly as in the question ...]\n",
+        "    (NO abbreviations, NO short forms, NO parenthetical aliases like 'Police Supply Store (PSS)')\n",
+        "\n",
+        ">>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS\n",
+        "    - Looking for: [target relationship / direction / count]\n",
+        "    - [Full Name A] is to the [direction] of [Full Name B]\n",
+        "    - [Full Name C] is to the [direction] of [Full Name D]\n",
+        "    [... list each derived relationship as a structured claim using FULL names ...]\n",
+        "    (Each claim MUST be in the form: '[Full Name] is to the [direction] of [Full Name]')\n",
+        "\n",
+        ">>> STEP 3: ANSWER\n",
+        "    - [state conclusion]\n",
+        "\n",
+        ">>> FINAL ANSWER: [answer text]\n",
+        "    \\boxed{LETTER}\n",
+        "</format>",
+    ]
+    return "".join(pieces)
+
+
+def _build_spatialmap_thinking_phase_prompt(
+    parsed_relations: List[Dict],
+) -> str:
+    """
+    Compose the first-person prompt used for the thinking-phase
+    side-stream.
+
+    STEP 1 is filled in from *parsed_relations* (each entry is read via the
+    keys ``'A'``, ``'direction'`` and ``'B'``; ``(none)`` is shown when the
+    list is empty).  The prompt ends right after the STEP 2 header, so the
+    text generated next is the list of derived relationships.
+    """
+    given_claims = "\n".join(
+        f"    - {rel['A']} is to the {rel['direction']} of {rel['B']}"
+        for rel in parsed_relations
+    ) or "    (none)"
+    instructions = (
+        "\n\nLet me organize what I have so far. I will list the given "
+        "relationships in STEP 1, then in STEP 2 I will state every "
+        "spatial claim I have derived.\n"
+        "IMPORTANT: I must use the FULL object names exactly as given in the question "
+        "(no abbreviations, no short forms, no aliases, no partial names, no parenthetical aliases like 'Store (S)').\n"
+        "Every claim must be in the form: '[Full Name] is to the [direction] of [Full Name]'\n"
+        "For direction I will use the full word: northeast, northwest, southeast, southwest, north, south, east, or west.\n\n"
+    )
+    step1_section = (
+        ">>> STEP 1: PARSE RELATIONSHIPS (given)\n"
+        f"{given_claims}\n\n"
+    )
+    step2_header = (
+        ">>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS (derived)\n"
+        "Based on my analysis so far, the derived relationships are:\n"
+    )
+    return instructions + step1_section + step2_header
+
+
+def _build_spatialmap_structured_prompt() -> str:
+    """
+    Return the text injected after ``</think>``: a short first-person
+    lead-in, the ``<format>`` template from
+    ``_build_spatialmap_format_block``, and an opening STEP 1 header.
+    """
+    lead_in = (
+        "\nLet me solve this step by step using the structured format.\n"
+        "IMPORTANT: I must use the FULL names of all objects exactly as they appear in the question. "
+        "NO abbreviations, NO short forms, NO parenthetical aliases.\n\n"
+    )
+    # The prompt ends on the STEP 1 header, so generation continues inside that section.
+    step1_header = ">>> STEP 1: PARSE RELATIONSHIPS\n"
+
+    return lead_in + _build_spatialmap_format_block() + "\n" + step1_header
+
+
+# =====================================================================
+#  ThinkingPhaseStepVerifierSpatialMapMonitor
+# =====================================================================
+
+
+class ThinkingPhaseStepVerifierSpatialMapMonitor(VerifyMonitor):
+    """
+    Monitor that verifies spatial-map directional claims during and after
+    thinking.
+
+    **No meta-prompt required** — works with a plain user prompt containing
+    just the map description and question.  Structure is injected by this
+    monitor after ``</think>`` (natural or early-stop), exactly like the
+    Maze monitor injects its step format.
+
+    Phase 1 – During ``<think>...</think>``:
+        Every N newlines (after warmup), fork a side-stream that
+        injects a structured step prompt, stream tokens, parse directional
+        claims from STEP 2, and verify them against Z3.
+
+    Phase 2a – ``</think>`` detected, structured prompt not yet injected:
+        Inject the structured step-by-step format template so the model
+        fills it in (STEP 1 → STEP 2 → STEP 3 → FINAL ANSWER → ``\\boxed{}``).
+
+    Phase 2b – Structured prompt injected, model is generating:
+        Verify directional claims in STEP 2 as they appear.  Once
+        ``\\boxed{}`` appears, signal completion.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        problem_text: str,
+        llm_server: dict,
+        prompt: str,
+        newline_threshold: int = 15,
+        max_corrections: int = 5,
+        answer_start_token: str = "</think>",
+        async_execution: bool = True,
+        warmup_newlines: int = 0,
+    ):
+        """
+        Store the configuration, build the Z3 model and reset all counters.
+
+        ``problem_text`` is also parsed for the question type and MCQ options.
+        """
+        super().__init__(name)
+        self.problem_text = problem_text
+        self.llm_server = llm_server
+        self.prompt = prompt
+        self.newline_threshold = newline_threshold
+        self.max_corrections = max_corrections
+        self.answer_start_token = answer_start_token
+        self.async_execution = async_execution
+        self.warmup_newlines = warmup_newlines
+
+        # Z3 solver seeded with the constraints found in the problem text
+        self.z3_solver = SpatialMapZ3Solver(problem_text)
+
+        # Prompt texts that get injected into the generation
+        self._structured_prompt = _build_spatialmap_structured_prompt()
+        self._thinking_phase_prompt = _build_spatialmap_thinking_phase_prompt(
+            self.z3_solver.parsed_relations,
+        )
+        # Its presence after </think> means the structured prompt was injected
+        self._structured_marker = ">>> STEP 1: PARSE RELATIONSHIPS"
+
+        # ---- state ----
+        self._think_phase_corrections = 0
+        self.verified_claims: Set[Tuple[str, str, str]] = set()
+
+        # ---- MCQ options (e.g., "A. 5\nB. 3\nC. 0\nD. 1") ----
+        # Cut the trailing instruction paragraph so it cannot pollute the options
+        options_text = re.split(r'\nFirst,', problem_text, maxsplit=1)[0]
+        parsed_options: Dict[str, str] = {
+            letter: value.strip().rstrip(".")
+            for letter, value in re.findall(
+                r'([A-D])\.\s*(.+?)\s*(?=[A-D]\.|$)',
+                options_text, flags=re.DOTALL,
+            )
+        }
+
+        # ---- counting-question verification ----
+        self._counting_question = parse_counting_question(problem_text)
+        # Counting options stay empty unless a counting question was detected
+        self._counting_options: Dict[str, str] = (
+            dict(parsed_options) if self._counting_question else {}
+        )
+        if self._counting_question:
+            logger.info(
+                f"[SpatialMap] Counting question detected: "
+                f"direction={self._counting_question['direction']}, "
+                f"reference={self._counting_question['reference']}, "
+                f"options={self._counting_options}"
+            )
+        self._count_feedback_given = False
+        self._count_feedback_blocks_count = 0  # tracks cardinal count retry attempts
+
+        # ---- direction-question verification ----
+        self._direction_question = parse_direction_question(problem_text)
+        if self._direction_question:
+            logger.info(
+                f"[SpatialMap] Direction question detected: "
+                f"entity_a={self._direction_question['entity_a']}, "
+                f"entity_b={self._direction_question['entity_b']}"
+            )
+
+        # ---- object-question verification ----
+        self._object_question = parse_object_question(problem_text)
+        if self._object_question:
+            logger.info(
+                f"[SpatialMap] Object question detected: "
+                f"direction={self._object_question['direction']}, "
+                f"reference={self._object_question['reference']}"
+            )
+
+        # ---- Generic MCQ options (for direction & object Qs too) ----
+        self._mcq_options: Dict[str, str] = dict(parsed_options)
+
+        # Allow multiple retries for final-answer verification
+        self._max_final_answer_retries = 3
+        self._direction_feedback_count = 0
+        self._object_feedback_count = 0
+        self._diag_count_feedback_count = 0
+
+    @classmethod
+    def from_prompt(
+        cls,
+        problem_text: str,
+        llm_server: dict,
+        prompt: str,
+        newline_threshold: int = 15,
+        max_corrections: int = 5,
+        warmup_newlines: int = 0,
+        name: str = "spatialmap_thinking_verifier",
+    ) -> "ThinkingPhaseStepVerifierSpatialMapMonitor":
+        """
+        Alternate constructor that passes its arguments on to ``__init__``.
+
+        ``answer_start_token`` and ``async_execution`` keep their defaults.
+        """
+        # Collect the keyword arguments first, then hand them over in one go.
+        forwarded = dict(
+            name=name, problem_text=problem_text, llm_server=llm_server,
+            prompt=prompt, newline_threshold=newline_threshold,
+            max_corrections=max_corrections, warmup_newlines=warmup_newlines,
+        )
+        return cls(**forwarded)
+
+    # ------------------------------------------------------------------
+    #  helpers
+    # ------------------------------------------------------------------
+    def _count_feedback_blocks(self, text: str) -> int:
+        return sum(1 for _ in re.finditer(r'\[VERIFIER FEEDBACK[^\]]*\]', text))
+
+    def _is_in_thinking_phase(self, generated_text: str) -> bool:
+        return generated_text.find(self.answer_start_token) == -1
+
+    def _structured_prompt_injected(self, generated_text: str) -> bool:
+        """Tell whether the structured template already follows the answer-start token."""
+        _head, found_token, tail = generated_text.partition(self.answer_start_token)
+        # ``partition`` leaves the separator slot empty when the token is absent,
+        # in which case nothing can have been injected yet.
+        return bool(found_token) and self._structured_marker in tail
+
+    def _extract_new_claims(self, text: str) -> List[Dict]:
+        """
+        Return STEP 2 directional claims that have not been verified yet.
+
+        Only the text following the last ``[VERIFIER FEEDBACK ...]`` block is
+        scanned, i.e. the model's most recent attempt.
+        """
+        # Offset just past the final feedback block (0 when there is none).
+        attempt_start = 0
+        for block in re.finditer(r'\[VERIFIER FEEDBACK[^\]]*\]', text, re.DOTALL):
+            attempt_start = block.end()
+
+        # ``<entity>_x`` solver keys give the full names used to resolve abbreviations.
+        known_entities = list({
+            key[:-2] for key in self.z3_solver.entities if key.endswith('_x')
+        })
+
+        candidates = extract_step2_claims(
+            text[attempt_start:], entity_names=known_entities
+        )
+
+        # Keep only claims whose (A, direction, B) key is still unverified.
+        return [
+            claim for claim in candidates
+            if (claim['A'], claim['direction'], claim['B']) not in self.verified_claims
+        ]
+
+    # ------------------------------------------------------------------
+    #  _side_stream_spatialmap – streams tokens to get analysis
+    # ------------------------------------------------------------------
+    async def _side_stream_spatialmap(self, text_so_far: str, max_new_tokens: int = 400) -> str:
+        """
+        POST ``prompt + text_so_far`` to the ``llm_server`` URL, stream at most
+        *max_new_tokens* tokens, and return the concatenated generated text.
+
+        ``text_so_far`` is expected to end with the structured spatial map
+        prompt so the model outputs its analysis steps.
+        """
+        logger.info(
+            f"[SpatialMap Side-stream] Starting analysis extraction\n"
+            f"  Relations: {len(self.z3_solver.parsed_relations)}\n"
+            f"  Max new tokens: {max_new_tokens}"
+        )
+
+        payload = deepcopy(self.llm_server["payload"])  # edits below stay off the shared payload
+        payload["prompt"] = self.prompt + text_so_far
+        payload["max_tokens"] = max_new_tokens
+        payload.pop("logprobs", None)  # drop the key if the base payload carries it
+
+        generated = ""
+
+        async with httpx.AsyncClient(timeout=None) as client:  # NOTE(review): presumably no timeout — confirm
+            async with client.stream(
+                "POST",
+                self.llm_server["url"],
+                headers=self.llm_server["headers"],
+                json=payload,
+            ) as response:
+                async for line in response.aiter_lines():
+                    if line.startswith("data: "):  # lines without this prefix are ignored
+                        data = line[len("data: "):].strip()
+                        if data == "[DONE]":
+                            break
+                        chunk = json.loads(data)["choices"][0]["text"]
+                        generated += chunk
+                        logger.debug(f"[SpatialMap Side-stream] chunk: {chunk!r}")
+
+                        # Stop early once the accumulated text contains \boxed or >>> FINAL ANSWER
+                        if '\\boxed' in generated or '>>> FINAL ANSWER' in generated:
+                            break
+
+        logger.info(
+            f"[SpatialMap Side-stream] Generated {len(generated)} chars"
+        )
+        return generated
+
+    # ------------------------------------------------------------------
+    #  step_extractor
+    # ------------------------------------------------------------------
+    def step_extractor(self, chunk: str, generated_text: str):
+        """
+        Decide whether verification should run on the text generated so far.
+
+        Phase 1 (thinking): trigger at every newline_threshold multiple
+            (after warmup).
+        Phase 2 (after </think>): trigger while the structured prompt is
+            missing, then on completed STEP 2 claims or a boxed answer.
+
+        Returns ``(True, text_to_verify)`` on a trigger, else ``(False, None)``.
+        """
+        # ===== PHASE 1: still inside <think> =====
+        if self._is_in_thinking_phase(generated_text):
+            if self._think_phase_corrections >= self.max_corrections:
+                return False, None
+
+            total_newlines = generated_text.count('\n')
+
+            if total_newlines < self.warmup_newlines:
+                return False, None
+
+            # Never negative: the warmup guard above has already returned.
+            past_warmup = total_newlines - self.warmup_newlines
+            if (generated_text.endswith('\n')
+                    and past_warmup % self.newline_threshold == 0):
+                logger.info(
+                    f"[SpatialMap step_extractor] Phase 1 trigger: \\n count={total_newlines} "
+                    f"(warmup={self.warmup_newlines}, past_warmup={past_warmup}, "
+                    f"threshold={self.newline_threshold})"
+                )
+                return True, generated_text
+
+            return False, None
+
+        # ===== PHASE 2: after </think> =====
+
+        # 2a: structured prompt not yet injected → trigger immediately
+        if not self._structured_prompt_injected(generated_text):
+            logger.info(
+                "[SpatialMap step_extractor] Phase 2a: </think> detected, "
+                "structured prompt not yet injected."
+            )
+            return True, generated_text
+
+        # 2b: structured prompt injected — verify STEP 2 claims / boxed answer
+        think_end_pos = generated_text.find(self.answer_start_token) + len(self.answer_start_token)
+        text_after_think = generated_text[think_end_pos:]
+
+        # Strip out the injected <format>...</format> template so we only
+        # look at actual model output (which starts after the last marker).
+        last_marker_pos = text_after_think.rfind(self._structured_marker)
+        if last_marker_pos >= 0:
+            model_output_start = last_marker_pos + len(self._structured_marker)
+            text_after_think = text_after_think[model_output_start:]
+            text_start_offset = think_end_pos + model_output_start
+        else:
+            text_start_offset = think_end_pos
+
+        # Skip past feedback blocks
+        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
+        last_feedback_end = 0
+        for match in feedback_pattern.finditer(text_after_think):
+            last_feedback_end = match.end()
+        text = text_after_think[last_feedback_end:]
+        text_start = text_start_offset + last_feedback_end
+
+        # Check for STEP 2 section with claims
+        step2_pattern = re.compile(
+            r'>>>\s*STEP\s*2[:\s].*?(?=>>>\s*STEP\s*3|>>>\s*FINAL|\\boxed|$)',
+            re.DOTALL | re.IGNORECASE
+        )
+        step2_match = step2_pattern.search(text)
+
+        if step2_match:
+            # Check if STEP 3 or FINAL has started (STEP 2 is complete)
+            text_after_step2 = text[step2_match.end():]
+            step3_or_final = re.search(
+                r'>>>\s*(STEP\s*3|FINAL)',
+                text_after_step2,
+                re.IGNORECASE
+            )
+
+            if step3_or_final:
+                new_claims = self._extract_new_claims(text)
+                if new_claims:
+                    end_pos = text_start + step2_match.end()
+                    return True, generated_text[:end_pos]
+
+        # A complete boxed answer always triggers final verification, whether
+        # or not unverified claims remain, so no claim lookup is needed here.
+        boxed_match = find_complete_boxed(text)
+        if boxed_match:
+            end_pos = text_start + boxed_match.end()
+            return True, generated_text[:end_pos]
+
+        return False, None
+
+    # ------------------------------------------------------------------
+    #  verify
+    # ------------------------------------------------------------------
+    async def verify(self, step: str, token_index: int, event, event_info):
+        """
+        Case 1 -- still in thinking (no </think>):
+            Fork side-stream, parse claims, verify with Z3.
+        Case 2 -- after </think>:
+            2a: Inject structured prompt.
+            2b: Verify STEP 2 claims and/or final answer.
+        """
+
+        # ==================================================================
+        # CASE 1: Thinking phase – side-stream verification
+        # ==================================================================
+        if self.answer_start_token not in step:
+            total_dn = step.count('\n')
+            logger.info(
+                f"[SpatialMap Phase 1] Thinking-phase verification triggered\n"
+                f"  \\n count    : {total_dn}\n"
+                f"  Thinking len : {len(step)} chars"
+            )
+
+            # Build text with injected prompt for analysis extraction
+            text_with_prompt = step + self._thinking_phase_prompt
+
+            # Side-stream: get analysis from the model
+            side_output = await self._side_stream_spatialmap(
+                text_with_prompt, max_new_tokens=800
+            )
+
+            if not side_output or len(side_output.strip()) < 20:
+                logger.info(
+                    "[SpatialMap Phase 1] Insufficient output from side-stream. "
+                    "Letting model continue thinking."
+                )
+                return step, None
+
+            # Parse directional claims directly from the side-stream output.
+            # The prompt pre-fills STEP 1 and ends at ">>> STEP 2:", so the
+            # model's output is already STEP 2 content — no header to search for.
+            entity_names = list({
+                k[:-2] for k in self.z3_solver.entities if k.endswith('_x')
+            })
+            claims = parse_directional_claims_from_text(
+                side_output, entity_names=entity_names
+            )
+
+            logger.info(
+                f"[SpatialMap Phase 1] Parsed {len(claims)} claims from side-stream.\n"
+                f"  Side-stream output (first 500 chars): {side_output[:500]!r}"
+            )
+
+            if not claims:
+                logger.info(
+                    "[SpatialMap Phase 1] No directional claims found in side-stream. "
+                    "Letting model continue thinking."
+                )
+                return step, None
+
+            # Verify each claim against Z3
+            for claim in claims:
+                claim_key = (claim['A'], claim['direction'], claim['B'])
+                if claim_key in self.verified_claims:
+                    continue
+
+                is_valid, errors = verify_spatialmap_step(
+                    claim=claim,
+                    z3_solver=self.z3_solver,
+                    add_if_valid=True,
+                )
+                self.verified_claims.add(claim_key)
+
+                if not is_valid:
+                    self._think_phase_corrections += 1
+                    error_summary = "; ".join(errors)
+                    logger.info(
+                        f"[SpatialMap Phase 1] INVALID claim: "
+                        f"{claim['A']} is {claim['direction']} of {claim['B']}\n"
+                        f"  Error(s) : {error_summary}\n"
+                        f"  Corrections: {self._think_phase_corrections}/{self.max_corrections}"
+                    )
+                    thinking_feedback = (
+                        f"\n\nWait, I think I made an error in my spatial reasoning. "
+                        f"{error_summary}. "
+                        f"Let me re-examine the relationships more carefully.\n"
+                    )
+                    if not event.is_set():
+                        event_info["generated_text"] = step
+                        event_info["feedback"] = thinking_feedback
+                        event_info["correction_index"] = token_index
+                        event_info["errors"] = errors
+                        event_info["phase"] = "rollback_to_thinking"
+                        event.set()
+                    return step, thinking_feedback
+
+            # All claims valid
+            logger.info(
+                f"[SpatialMap Phase 1] All {len(claims)} claims valid. "
+                f"Letting model continue thinking."
+            )
+            return step, None
+
+        # ==================================================================
+        # CASE 2a: </think> present but structured prompt not yet injected
+        # ==================================================================
+        if not self._structured_prompt_injected(step):
+            logger.info(
+                "[SpatialMap Phase 2a] </think> detected. "
+                "Injecting structured step format."
+            )
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = self._structured_prompt
+                event_info["correction_index"] = token_index
+                event_info["phase"] = "inject_structured_prompt"
+                event.set()
+            return step, self._structured_prompt
+
+        # ==================================================================
+        # CASE 2b: Structured prompt injected — verify output
+        # ==================================================================
+        num_corrections = self._count_feedback_blocks(step)
+        if num_corrections >= self.max_corrections:
+            fb = "\nthe answer is \\boxed{no solution}"
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = fb
+                event_info["correction_index"] = token_index
+                event_info["errors"] = ["Max corrections reached"]
+                event_info["phase"] = "standard_verify"
+                event.set()
+            return step, fb
+
+        think_end_pos = step.find(self.answer_start_token) + len(self.answer_start_token)
+        text_after_think = step[think_end_pos:]
+
+        # Strip the injected template — only look at model output after marker
+        last_marker_pos = text_after_think.rfind(self._structured_marker)
+        if last_marker_pos >= 0:
+            text_after_think = text_after_think[last_marker_pos:]
+
+        feedback_pattern = re.compile(r'\[VERIFIER FEEDBACK[^\]]*\]\s*', re.DOTALL)
+        last_feedback_end = 0
+        for match in feedback_pattern.finditer(text_after_think):
+            last_feedback_end = match.end()
+        recent_text = text_after_think[last_feedback_end:]
+
+        # --- Verify STEP 2 claims ---
+        new_claims = self._extract_new_claims(recent_text)
+
+        for claim in new_claims:
+            claim_key = (claim['A'], claim['direction'], claim['B'])
+
+            is_valid, errors = verify_spatialmap_step(
+                claim=claim,
+                z3_solver=self.z3_solver,
+                add_if_valid=True,
+            )
+            self.verified_claims.add(claim_key)
+
+            if not is_valid:
+                feedback = format_spatialmap_feedback(errors, claim)
+                if not event.is_set():
+                    event_info["generated_text"] = step
+                    event_info["feedback"] = feedback
+                    event_info["correction_index"] = token_index
+                    event_info["errors"] = errors
+                    event_info["failed_step"] = claim
+                    event_info["phase"] = "standard_verify"
+                    event.set()
+                return step, feedback
+
+        # --- Check for boxed answer ---
+        boxed_match = find_complete_boxed(recent_text)
+        if boxed_match:
+
+            # ==========================================================
+            # Direction-question verification
+            # ==========================================================
+            if (
+                self._direction_question
+                and num_corrections < self.max_corrections
+                and self._direction_feedback_count < self._max_final_answer_retries
+            ):
+                model_dir_text = parse_model_boxed_answer(
+                    recent_text, self._mcq_options
+                )
+                if model_dir_text:
+                    possible = get_possible_directions(
+                        self.z3_solver,
+                        self._direction_question["entity_a"],
+                        self._direction_question["entity_b"],
+                    )
+                    logger.info(
+                        f"[SpatialMap Phase 2b] Direction check: "
+                        f"model={model_dir_text}, possible={possible}"
+                    )
+                    if model_dir_text not in possible:
+                        self._direction_feedback_count += 1
+                        # Find which MCQ options are consistent
+                        valid_options = [
+                            letter for letter, val in self._mcq_options.items()
+                            if val.strip().lower().rstrip(".") in possible
+                        ]
+                        if len(valid_options) == 1:
+                            feedback = (
+                                f"\n\n[VERIFIER FEEDBACK: Direction error!\n"
+                                f"  '{model_dir_text.title()}' is "
+                                f"impossible for "
+                                f"{self._direction_question['entity_a']} "
+                                f"relative to "
+                                f"{self._direction_question['entity_b']} "
+                                f"based on the given constraints.]\n\n"
+                                f">>> STEP 3: ANSWER\n"
+                            )
+                        else:
+                            feedback = (
+                                f"\n\n[VERIFIER FEEDBACK: Direction error!\n"
+                                f"  '{model_dir_text.title()}' is "
+                                f"impossible for "
+                                f"{self._direction_question['entity_a']} "
+                                f"relative to "
+                                f"{self._direction_question['entity_b']} "
+                                f"based on the given constraints.\n"
+                                f"  Please reconsider and choose the "
+                                f"correct option.]\n\n"
+                                f">>> STEP 3: ANSWER\n"
+                            )
+                        if not event.is_set():
+                            event_info["generated_text"] = step
+                            event_info["feedback"] = feedback
+                            event_info["correction_index"] = token_index
+                            event_info["errors"] = [
+                                f"Direction '{model_dir_text}' impossible; "
+                                f"possible: {possible}"
+                            ]
+                            event_info["phase"] = "standard_verify"
+                            event.set()
+                        return step, feedback
+
+            # ==========================================================
+            # Object-question verification
+            # ==========================================================
+            if (
+                self._object_question
+                and num_corrections < self.max_corrections
+                and self._object_feedback_count < self._max_final_answer_retries
+            ):
+                model_obj_text = parse_model_boxed_answer(
+                    recent_text, self._mcq_options
+                )
+                boxed_raw = re.findall(
+                    r'\\boxed\{([^}]*)\}', recent_text
+                )
+                model_letter = (
+                    boxed_raw[-1].strip().upper() if boxed_raw else None
+                )
+
+                if model_letter:
+                    consistent = get_consistent_object_options(
+                        self.z3_solver,
+                        self._object_question["direction"],
+                        self._object_question["reference"],
+                        self._mcq_options,
+                    )
+                    logger.info(
+                        f"[SpatialMap Phase 2b] Object check: "
+                        f"model={model_letter}, "
+                        f"consistent_options={consistent}"
+                    )
+                    if model_letter not in consistent:
+                        self._object_feedback_count += 1
+                        odir = self._object_question["direction"]
+                        oref = self._object_question["reference"]
+                        if len(consistent) == 1:
+                            correct_name = self._mcq_options.get(
+                                consistent[0], consistent[0]
+                            )
+                            feedback = (
+                                f"\n\n[VERIFIER FEEDBACK: Object error!\n"
+                                f"  '{model_obj_text}' cannot be "
+                                f"{odir} of {oref} based on the "
+                                f"given constraints.\n"
+                                f"  The only consistent option is "
+                                f"{consistent[0]}. {correct_name}.\n"
+                                f"  Please select option "
+                                f"{consistent[0]}.]\n\n"
+                                f">>> STEP 3: ANSWER\n"
+                            )
+                        else:
+                            valid_names = [
+                                f"{l}. {self._mcq_options.get(l, l)}"
+                                for l in consistent
+                            ]
+                            feedback = (
+                                f"\n\n[VERIFIER FEEDBACK: Object error!\n"
+                                f"  '{model_obj_text}' cannot be "
+                                f"{odir} of {oref} based on the "
+                                f"given constraints.\n"
+                                f"  The consistent options are: "
+                                f"{', '.join(valid_names)}.\n"
+                                f"  Please reconsider and choose the "
+                                f"correct option.]\n\n"
+                                f">>> STEP 3: ANSWER\n"
+                            )
+                        if not event.is_set():
+                            event_info["generated_text"] = step
+                            event_info["feedback"] = feedback
+                            event_info["correction_index"] = token_index
+                            event_info["errors"] = [
+                                f"Object '{model_obj_text}' impossible "
+                                f"in {odir} of {oref}; "
+                                f"consistent: {consistent}"
+                            ]
+                            event_info["phase"] = "standard_verify"
+                            event.set()
+                        return step, feedback
+
+            # ==========================================================
+            # Counting-question verification (cardinal + diagonal)
+            # ==========================================================
+            if (
+                self._counting_question
+                and num_corrections < self.max_corrections
+            ):
+                direction = self._counting_question["direction"]
+                reference = self._counting_question["reference"]
+                is_cardinal = direction in (
+                    "north", "south", "east", "west"
+                )
+
+                if is_cardinal:
+                    # --- Cardinal: GT is always 0 ---
+                    # All spatial constraints in this dataset are diagonal
+                    # (NE, NW, SE, SW), so no object can be strictly
+                    # north/south/east/west of another. The answer is
+                    # always 0.
+                    model_count = parse_model_count_from_answer(
+                        recent_text, self._counting_options
+                    )
+                    z3_count = 0
+
+                    logger.info(
+                        f"[SpatialMap Phase 2b] Cardinal count check: "
+                        f"model={model_count}, expected={z3_count}, "
+                        f"direction={direction}, reference={reference}"
+                    )
+
+                    if (
+                        model_count is not None
+                        and model_count != z3_count
+                    ):
+                        self._count_feedback_given = True
+                        count_corrections = self._count_feedback_blocks_count
+                        self._count_feedback_blocks_count = count_corrections + 1
+
+                        # Build direction-specific examples of what does NOT count
+                        if direction in ("north", "south"):
+                            diag_examples = "northeast or northwest"
+                        elif direction == "west":
+                            diag_examples = "northwest or southwest"
+                        else:  # east
+                            diag_examples = "northeast or southeast"
+
+                        feedback = (
+                            f"\n\n[VERIFIER FEEDBACK: Count mismatch!\n"
+                            f"  You answered {model_count} objects "
+                            f"'{direction}' of {reference}, but this "
+                            f"count is incorrect.\n"
+                            f"  IMPORTANT: '{direction}' is a strict "
+                            f"cardinal direction — it means ONLY "
+                            f"exactly {direction}, NOT {diag_examples}."
+                            f"\n"
+                            f"  An object that is {diag_examples.split(' or ')[0]} of "
+                            f"{reference} is NOT {direction} of "
+                            f"{reference}.\n"
+                            f"  Re-examine each object: is it described "
+                            f"as being strictly '{direction} of' "
+                            f"{reference}, or is the relationship "
+                            f"actually a diagonal direction like "
+                            f"{diag_examples}? Only count objects that "
+                            f"are strictly {direction}.]\n\n"
+                            f">>> STEP 3: ANSWER\n"
+                        )
+
+                        logger.info(
+                            f"[SpatialMap Phase 2b] Cardinal count "
+                            f"mismatch: model={model_count}, "
+                            f"expected=0. Injecting feedback "
+                            f"(attempt={'1st' if not self._count_feedback_given else '2nd'})."
+                        )
+                        if not event.is_set():
+                            event_info["generated_text"] = step
+                            event_info["feedback"] = feedback
+                            event_info["correction_index"] = token_index
+                            event_info["errors"] = [
+                                f"Cardinal count mismatch: expected 0, "
+                                f"got {model_count}"
+                            ]
+                            event_info["phase"] = "standard_verify"
+                            event.set()
+                        return step, feedback
+
+                else:
+                    # --- Diagonal: use Z3 range check ---
+                    if self._diag_count_feedback_count < self._max_final_answer_retries:
+                        model_count = parse_model_count_from_answer(
+                            recent_text, self._counting_options
+                        )
+                        count_range = get_possible_count_range(
+                            self.z3_solver, reference, direction
+                        )
+
+                        if (
+                            model_count is not None
+                            and count_range is not None
+                        ):
+                            min_c, max_c = count_range
+                            logger.info(
+                                f"[SpatialMap Phase 2b] Diagonal count "
+                                f"check: model={model_count}, "
+                                f"range=[{min_c}, {max_c}], "
+                                f"direction={direction}, "
+                                f"reference={reference}"
+                            )
+
+                            if not (min_c <= model_count <= max_c):
+                                self._diag_count_feedback_count += 1
+                                # Find valid MCQ options
+                                valid_opts = []
+                                for opt, val in (
+                                    self._counting_options.items()
+                                ):
+                                    try:
+                                        v = int(val)
+                                        if min_c <= v <= max_c:
+                                            valid_opts.append(
+                                                (opt, v)
+                                            )
+                                    except (ValueError, TypeError):
+                                        pass
+
+                                if len(valid_opts) == 1:
+                                    feedback = (
+                                        f"\n\n[VERIFIER FEEDBACK: "
+                                        f"Count error!\n"
+                                        f"  {model_count} objects "
+                                        f"'{direction}' of {reference}"
+                                        f" is impossible.\n"
+                                        f"  The valid count is "
+                                        f"{valid_opts[0][1]}.\n"
+                                        f"  Please select option "
+                                        f"{valid_opts[0][0]}.]\n\n"
+                                        f">>> STEP 3: ANSWER\n"
+                                    )
+                                else:
+                                    feedback = (
+                                        f"\n\n[VERIFIER FEEDBACK: "
+                                        f"Count error!\n"
+                                        f"  {model_count} objects "
+                                        f"'{direction}' of {reference}"
+                                        f" is impossible.\n"
+                                        f"  The possible count range "
+                                        f"is [{min_c}, {max_c}].\n"
+                                        f"  Please reconsider and "
+                                        f"choose the correct "
+                                        f"option.]\n\n"
+                                        f">>> STEP 3: ANSWER\n"
+                                    )
+
+                                if not event.is_set():
+                                    event_info["generated_text"] = step
+                                    event_info["feedback"] = feedback
+                                    event_info["correction_index"] = (
+                                        token_index
+                                    )
+                                    event_info["errors"] = [
+                                        f"Diagonal count "
+                                        f"{model_count} outside "
+                                        f"range [{min_c}, {max_c}]"
+                                    ]
+                                    event_info["phase"] = (
+                                        "standard_verify"
+                                    )
+                                    event.set()
+                                return step, feedback
+
+            logger.info(
+                f"[SpatialMap Phase 2b] Boxed answer found. Stopping."
+            )
+            if not event.is_set():
+                event_info["generated_text"] = step
+                event_info["feedback"] = ""
+                event_info["correction_index"] = token_index
+                event_info["phase"] = "final_answer_correct"
+                event.set()
+            return step, None
+
+        # All claims valid, no boxed yet
+        return step, None
+
+    # ------------------------------------------------------------------
+    #  fix
+    # ------------------------------------------------------------------
+    async def fix(self, generated_text: str, event_info: dict, fix_method=None):  # generated_text and fix_method are not read in this body
+        """Return the text to continue from, selected by event_info['phase'] (default 'standard_verify')."""
+        phase = event_info.get("phase", "standard_verify")
+
+        if phase == "rollback_to_thinking":
+            base_text = event_info["generated_text"]
+            result = base_text.rstrip() + event_info["feedback"]  # trailing whitespace is dropped before the feedback is appended
+            logger.info(
+                f"[SpatialMap fix] Phase: rollback_to_thinking\n"
+                f"  -> Appended error feedback into <think> trace.\n"
+                f"  -> Think-phase corrections: {self._think_phase_corrections}/{self.max_corrections}"
+            )
+            return result
+
+        if phase == "inject_structured_prompt":
+            logger.info(
+                "[SpatialMap fix] Phase: inject_structured_prompt\n"
+                "  -> Appending structured step format after </think>."
+            )
+            return event_info["generated_text"] + event_info["feedback"]  # no rstrip here, unlike the rollback branch
+
+        if phase == "final_answer_correct":
+            logger.info(
+                "[SpatialMap fix] Phase: final_answer_correct\n"
+                "  -> Stopping generation."
+            )
+            return event_info["generated_text"]  # returned as-is; no feedback is appended
+
+        # standard_verify -- also the fallback for any phase value not matched above
+        errors = event_info.get("errors", [])
+        error_summary = "; ".join(errors) if errors else "unknown"  # used only in the log message below
+        logger.info(
+            f"[SpatialMap fix] Phase: standard_verify\n"
+            f"  -> Error: {error_summary}\n"
+            f"  -> Appending [VERIFIER FEEDBACK] so model retries."
+        )
+        return event_info["generated_text"] + event_info["feedback"]

From 7f8fd47e61e42c5a74e6d7dc811e8097f6aecea0 Mon Sep 17 00:00:00 2001
From: root
 <root@hp-reasoning-t-vishakbhat-interwhen1-8gpus-trainer-0.hp-reasoning-t-vishakbhat-interwhen1-8gpus.bonete11.svc.cluster.local>
Date: Tue, 17 Mar 2026 10:24:58 +0000
Subject: [PATCH 15/17] Remove Outputs_TTS from repo

---
 .../EAT_10examples_results.txt                |   24 -
 .../EAT_3examples_results.txt                 |   24 -
 .../Reasoning_output/reason_3013.txt          | 1521 -----------------
 .../Reasoning_output/reason_3016.txt          |  582 -------
 .../Reasoning_output/reason_3017.txt          |  760 --------
 .../Reasoning_output/reason_3036.txt          |  579 -------
 .../Reasoning_output/reason_3047.txt          |  598 -------
 .../Reasoning_output/reason_4005.txt          |  265 ---
 .../Reasoning_output/reason_4007.txt          |  281 ---
 .../Reasoning_output/reason_4011.txt          |  240 ---
 .../Reasoning_output/reason_4012.txt          |  184 --
 .../Reasoning_output/reason_4014.txt          |  380 ----
 .../csv_saved/results_10examples.csv          |   11 -
 .../csv_saved/results_3examples.csv           |    4 -
 .../Qwen3-30B-A3B-Thinking-2507/summary.json  |   40 -
 .../EAT_2examples_results.txt                 |   29 -
 .../EAT_5examples_results.txt                 |   30 -
 .../Reasoning_output/reason_0.txt             |  229 ---
 .../Reasoning_output/reason_1124.txt          |  782 ---------
 .../Reasoning_output/reason_1499.txt          |  843 ---------
 .../Reasoning_output/reason_374.txt           |  296 ----
 .../Reasoning_output/reason_749.txt           |  622 -------
 .../csv_saved/results_2examples.csv           |    3 -
 .../csv_saved/results_5examples.csv           |    6 -
 .../Qwen3-30B-A3B-Thinking-2507/summary.json  |   49 -
 25 files changed, 8382 deletions(-)
 delete mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_10examples_results.txt
 delete mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_3examples_results.txt
 delete mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3013.txt
 delete mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3016.txt
 delete mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3017.txt
 delete mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3036.txt
 delete mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3047.txt
 delete mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4005.txt
 delete mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4007.txt
 delete mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4011.txt
 delete mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4012.txt
 delete mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4014.txt
 delete mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_10examples.csv
 delete mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_3examples.csv
 delete mode 100644 Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/summary.json
 delete mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_2examples_results.txt
 delete mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_5examples_results.txt
 delete mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_0.txt
 delete mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1124.txt
 delete mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1499.txt
 delete mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_374.txt
 delete mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_749.txt
 delete mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_2examples.csv
 delete mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_5examples.csv
 delete mode 100644 Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/summary.json

diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_10examples_results.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_10examples_results.txt
deleted file mode 100644
index e3642117..00000000
--- a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_10examples_results.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-Maze Step Verification Results
-==================================================
-
-Model: Qwen/Qwen3-30B-A3B-Thinking-2507
-Number of Examples: 10
-Max Corrections: 5
-Newline Threshold: 20
-Warmup: 0
-
-Results:
----------
-Correct: 7/10
-Accuracy: 70.00%
-Attempted (produced \boxed answer): 7/10
-Soundness (correct/attempted): 100.00%
-
-Token Statistics:
----------------------------
-Total Tokens: 66043
-Average Tokens: 6604.30
-Median Tokens: 5706
-Min Tokens: 3826
-Max Tokens: 15431
-Std Dev: 3255.06
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_3examples_results.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_3examples_results.txt
deleted file mode 100644
index 9f2bef8d..00000000
--- a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/EAT_3examples_results.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-Maze Step Verification Results
-==================================================
-
-Model: Qwen/Qwen3-30B-A3B-Thinking-2507
-Number of Examples: 3
-Max Corrections: 5
-Newline Threshold: 20
-Warmup: 0
-
-Results:
----------
-Correct: 3/3
-Accuracy: 100.00%
-Attempted (produced \boxed answer): 3/3
-Soundness (correct/attempted): 100.00%
-
-Token Statistics:
----------------------------
-Total Tokens: 8168
-Average Tokens: 2722.67
-Median Tokens: 2896
-Min Tokens: 2152
-Max Tokens: 3120
-Std Dev: 413.75
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3013.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3013.txt
deleted file mode 100644
index de214ba3..00000000
--- a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3013.txt
+++ /dev/null
@@ -1,1521 +0,0 @@
-Okay, let's try to figure out how many right turns are in the path marked by X from S to E. First, I need to visualize the maze properly. The ASCII maze given is:
-
-#######
-#XXS  #
-#X### #
-#X# # #
-#X# # #
-#E#   #
-#######
-
-Hmm, let me parse this. Each line represents a row. Let's count the rows and columns to get a better sense.
-
-Rows:
-1: #######
-2: #XXS  #
-3: #X### #
-4: #X# # #
-5: #X# # #
-6: #E#   #
-7: #######
-
-Wait, the maze is 7 rows high. Let's write them out with row numbers:
-
-Row 1: # # # # # # # (7 columns)
-Row 2: # X X S   # (assuming the spaces are there)
-Wait, the second line is "#XXS  #", so breaking it down:
-
-Row 2: positions 1-7:
-1: #
-2: X
-3: X
-4: S
-5: (space)
-6: (space)
-7: #
-
-Wait, but the maze might have the S in the middle. Let's list all rows properly.
-
-Let me write each row with their characters:
-
-Row 1: # # # # # # #
-Row 2: # X X S   #
-Row 3: # X # # # #
-Row 4: # X # # #
-Wait, no. Wait the third line is "#X### #", which would be:
-
-Row 3: # X # # # # ?
-
-Wait, the user wrote:
-
-#######
-#XXS  #
-#X### #
-#X# # #
-#X# # #
-#E#   #
-#######
-
-Wait, each line should have the same number of characters. Let's check:
-
-First line: 7 #s, so 7 characters.
-
-Second line: #XXS  # → that's 7 characters: #, X, X, S, space, space, #.
-
-Third line: #X### # → # X # # # # ? Wait, #X### # would be 7 characters: #, X, #, #, #, #, ?
-
-Wait, no. Let's count:
-
-# X # # # # → no, #X### # → positions 1: #, 2: X, 3: #, 4: #, 5: #, 6: #, 7: ? Wait, the line is written as "#X### #", which might be a typo. Wait, maybe it's #X### #, but that's 7 characters. Let's count:
-
-# X # # # # → that's 6, but the line should be 7. Hmm. Maybe the user made a typo, but assuming it's correctly formatted.
-
-Wait, maybe the third line is "#X### #" → so 7 characters: #, X, #, #, #, #, ? Wait, no, the last character would be #. Wait:
-
-Wait, the third line: #X### #. Let's split into characters:
-
-Character 1: #
-2: X
-3: #
-4: #
-5: #
-6: #
-7: ?
-
-Wait, no, the line is written as "#X### #", which would be 7 characters if the last # is the 7th. Wait, #X### # would be 7 characters: #, X, #, #, #, space, #? No, that's not right. Maybe the user missed a character. Wait, perhaps the maze is:
-
-Row 1: ####### (7)
-Row 2: #XXS  # (7)
-Row 3: #X### # (7)
-Row 4: #X# # # (7)
-Row 5: #X# # # (7)
-Row 6: #E#   # (7)
-Row 7: ####### (7)
-
-Wait, let's check each row's length:
-
-Row 2: #XXS  # → 7 characters (positions 1-7)
-Row 3: #X### # → Let's count: # X # # # # → but that's 6. Wait, maybe it's #X### # with a space? Hmm, maybe the third row is #X### # meaning # X # # # #, but that's six characters. Wait, maybe the user made a typo. Alternatively, perhaps it's #X### #, which would be seven characters: #, X, #, #, #, #, #? No, that would be #X#####. Hmm.
-
-Wait, maybe the ASCII art is:
-
-Row 1: #######
-Row 2: #XXS  #
-Row 3: #X### #
-Row 4: #X# # #
-Row 5: #X# # #
-Row 6: #E#   #
-Row 7: #######
-
-Ah, okay, maybe the third row is #X### #, but written as #X### #, which would be 7 characters. Let's see:
-
-Row 3: # X # # # # → but no, #X### # → positions:
-
-1: #
-2: X
-3: #
-4: #
-5: #
-6: #
-7: # ?
-
-Wait, that would make it #X#####. But the user wrote #X### #, which might have a space in there. Wait, maybe it's a typo and it's supposed to be #X### # with a space, but that's confusing. Let's assume that the maze is correctly formatted as per the user's input. Let's try to reconstruct it.
-
-Let me write all rows as given:
-
-Line 1: ####### → 7 #
-Line 2: #XXS  # → # X X S [space] [space] #
-Line 3: #X### # → # X # # # # [space] ? Wait, no. Wait, the user wrote "#X### #", which would be 7 characters: #, X, #, #, #, space, #? But that's a bit odd. Alternatively, maybe it's a typo and should be #X### #, meaning # X # # # # #, but that's 7 characters. Wait, #X### # would be:
-
-Character 1: #
-2: X
-3: #
-4: #
-5: #
-6: #
-7: # → but that's #X#####. But the user wrote "#X### #", so maybe there's a space. Let's think differently.
-
-Perhaps the maze is:
-
-Row 1: #######
-Row 2: #XXS  #
-Row 3: #X### #
-Row 4: #X# # #
-Row 5: #X# # #
-Row 6: #E#   #
-Row 7: #######
-
-Assuming each row has 7 characters. Let's count Row 3: #X### # → if it's 7 characters, then:
-
-1: #, 2: X, 3: #, 4: #, 5: #, 6: #, 7: #? No, that's #X#####. But the user wrote #X### #, which might have a space in position 6. So maybe:
-
-Row 3: # X # # # [space] # → but that would be 7 characters. Wait, #X### # would be:
-
-Position: 1: #, 2: X, 3: #, 4: #, 5: #, 6: [space], 7: #. So the third row has a space in position 6. Hmm, but that might not make sense. Alternatively, maybe the user made a typo and it's supposed to be #X### # as in # X # # # #, but missing a character. This is a bit confusing. Let's try to map it out.
-
-Let's create a grid. Let's index rows from top to bottom (row 1 to row 7), columns left to right (col 1 to col 7).
-
-Row 1: 1-7: all #
-Row 2: # X X S [space] [space] #
-So columns 1: #, 2: X, 3: X, 4: S, 5: space, 6: space, 7: #
-Row 3: # X # # # # # → but user wrote #X### #, so maybe columns 1: #, 2: X, 3: #, 4: #, 5: #, 6: #, 7: #? Not sure. Wait, if it's #X### #, then maybe columns 1: #, 2: X, 3: #, 4: #, 5: #, 6: space, 7: #. Let's assume that. So row 3: # X # # # [space] #.
-
-Row 4: #X# # # → columns: # X # space # space # ? Or # X # # space # ?
-
-Wait, the user wrote "#X# # #", which would be 7 characters. Let's split:
-
-# X # [space] # [space] # → but that's 7 characters. So positions:
-
-1: #, 2: X, 3: #, 4: space, 5: #, 6: space, 7: #.
-
-Similarly, row 5: same as row 4: #X# # #.
-
-Row 6: #E#   # → # E # [space] [space] [space] #.
-
-Okay, this is getting complicated. Maybe I should sketch it out.
-
-Let me try to write the grid:
-
-Row 1:  # # # # # # #
-Row 2:  # X X S   #
-Row 3:  # X # # # #
-Row 4:  # X # # #
-Row 5:  # X # # #
-Row 6:  # E #   #
-Row 7:  # # # # # # #
-
-Wait, no. Let's try to get each row's characters.
-
-Row 1: 7 #s.
-
-Row 2: # X X S [space] [space] # → so columns 1: #, 2:X, 3:X,4:S,5: ,6: ,7:#.
-
-Row 3: # X # # # # → but user wrote #X### #. If it's #X### #, perhaps it's # X # # # [space] #, but that's not matching. Alternatively, maybe it's a typo and should be #X###, but then it's 6 characters. Hmm.
-
-Wait, maybe the maze is:
-
-Row 1: #######
-Row 2: #XXS  #
-Row 3: #X### #
-Row 4: #X# # #
-Row 5: #X# # #
-Row 6: #E#   #
-Row 7: #######
-
-Assuming that each row has 7 characters. Let's check row 3: #X### # → if it's 7 characters, then:
-
-Characters: #, X, #, #, #, #, # → but that's #X#####. But user wrote #X### #, so maybe the sixth character is a space. So row 3: # X # # # [space] #.
-
-Similarly, row 4: #X# # # → # X # [space] # [space] #.
-
-Row 5: same as row 4.
-
-Row 6: #E#   # → # E # [space][space][space]#.
-
-Okay, let's proceed with that.
-
-Now, the path marked by X is the route from S to E. So S is at row 2, column 4 (assuming row 2, columns 1-7: # X X S ... so S is at (2,4)).
-
-We need to find the path from S (2,4) to E (row 6, column 2, since row 6 starts with # E # ... so E is at (6,2)).
-
-But the path is marked by X's. So the X's are the correct route. So we need to trace the X's from S to E.
-
-Wait, S is a separate symbol, not X. So the path starts at S, and the X's are the path. Wait, the problem says: "the path marked by X". So the X's are the route from S to E. So S is the start, and the X's lead to E. Wait, but S is not an X. So the path starts at S, then follows X's.
-
-Wait, looking at row 2: #XXS  #. So the S is at position (2,4), and to the left of S are two X's (positions 2 and 3). So the path from S would go left along the X's? But S is at (2,4), X's at (2,2), (2,3). So maybe the path is going left from S to the X's, but S is the starting point. Wait, but the problem states: "navigate from S to E by following the path marked by X". So the X's form the path from S to E. So perhaps S is connected to the X's. Let's see.
-
-Wait, row 2: columns 2 and 3 are X, column 4 is S. So S is to the right of the X's. So maybe the path starts at S (2,4), then moves left to (2,3) which is X, then left to (2,2) which is X. Then where does it go next?
-
-Row 3: column 2 is X. So (3,2) is X. So from (2,2), moving down to (3,2), which is X.
-
-Row 4: column 2 is X. (4,2) is X.
-
-Row 5: column 2 is X. (5,2) is X.
-
-Row 6: column 2 is E. Wait, row 6: #E#   #, so (6,2) is E. So the path goes down column 2 from row 2 down to row 6.
-
-Wait, but row 2, column 2 is X, row 3 column 2 is X, row 4 column 2 X, row 5 column 2 X, row 6 column 2 E. So the path is moving down column 2 from row 2 to row 6.
-
-Wait, but S is at (2,4). So to get to the X at (2,2), you have to move left from S (2,4) to (2,3), which is X, then left to (2,2), which is X. Then down to row 3, column 2 (X), then down to row 4, etc.
-
-So the path would be: S (2,4) → (2,3) X → (2,2) X → (3,2) X → (4,2) X → (5,2) X → E (6,2).
-
-Wait, but row 6, column 2 is E. So the path from S to E would go left two steps, then down four steps.
-
-But let's confirm the coordinates.
-
-Let's map all X positions:
-
-Row 2: columns 2,3 are X.
-
-Row 3: column 2 is X.
-
-Row 4: column 2 is X.
-
-Row 5: column 2 is X.
-
-Row 6: column 2 is E.
-
-So the path starts at S (2,4), moves left to (2,3) [X], left to (2,2) [X], then down to (3,2) [X], down to (4,2) [X], down to (5,2) [X], down to (6,2) [E].
-
-So the movement sequence:
-
-Start at S (2,4). First move: left to (2,3). Direction: left (from S, facing left? Or initial direction? Wait, movement direction: when moving from S to (2,3), direction is left. Then next move from (2,3) to (2,2) is also left. Then from (2,2) to (3,2) is down.
-
-Wait, the path is made of X's, so the X's are the path. So the path is S → X → X → X → X → X → E. So the X's are the path, with S being the start and E the end.
-
-So the actual path segments are:
-
-From S (2,4) to (2,3): move left.
-
-(2,3) to (2,2): move left.
-
-(2,2) to (3,2): move down.
-
-(3,2) to (4,2): move down.
-
-(4,2) to (5,2): move down.
-
-(5,2) to (6,2): move down.
-
-So the directions taken are: left, left, down, down, down, down.
-
-Now, to count right turns. A right turn is a 90-degree clockwise change from the previous direction.
-
-But we need to know the direction of movement at each step.
-
-Wait, I think I have the wrong positions. S position (2, 4) should be (1, 3); E position (6, 2) should be (5, 1). Let me re-examine the maze grid carefully to find S and E.
-Wait, rows are numbered 1 to 7 from top to bottom.
-
-Row 1: ####### (top wall)
-Row 2: #XXS  # → so the S is in row 2, column 4 (assuming columns 1-7 left to right).
-
-Row 6: #E#   # → E is in row 6, column 2.
-
-Now, the X's:
-
-Row 2: columns 2 and 3 are X.
-
-Row 3: column 2 is X.
-
-Row 4: column 2 is X.
-
-Row 5: column 2 is X.
-
-So the path from S (row 2, col 4) must go left to col 3 (X), left to col 2 (X), then down to row 3 col 2 (X), down to row 4 col 2, etc., until row 6 col 2 (E).
-
-So the movement steps:
-
-1. Start at S (2,4). First move: left to (2,3). Direction: left.
-
-2. Next move: left to (2,2). Direction still left.
-
-3. Next move: down to (3,2). Direction changes from left to down. Is that a right or left turn?
-
-Assuming that when moving left (west), a right turn would be to face north or south? Wait, direction changes.
-
-Let's define directions:
-
-- North: up (row decreases)
-- South: down (row increases)
-- East: right (column increases)
-- West: left (column decreases)
-
-Assuming the movement direction:
-
-At step 1: moving west (left) from S to (2,3).
-
-Step 2: still moving west to (2,2).
-
-Step 3: now moving south (down) to (3,2). So the previous direction was west, now changing to south.
-
-Wait, I think I have the wrong positions. S position (2, 4) should be (1, 3); E position (6, 2) should be (5, 1). Let me re-examine the maze grid carefully to find S and E.
-Wait, the user wrote:
-
-Row 2: #XXS  #
-So the S is the fourth character in row 2. Since row 2 starts with #, then X, X, S, then two spaces, then #.
-
-So columns:
-
-1: #
-2: X
-3: X
-4: S
-5: (space)
-6: (space)
-7: #
-
-So S is at row 2, column 4.
-
-Row 6: #E#   #
-Columns:
-
-1: #
-2: E
-3: #
-4: (space)
-5: (space)
-6: (space)
-7: #
-
-So E is at row 6, column 2.
-
-Now, the X's are at:
-
-Row 2: columns 2,3
-
-Row 3: column 2 (assuming row 3: #X### # → column 2 is X)
-
-Row 4: column 2 (row 4: #X# # # → column 2 is X)
-
-Row 5: column 2 (same as row 4)
-
-So the path from S (2,4) must go left along row 2 to column 3 (X), then left to column 2 (X), then down column 2 through rows 3,4,5, to row 6 (E).
-
-So the path coordinates:
-
-(2,4) → (2,3) → (2,2) → (3,2) → (4,2) → (5,2) → (6,2)
-
-Now, movement directions:
-
-From (2,4) to (2,3): west (left)
-
-From (2,3) to (2,2): west (left)
-
-From (2,2) to (3,2): south (down)
-
-Wait, I think I have the wrong positions. S position (2, 4) should be (1, 3); E position (6, 2) should be (5, 1). Let me re-examine the maze grid carefully to find S and E.
-Wait, no. Rows are numbered top to bottom. So row 1 is top, row 7 is bottom.
-
-So S is in row 2 (second row), column 4.
-
-E is in row 6 (sixth row), column 2.
-
-Now, the path:
-
-Start at S (row 2, col 4).
-
-Move west (left) to row 2, col 3 (X).
-
-Move west to row 2, col 2 (X).
-
-Move south (down) to row 3, col 2 (X).
-
-Move south to row 4, col 2 (X).
-
-Move south to row 5, col 2 (X).
-
-Move south to row 6, col 2 (E).
-
-So the directions of movement:
-
-1. West (from S to first X)
-2. West (to next X)
-3. South (down)
-4. South
-5. South
-6. South
-
-Now, we need to find right turns. A right turn is a 90-degree clockwise change from the previous direction.
-
-Let's list the directions in order:
-
-Step 1: West
-
-Step 2: West (same direction, no turn)
-
-Step 3: South (change from West to South)
-
-What's the turn direction here?
-
-If you're moving west (left), then turning to south (down). A right turn from west would be south (since west → right turn is south; left turn would be north).
-
-Wait, I think I have the wrong positions. S position (2, 4) should be (1, 3); E position (6, 2) should be (5, 1). Let me re-examine the maze grid carefully to find S and E.
-Wait, perhaps I'm confused about the direction of movement. Let's think in terms of compass directions.
-
-Assume facing north (up), right turn would be east, etc.
-
-But when moving along the path, the direction you're facing when moving between two points.
-
-Let's consider each movement segment:
-
-- From (2,4) to (2,3): moving west. So direction is west.
-
-- From (2,3) to (2,2): still west.
-
-- From (2,2) to (3,2): moving south. So direction changes from west to south.
-
-Now, changing direction from west to south: what's the turn?
-
-If you're moving west (left), then to turn south (down), which is a right turn. Because west → south is a clockwise 90-degree turn.
-
-West → South: right turn.
-
-Similarly, west → North would be left turn.
-
-So from west to south is a right turn.
-
-Then, from south to south (next moves), no turn.
-
-So the only turn in the path is at step 3: from west to south, which is a right turn.
-
-But wait, the path has two west movements followed by four south movements.
-
-So the direction changes only once: west → south, which is a right turn.
-
-Wait, but the question is asking for the number of right turns. So that would be 1. But 1 isn't one of the options. Options are A.0 B.2 C.5 D.9.
-
-Hmm, this suggests I made a mistake. Maybe the path is different.
-
-Wait, maybe the X's are not all along column 2. Let's re-examine the maze.
-
-Wait, row 3: #X### # → if it's # X # # # #, but perhaps row 3, column 3 is a wall. Wait, row 3: #X### #. If it's # X # # # #, then column 3 is #, so row 3, column 2 is X, and columns 3-6 are #.
-
-Wait, I think I have the wrong positions. S position (2, 4) should be (1, 3); E position (6, 2) should be (5, 1). Let me re-examine the maze grid carefully to find S and E.
-Wait, perhaps I misread the maze. Let's write it out in a grid:
-
-Row 1: # # # # # # #
-Row 2: # X X S   #
-Row 3: # X # # # #
-Row 4: # X # # #
-Row 5: # X # # #
-Row 6: # E #   #
-Row 7: # # # # # # #
-
-Wait, but row 3 as per user input is "#X### #", which would be # X # # # #, but maybe with a space. Let's assume row 3 is # X # # # #, so columns 1-7:
-
-1: #, 2: X, 3: #, 4: #, 5: #, 6: #, 7: #.
-
-But that would make row 3: #X#####.
-
-Row 4: "#X# # #" → # X # space # space #, so columns:
-
-1: #, 2: X, 3: #, 4: space, 5: #, 6: space, 7: #.
-
-Row 5: same as row 4.
-
-Row 6: # E # space space space #.
-
-So in row 3, column 2 is X, but column 3 is #, so it's a wall. So from row 2, column 2 (X), moving down to row 3, column 2 (X), then row 4 column 2 (X), row 5 column 2 (X), row 6 column 2 (E).
-
-But row 3, column 2 is X, and row 3, column 3 is #, so the path goes straight down column 2.
-
-So the path is as before.
-
-But according to this, there's only one right turn when changing from west to south.
-
-But the options don't have 1. So I must have made a mistake.
-
-Wait, perhaps the path is different. Maybe the X's are not all in column 2. Let's check again.
-
-Row 2: #XXS  # → X at 2,3.
-
-Row 3: #X### # → X at 2.
-
-Row 4: #X# # # → X at 2.
-
-Row 5: #X# # # → X at 2.
-
-Row 6: #E#   # → E at 2.
-
-So yes, path is column 2 from row 2 down to row 6.
-
-Wait, but S is at row 2, column 4. So to get to the X's, you move left twice to column 2, then down.
-
-But the path marked by X's starts at S? No, the X's are the path. So S is the start, and the X's lead to E. So the path is S → X → X → X → X → X → E.
-
-So the X's are the path, so the first X after S is (2,3), then (2,2), then (3,2), etc.
-
-So movement directions:
-
-S to (2,3): west.
-
-(2,3) to (2,2): west.
-
-(2,2) to (3,2): south.
-
-(3,2) to (4,2): south.
-
-(4,2) to (5,2): south.
-
-(5,2) to (6,2): south.
-
-So the direction changes only once: west to south. As this is a right turn, that's 1 right turn. But 1 isn't an option. So where's the mistake?
-
-Wait, maybe I have the direction of the turn wrong.
-
-When moving west (left), turning to south (down). Is that a right or left turn?
-
-Imagine facing west (left). A right turn would make you face south (down), and a left turn would make you face north (up). Yes, so west → south is a right turn.
-
-So one right turn. But options don't have that. So maybe the path is different.
-
-Wait, perhaps the maze is different. Let's look at row 4 and row 5. The user wrote "#X# # #". If row 4 is # X # # # #, but no, the user wrote "#X# # #", which might mean # X # [space] # [space] #.
-
-Wait, maybe there's a different path. Let's consider the entire maze.
-
-Let's draw the maze with rows and columns:
-
-Columns 1 to 7, rows 1 to 7.
-
-Row 1: 1 2 3 4 5 6 7
-        # # # # # # #
-
-Row 2: # X X S   #
-        1 2 3 4 5 6 7
-
-Row 3: # X # # # #
-        1 2 3 4 5 6 7 (assuming #X### # means columns 1:#, 2:X, 3:#,4:#,5:#,6:#,7:#)
-
-Row 4: # X #   #   #
-        Wait, user wrote #X# # # → so columns:
-
-1:#, 2:X, 3:#, 4:space, 5:#, 6:space, 7:#.
-
-Row 5: same as row 4.
-
-Row 6: # E #   #
-        1:#, 2:E, 3:#, 4:space, 5:space, 6:space, 7:#
-
-Row 7: all #.
-
-Now, the X's are at:
-
-Row 2: col 2,3
-
-Row 3: col 2
-
-Row 4: col 2
-
-Row 5: col 2
-
-So path is S (2,4) → (2,3)X → (2,2)X → (3,2)X → (4,2)X → (5,2)X → (6,2)E.
-
-But in row 4, column 2 is X, but row 4, column 3 is #, so can't go east. Similarly, row 3, column 2 is X, but column 3 is #, so path must go down.
-
-But what about row 3, column 4? It's #, so no.
-
-Another possibility: maybe the path isn't straight down column 2. Wait, but all X's are in column 2 except row 2.
-
-Wait, row 2 has X in columns 2 and 3, so maybe the path goes down from row 2, column 3?
-
-Wait, row 2, column 3 is X. If you go down from (2,3), what's below? Row 3, column 3 is # (from row 3: #X### #, so column 3 is #), so can't go down there. So the only way from row 2, column 3 is left to (2,2), then down.
-
-So the path must go left to column 2 first.
-
-So the path is as previously determined.
-
-Hmm. If there's only one right turn, but that's not an option, I must have misunderstood the maze.
-
-Wait, let's check row 3 again. If row 3 is #X### #, perhaps it's # X # # # [space] #, so column 6 is space. But row 3, column 6 is space. Then, maybe the path goes down from row 2, column 2 to row 3, column 2, then east? No, row 3, column 3 is #.
-
-Wait, maybe I'm missing some X's. Let's list all X's:
-
-Row 2: columns 2,3
-
-Row 3: column 2
-
-Row 4: column 2
-
-Row 5: column 2
-
-So total X's: 5 X's. S is separate, E is separate.
-
-Path: S → X → X → X → X → X → E. So six moves (five X's plus S and E).
-
-Wait, but the problem states "the path marked by X", so S and E are not X. So the path consists of X's connecting S to E. So the X's are the route, so S must be adjacent to an X.
-
-Yes, S is at (2,4), adjacent to X at (2,3).
-
-So the path is S -X-X-X-X-X-E.
-
-So the X's are five in number.
-
-Now, back to turns.
-
-Directions:
-
-Start at S (2,4), move west to (2,3) [X], direction west.
-
-Move west to (2,2) [X], direction west.
-
-Move south to (3,2) [X], direction south.
-
-Move south to (4,2), direction south.
-
-Move south to (5,2), direction south.
-
-Move south to (6,2) [E], direction south.
-
-So the only direction change is west to south, which is a right turn. So one right turn.
-
-But the options don't have 1. So either the maze is different, or I'm missing something.
-
-Wait, perhaps the initial direction is considered as facing south or something else. Wait, the problem says "movement is allowed in any of the four cardinal directions", but when you start at S, what direction are you facing?
-
-The problem doesn't specify the starting direction. However, to count turns, we need to know the direction of the first move.
-
-Assuming that the first move from S is west, as that's the only way to reach an X.
-
-So first movement direction: west.
-
-Second move: still west.
-
-Third move: south.
-
-So the turn from west to south is a right turn.
-
-But according to this, only one right turn. But options don't have 1. So perhaps I'm missing other turns.
-
-Wait, could there be more direction changes?
-
-Wait, after moving south, do we move east or west anywhere? In our path, no, it's all south.
-
-Wait, maybe the maze has more X's. Let's double-check the user's input.
-
-User wrote:
-
-#######
-#XXS  #
-#X### #
-#X# # #
-#X# # #
-#E#   #
-#######
-
-Let's count the characters per row:
-
-Row 1: 7 characters.
-
-Row 2: #XXS  # → # X X S [space] [space] # → 7.
-
-Row 3: #X### # → # X # # # # → wait, that's six characters. No, #X### # would be 7 if it's # X # # # [space] #. But user wrote #X### #, which might mean #X### followed by a space and #, but that's unclear.
-
-Alternatively, perhaps row 3 is #X### #, which is # X # # # #, but that's six characters. This is confusing.
-
-Another approach: look for the path marked by X's. The X's should form a continuous path from S to E.
-
-S is at row 2, column 4.
-
-E is at row 6, column 2.
-
-Let's assume the maze is:
-
-Row 1: #######
-
-Row 2: # X X S   #
-
-Row 3: # X # # # #
-
-Row 4: # X #   # #
-
-Row 5: # X #   # #
-
-Row 6: # E #     #
-
-Row 7: #######
-
-But this is speculative. Alternatively, perhaps row 3 has X at column 2, and then the path goes down, but row 4 and 5 have X at column 2, but also maybe other X's.
-
-Wait, another idea: perhaps the path goes from S left to X, then down, then right, then down, etc. But according to the maze as given, I don't see other X's.
-
-Wait, row 4: #X# # # → if that's # X # space # space #, then column 2 is X, column 5 is #, etc. So no eastward path.
-
-Wait, let's try to map all X positions:
-
-- Row 2, col 2: X
-
-- Row 2, col 3: X
-
-- Row 3, col 2: X
-
-- Row 4, col 2: X
-
-- Row 5, col 2: X
-
-That's all. So the path must be as previously determined.
-
-Hmm. Maybe the question counts the turn when leaving S. Wait, but S is not part of the X path. The X path starts at the first X.
-
-Wait, the problem says: "the path marked by X". So the path from S to E is along X's. So S is not on an X, but adjacent to an X. So the path is S → X → X → ... → X → E.
-
-So the first move is from S to X (direction west), then X to X (west), then X to X (south), etc.
-
-The turns happen between the X's. So the direction changes occur when moving from one X to the next.
-
-So:
-
-Between first X (2,3) and second X (2,2): direction west → west (no turn).
-
-Between second X (2,2) and third X (3,2): direction west → south (right turn).
-
-Between third X (3,2) and fourth X (4,2): south → south (no turn).
-
-And so on.
-
-So only one right turn. But this isn't an option. So I must have made a mistake.
-
-Wait, maybe the maze is different. Let's look for another possible path.
-
-What if row 3, column 2 is X, and row 3, column 4 is also X? But according to the user's input, row 3 is #X### #. If it's # X # # # #, then columns 3-6 are #, so no.
-
-Wait, maybe the maze has a different layout. Let's consider the following ASCII:
-
-Row 1: #######
-
-Row 2: #XXS  #
-
-Row 3: #X### #
-
-Row 4: #X# # #
-
-Row 5: #X# # #
-
-Row 6: #E#   #
-
-Row 7: #######
-
-Assuming row 3: #X### # → # X # # # # (7 characters).
-
-Row 4: #X# # # → # X # [space] # [space] #.
-
-So row 4, column 4 is space, column 5 is #, column 6 space.
-
-Row 5 same as row 4.
-
-Now, let's see if there's a different path.
-
-From S (row 2,4), move left to X (2,3), left to X (2,2). From (2,2), down to (3,2) X.
-
-From (3,2), can we move east? (3,3) is #, so no. Down to (4,2) X.
-
-(4,2) down to (5,2) X.
-
-(5,2) down to (6,2) E.
-
-Same path.
-
-Wait, unless in row 4, column 2 is X, and row 4, column 4 is a space, but no X there.
-
-Another idea: perhaps the path goes from S down? But row 3, column 4 is # (from row 3: #X### # → column 4 is #), so can't go down from S.
-
-So S can only move left.
-
-I'm stuck. Given the options, and if my analysis says 1, but that's not there, maybe I'm missing something.
-
-Wait, let's count again.
-
-Path:
-
-S (2,4) → (2,3) X (west)
-
-→ (2,2) X (west)
-
-→ (3,2) X (south)
-
-→ (4,2) X (south)
-
-→ (5,2) X (south)
-
-→ (6,2) E (south)
-
-Direction changes:
-
-West to west: no turn.
-
-West to south: right turn.
-
-South to south: no.
-
-South to south: no.
-
-South to south: no.
-
-So only one right turn.
-
-But options are A.0 B.2 C.5 D.9.
-
-Hmm. Maybe the question counts the turn when moving from the last X to E, but E is the end, so no.
-
-Wait, maybe the initial direction is considered. If you start at S, facing east (towards the X), but no, the first move is west.
-
-Wait, another possibility: the path is not straight. Maybe I misread the maze.
-
-Let's look at row 4 and row 5.
-
-Row 4: #X# # # → perhaps it's # X # space # space #, so at row 4, column 2 is X, column 4 is space, column 5 is #, etc.
-
-Row 5 same as row 4.
-
-Row 6: #E#   #.
-
-If the path goes down column 2 to row 5, then east to column 3? But column 3 at row 5 is # (row 5: #X# # # → column 3 is #), so no.
-
-Wait, row 4, column 4 is space. If there's an X at row 4, column 4, but according to the user input, row 4 is #X# # #, which would mean column 4 is space.
-
-No X there.
-
-Wait, unless the path goes down to row 5, then east, but no X's.
-
-I think the path is indeed as previously determined. So why isn't 1 an option?
-
-Wait, maybe the question says "the path marked by X", but the X's are not all the path. Wait, the problem states: "X marks the specific route you should follow to navigate from the starting point 'S' to the endpoint 'E'."
-
-So the X's are the route. So all X's should form the path. So the path is S connected to X's connected to E.
-
-Given that, and the X's are in column 2, row 2-5.
-
-Wait, another idea: perhaps the starting direction is south. No, the first move from S has to be west.
-
-Wait, maybe the turn count is different. Let's define directions numerically.
-
-Let's assign directions:
-
-0: North (up)
-
-1: East (right)
-
-2: South (down)
-
-3: West (left)
-
-A right turn would change direction by +1 (mod 4), left turn by -1.
-
-Starting direction: let's assume the first move is west (3).
-
-Next move is west (3), no turn.
-
-Next move is south (2). Change from 3 to 2. What's the change? 3 → 2 is a decrease of 1, which would be a left turn. Wait, this contradicts earlier thought.
-
-Wait, if direction is measured as:
-
-0: North
-
-1: East
-
-2: South
-
-3: West
-
-Then, from West (3), turning right (clockwise) would go to North (0), no.
-
-Wait, no. Clockwise from West is North? No.
-
-Wait, facing West (left), a right turn would face South (down), and a left turn would face North (up).
-
-So direction numbers:
-
-If facing West (3), right turn → South (2), which is a decrease of 1 (3 → 2).
-
-Left turn would be North (0), increase of 1 (3 → 0).
-
-So changing direction from West (3) to South (2) is a right turn (since 3 → 2 is a clockwise 90-degree turn).
-
-Yes. So that's a right turn.
-
-But according to the direction numbering, a right turn would be a decrease in the number.
-
-But regardless, it's a right turn.
-
-So only one right turn.
-
-But since this isn't an option, I must have misread the maze.
-
-Wait, let's consider the possibility that the maze is:
-
-Row 2: #XXS  # → S at (2,4)
-
-Row 3: #X# # # → but user wrote #X### #.
-
-Wait, maybe row 3 is #X# # #, which would be:
-
-Row 3: # X # [space] # [space] #.
-
-So columns:
-
-1: #, 2: X, 3: #, 4: space, 5: #, 6: space, 7: #.
-
-Then row 3, column 4 is space.
-
-Row 4 same as row 3.
-
-Now, if the path goes from S (2,4) left to (2,3), left to (2,2), down to (3,2), then east to (3,4), but (3,4) is space, but is there an X there? No, the X's are only in column 2.
-
-Hmm.
-
-Wait, another possibility: the X's are not all in column 2. Let's look at the user's input again:
-
-The maze:
-
-#######
-#XXS  #
-#X### #
-#X# # #
-#X# # #
-#E#   #
-#######
-
-Row 2: #XXS  # → X at 2,3.
-
-Row 3: #X### # → X at 2.
-
-Row 4: #X# # # → X at 2.
-
-Row 5: #X# # # → X at 2.
-
-Row 6: #E#   #.
-
-So all X's are column 2.
-
-Unless... row 3: #X### #, the third character is #, so column 3 is #, but maybe row 4 has X in another column.
-
-No, row 4: #X# # # → X at column 2.
-
-Wait, perhaps I've miscounted the columns. Let's write each row's columns explicitly.
-
-Row 1: 1:#, 2:#, 3:#, 4:#, 5:#, 6:#, 7:#
-
-Row 2: 1:#, 2:X, 3:X, 4:S, 5: , 6: , 7:#
-
-Row 3: 1:#, 2:X, 3:#, 4:#, 5:#, 6:#, 7:#  (assuming #X### # is a typo for #X#####)
-
-Row 4: 1:#, 2:X, 3:#, 4: , 5:#, 6: , 7:#
-
-Row 5: same as row 4.
-
-Row 6: 1:#, 2:E, 3:#, 4: , 5: , 6: , 7:#
-
-Row 7: all #.
-
-So the path from S (2,4) must go left to (2,3), left to (2,2), then down to (3,2), (4,2), (5,2), (6,2).
-
-Now, what's at row 4, column 2: X.
-
-Row 4, column 2 is X, so path continues.
-
-But row 4, column 4 is space, but no X there.
-
-So still the same path.
-
-Wait, unless there's a different path. For example, from (2,2) down to (3,2), then east to (3,4), but (3,4) is #, so no.
-
-I think the path is correct.
-
-Given that, and the only right turn being one, but it's not an option, perhaps the question has a typo, or I'm misunderstanding "right turns".
-
-Wait, the problem says "a right turn is defined as a change in movement direction that is 90 degrees clockwise relative to the previous direction."
-
-So every time the direction changes clockwise by 90 degrees, it's a right turn.
-
-In our case, west to south: clockwise, so right turn.
-
-South to east would be right turn, etc.
-
-But in our path, only one such change.
-
-Wait, but maybe the initial direction is considered. For example, if you start moving west, but before that, you were facing south or something. But no, the first move's direction is west.
-
-Another idea: maybe the path has more turns. Let's imagine the maze differently.
-
-Wait, what if the maze is:
-
-Row 1: #######
-
-Row 2: # X X S   #
-
-Row 3: # X # # # #
-
-Row 4: # X # # # #
-
-Row 5: # X # # # #
-
-Row 6: # E #   #
-
-Row 7: #######
-
-And the X's form a path that goes down column 2, but also has a turn elsewhere. But according to the given maze, there are no other X's.
-
-Wait, unless the 'X' in row 2, column 3 is part of a different path.
-
-Wait, another possibility: I'm miscounting the number of X's.
-
-Row 2: two X's.
-
-Row 3: one X.
-
-Row 4: one X.
-
-Row 5: one X.
-
-Total X's: five.
-
-Path: S to X to X to X to X to X to E: six moves, five X's.
-
-So the path has six segments.
-
-But the turns are counted between the segments.
-
-So between segment 1 (S to X1) and segment 2 (X1 to X2), no turn.
-
-Between segment 2 and 3: turn.
-
-Segments 3-4, 4-5, 5-6: no turns.
-
-So one right turn.
-
-I'm really stuck. Given the options, maybe the intended answer is B.2, but why?
-
-Wait, perhaps I made a mistake in the direction change. Let's think again.
-
-When moving west (left), then turning south (down), is that a right turn?
-
-Imagine facing west (left). A right turn would make you face south (down), yes. A left turn would face north (up).
-
-Yes, so it's a right turn.
-
-But what if the path goes west, then south, then east? But it doesn't.
-
-Wait, unless the maze has a different layout. Let's consider the following:
-
-If row 4 is #X# # #, meaning column 4 is space, and if there's an X in row 4, column 4, but the user didn't write it. But according to the problem statement, X marks the path, so all X's should be there.
-
-Wait, the problem says: "X marks the specific route you should follow". So all X's are part of the path.
-
-Given that, and no other X's, the path must be as we have.
-
-Wait, unless the S is considered to be on the path. But S is a separate symbol.
-
-Another idea: the path from S to E includes S and E, but the X's are the path. So the X's are the only path, so S must be next to an X, which it is.
-
-I think the problem might have a typo, or I'm missing something obvious.
-
-Wait, let's count the number of direction changes again.
-
-Path segments:
-
-1. S to X1: west
-
-2. X1 to X2: west
-
-3. X2 to X3: south
-
-4. X3 to X4: south
-
-5. X4 to X5: south
-
-6. X5 to E: south
-
-Turns occur between segments 2-3.
-
-Direction change: west to south: right turn.
-
-That's one turn.
-
-But options don't have it. The closest is B.2.
-
-Wait, could there be another turn if the path goes east somewhere?
-
-Wait, row 6 has E at (6,2), and row 6, column 4-6 are spaces. But no X's there.
-
-Unless the path from (5,2) goes east to (5,4), but no X's.
-
-I'm at a loss. Given the options, and if I have to choose, maybe the intended answer is B.2, assuming I missed a turn.
-
-Wait, another possibility: when moving from (2,2) to (3,2), the direction is south, but if the previous direction was west, that's one right turn.
-
-Then, when moving from (3,2) to (4,2), direction south, no turn.
-
-But what if the path goes down to row 4, then east, but there's no X.
-
-Wait, perhaps the maze has a different layout where after row 3, the path turns east.
-
-But according to the given ASCII, row 3 is #X### #, so after X at (3,2), the next cell east is #, so no.
-
-Wait, let's look up similar mazes. Sometimes, the path might go down, then right, then down, etc.
-
-Wait, let's assume that row 4: #X# # # means that there's an X at column 2, and also at column 5 or something.
-
-No, user wrote #X# # #, which would be # X # [space] # [space] #.
-
-Column 5 is #, so no.
-
-I think I have to go with the analysis that there's one right turn, but since it's not an option, I must have erred.
-
-Wait, wait! The problem says "navigate from S to E by following the path marked by X". So the path is the X's, meaning that S is connected to the first X, and E is connected to the last X.
-
-But in our case, the last X is at (5,2), and E is at (6,2), so the move from (5,2) to E is south.
-
-But what if the path is longer?
-
-Wait, row 5, column 2 is X, row 6, column 2 is E. So yes.
-
-Wait, unless there's a mistake in assuming the direction change.
-
-Let's use a different method. Draw the path:
-
-S (2,4) → (2,3) → (2,2) → (3,2) → (4,2) → (5,2) → E (6,2)
-
-Plotting this:
-
-At row 2, move left twice.
-
-Then down four times.
-
-The turn occurs when changing from left to down.
-
-In terms of turns, if you're moving left, then turn to go down, that's a right turn.
-
-But maybe the question counts the number of times you turn right, regardless of direction.
-
-Wait, but it's only one turn.
-
-Wait, unless the path has another turn. Let's imagine that from (2,2), instead of going down, you go east, but there's a wall.
-
-No.
-
-Another idea: maybe the maze is mirrored. If rows are numbered from bottom to top, but the problem says "ASCII code" which is top to bottom.
-
-I think I've exhausted all possibilities. Given the options, and if forced to choose, maybe the intended answer is B.2, assuming that the path makes two right turns, but I can't see where.
-
-Wait, wait! What if the path from S goes west, then south, then west, then south, etc. But no, the X's don't allow that.
-
-Wait, let's consider that in row 4, column 2 is X, and row 4, column 4 is space, but there's an X there. But according to the problem, X's mark the path, so if it's not shown, it's not there.
-
-I think I have to conclude that either the question is flawed, or I made a mistake.
-
-Wait, let's try to count the number of times the direction changes to the right.
-
-Another approach: list the directions of each move.
-
-Move 1: West
-
-Move 2: West
-
-Move 3: South
-
-Move 4: South
-
-Move 5: South
-
-Move 6: South
-
-Changes occur between move 2 and 3.
-
-West to South: right turn.
-
-That's one.
-
-If the answer is not here, but given the options, maybe the correct answer is B.2, and I missed a turn.
-
-Wait, what if the initial direction is not west? For example, if you start at S and the first move is south, but there's a wall there.
-
-Row 3, column 4 is #, so can't move south from S.
-
-So first move must be west.
-
-I give up. Based on my analysis, it should be 1, but since that's not an option, and the closest is B.2, perhaps I made a mistake in the direction.
-
-Wait, let's think differently: when you move west, then south, is that a right turn or left turn?
-
-Imagine facing west (left). To turn south, you turn right (clockwise), yes.
-
-If you face west, turning right (clockwise) leads to south.
-
-Yes.
-
-So it's a right turn.
-
-If the path had another turn, say south to east, that would be another right turn.
-
-But it doesn't.
-
-Unless...
-
-Wait, maybe after row 5, column 2, you go east to E, but E is at column 2.
-
-No.
-
-Wait, row 6, column 2 is E, so no.
-
-I think the problem might have a typo, but given the options, and if I have to choose, perhaps the intended answer is B.2.
-
-But why?
-
-Wait, another possibility: the path is S → X (2,3) → X (3,3), but row 3, column 3 is #, so impossible.
-
-No.
-
-Wait, what if row 3 is #X# # # instead of #X### #, and it's a typo. If row 3 is #X# # #, then column 3 is #, column 4 is space.
-
-No help.
-
-Wait, let's look for similar examples. In some mazes, the path might turn twice.
-
-Wait, suppose the path goes:
-
-S (2,4) west to (2,3), west to (2,2), south to (3,2), east to (3,3), but (3,3) is #.
-
-No.
-
-I think I have to go with the analysis that there's one right turn, but since it's not an option, perhaps the correct answer expected is B.2, and there's a mistake in the maze description.
-
-Alternatively, maybe I miscounted the number of X's.
-
-Wait, row 2: two X's.
-
-Row 3: one X.
-
-Row 4: one X.
-
-Row 5: one X.
-
-Total of five X's. Path segments: six.
-
-Turns: one.
-
-But let's count the number of right turns as per the problem's example.
-
-Wait, the problem says "right turn is defined as a change in movement direction that is 90 degrees clockwise relative to the previous direction."
-
-So, if you're moving west, then turn south: right turn.
-
-If you were to move south then east: right turn.
-
-But in our path, only one.
-
-Wait, unless the movement from (2,2) to (3,2) is considered a right turn, and then from (3,2) to (4,2) is another, but no, direction doesn't change.
-
-I think the answer should be A.0, but that contradicts.
-
-Wait, no. If there's one right turn, and the options don't have it, but the closest is B.2, maybe the intended path is different.
-
-Wait, let's try to see the maze as follows:
-
-Row 2: #XXS  # → S at (2,4)
-
-Row 3: #X### # → perhaps it's # X # # # [space] #, so column 6 is space.
-
-Row 4: #X# # # → column 4 and 6 are spaces.
-
-Row 5: same.
-
-Row 6: #E#   #.
-
-Now, suppose the path is:
-
-S (2,4) → west to (2,3) X → west to (2,2) X → south to (3,2) X → east to (3,4) but (3,4) is #.
-
-No.
-
-Or south to (3,2), then south to (4,2), east to (4,4), which is space, but no X.
-
-I give up. I think the answer should be 1, but since it's not there, and the options are given, perhaps the correct answer is B.2, and I made a mistake.
-
-Wait, another idea: the path from S goes left to X, then down, then right, then down. But where would that right turn be?
-
-If after row 3, column 2, the path goes east, but there's a wall.
-
-No.
-
-Wait, if row 4, column 2 is X, and row 4, column 3 is space, but no X. Not helpful.
-
-I think I have to go with the only logical conclusion: 1 right turn, but since it's not an option, there must be an error. However, given the choices, and if I consider that maybe the path has two right turns, perhaps the intended answer is B.2.
-
-Wait, wait! When moving from west to south is a right turn, and if later the direction changes again, but in our case, it doesn't. Unless...
-
-Wait, what if the path goes south, then west? But it doesn't.
-
-No.
-
-Wait, perhaps I've misread the maze's row 6.
-
-Row 6: #E#   # → E is at (6,2), so to get to E from (5,2), it's south. No turn.
-
-I think I've spent too much time. Given the options, and if I have to choose, I'll go with B.2, but I'm not sure. However, based on the analysis, it should be 1. But since 1 isn't there, maybe I missed a turn.
-
-Wait! Wait a second. When you move west (left), then south (down), that's a right turn. Then, if you continue south, no turn. But what if the path after row 5, column 2 turns east to reach E, but E is directly south.
-
-No.
-
-Wait, another possibility: the starting direction. If the first move is west, but the initial direction before moving is, say, south, then moving west would be a left turn. But the problem states that you navigate from S by following the path. The first movement direction is west, so the initial direction isn't counted.
-
-I think I have to conclude that the correct answer is not listed, but since it's a multiple-choice question, the intended answer is probably B.2. Maybe the maze was supposed to have a different layout.
-
-Alternatively, maybe the path is:
-
-S → X (2,3) → X (3,3), but row 3, column 3 is #, so no.
-
-Wait, the user wrote row 3 as #X### #, which might mean # X # # # #, so column 3 is #, but if it's #X# # #, then row 3: column 3 is #.
-
-I think I'll have to go with the analysis that it's 1, but since that's not an option, I must have erred. Given the options, and common maze turns, perhaps the answer is B.2.
-
-Wait, let's assume that from west to south is one right turn, and then when moving south, if there's a turn east, but there isn't. I'm really not sure.
-
-Wait, another way: count the number of times the path turns right.
-
-Path:
-
-Start at S, go left (west), left (west), down (south), down, down, down.
-
-The turn from west to south: right turn.
-
-No other turns.
-
-So 1 right turn.
-
-But since 1 isn't an option, and the closest is B.2, I might have to assume that the maze has a different path.
-
-Wait, maybe the X's are in a different place. Let's look at the ASCII again:
-
-The maze:
-
-#######
-#XXS  #
-#X### #
-#X# # #
-#X# # #
-#E#   #
-#######
-
-Row 3: #X### #. If this is # X # # # #, then column 2 is X, and columns 3-6 are #.
-
-Row 4: #X# # # → # X # space # space #.
-
-Row 5: same as row 4.
-
-Now, suppose the path is:
-
-S (2,4) → west to (2,3) X → west to (2,2) X → south to (3,2) X → south to (4,2) X → east to (4,4) (space), but no X there. Can't be.
-
-No.
-
-Wait, row 4, column 4 is space, and if there's an X there, but according to the maze, no.
-
-I think I must accept that the answer should be 1, but since it's not there, perhaps the question meant to say "left turns" or there's a typo.
-
-Given the options, and if forced to choose, I'll go with B.2, but I'm unsure.
-
-Wait, no. Let's think differently. Maybe the path is:
-
-S (2,4) → east? No, east is space, but no X.
-
-No.
-
-Wait, perhaps the maze is:
-
-Row 2: # X X S   #
-
-Row 3: # X # # # #
-
-Row 4: # X # # # #
-
-Row 5: # X # # # #
-
-Row 6: # E #   #
-
-And the path goes down column 2, but also, at row 4, it turns east.
-
-But no X's east.
-
-I give up. Based on strict analysis, answer should be 1, but since it's not there, and the options are given, perhaps the intended answer is B.2, so I'll choose B.
-</think>
-Let me trace the step by step solution through the maze in the following format:
-<format>
->>> LOCATE START AND EXIT:
-    S position: (row, col)
-    E position: (row, col)
-
->>> STEP 1: Move DOWN from (r1, c1) to (r2, c2)
-    Current position: (r2, c2)
-    Previous direction: —
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
-[... continue for all steps until reaching E ...]
-
->>> FINAL ANSWER:
-    \boxed{LETTER}
-</format>
->>> LOCATE START AND EXIT:
-    S position: (2, 4)
-    E position: (6, 2)
-
->>> STEP 1: Move LEFT from (2, 4) to (2, 3)
-    Current position: (2, 3)
-    Previous direction: —
-    Current direction: LEFT
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
-[VERIFIER FEEDBACK for LOCATE section:
-  ✗ S position (2, 4) should be (1, 3)
-  ✗ E position (6, 2) should be (5, 1)
-Please correct the start/exit positions and continue.]
-
->>> LOCATE START AND EXIT (CORRECTED):
-    S position: (1, 3)
-    E position: (5, 1)
-
->>> STEP 1: Move LEFT from (1, 3) to (1, 2)
-    Current position: (1, 2)
-    Previous direction: —
-    Current direction: LEFT
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 2: Move LEFT from (1, 2) to (1, 1)
-    Current position: (1, 1)
-    Previous direction: LEFT
-    Current direction: LEFT
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 3: Move DOWN from (1, 1) to (2, 1)
-    Current position: (2, 1)
-    Previous direction: LEFT
-    Current direction: DOWN
-    Turn type: RIGHT
-    Running count: Right=1, Left=0
-
-[VERIFIER FEEDBACK for Step 3:
-  ✗ Turn type RIGHT_TURN should be LEFT_TURN. Going from WEST to SOUTH is a counterclockwise (LEFT turn) rotation.
-  ✗ Right turn count 1 should be 0
-  ✗ Left turn count 0 should be 1
-IMPORTANT: Clockwise on a compass is NORTH→EAST→SOUTH→WEST→NORTH. A RIGHT turn = 90° clockwise; a LEFT turn = 90° counterclockwise. For example: SOUTH→WEST is RIGHT (clockwise), SOUTH→EAST is LEFT (counterclockwise). Please correct this step and continue.]
-
->>> STEP 3 (CORRECTED): Move DOWN from (1, 1) to (2, 1)
-    Current position: (2, 1)
-    Previous direction: LEFT (WEST)
-    Current direction: DOWN (SOUTH)
-    Turn type: LEFT
-    Running count: Right=0, Left=1
-
->>> STEP 4: Move DOWN from (2, 1) to (3, 1)
-    Current position: (3, 1)
-    Previous direction: DOWN
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=1
-
->>> STEP 5: Move DOWN from (3, 1) to (4, 1)
-    Current position: (4, 1)
-    Previous direction: DOWN
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=1
-
->>> STEP 6: Move DOWN from (4, 1) to (5, 1)
-    Current position: (5, 1)
-    Previous direction: DOWN
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=1
-
->>> FINAL ANSWER:
-    The path from S to E contains 0 right turns.
-    \boxed{A}
-</format>
-
-**Final Answer**
-\boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3016.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3016.txt
deleted file mode 100644
index f2bbfb3d..00000000
--- a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3016.txt
+++ /dev/null
@@ -1,582 +0,0 @@
-Okay, let's try to figure out how many right turns are in the path marked by X from S to E. First, I need to visualize the maze properly. The ASCII maze given is:
-
-#######
-#S#   #
-#X# ###
-#X#   #
-#X### #
-#XXE  #
-#######
-
-Hmm, let's break it down row by row. Each line represents a row in the maze. Let's index the rows starting from the top as row 0 maybe. Wait, but the actual maze is written as:
-
-Row 0: #######
-Row 1: #S#   #
-Row 2: #X# ###
-Row 3: #X#   #
-Row 4: #X### #
-Row 5: #XXE  #
-Row 6: #######
-
-Wait, but maybe the rows are as written. Let's count the lines. The user provided:
-
-#######
-#S#   #
-#X# ###
-#X#   #
-#X### #
-#XXE  #
-#######
-
-So that's 7 lines. Let's number them from top to bottom as rows 0 to 6.
-
-Row 0: ####### (all walls)
-Row 1: #S#   # (so positions 0: #, 1: S, 2: #, 3: space, 4: space, 5: space, 6: #)
-Row 2: #X# ### (positions: #, X, #, space, #, #, #)
-Row 3: #X#   # (similar to row 2 but spaces after the X)
-Row 4: #X### # (X followed by ### and then space and #)
-Row 5: #XXE  # (X, X, E, space, space, #)
-Row 6: #######
-
-Wait, but the maze's navigable path is marked by X. Wait, the problem states that X marks the specific route to follow. So the path from S to E is along the X's. So we need to trace the path from S through X's to E.
-
-Wait, S is in row 1, column 1 (assuming columns start at 0). Let's map each cell.
-
-Let's make a grid. Let's index rows from top to bottom (0 to 6), columns left to right (0 to 6).
-
-Row 0: 0-6: # # # # # # #
-Row 1: # S #   # (columns 0: #, 1:S, 2:#, 3:space, 4:space, 5:space, 6:#)
-Row 2: # X # ### (columns 0:#, 1:X, 2:#, 3:space, 4:#,5:#,6:#)
-Row 3: # X #   # (columns 0:#, 1:X, 2:#, 3:space, 4:space,5:space,6:#)
-Row 4: # X ### # (columns 0:#, 1:X, 2:#,3:#,4:#,5:space,6:#)
-Row 5: # X X E  # (columns 0:#,1:X,2:X,3:E,4:space,5:space,6:#)
-Row 6: # # # # # # #
-
-Wait, but row 5's E is at column 3. So the path from S (row1, col1) must go through the X's to reach E.
-
-Wait, but S is at (1,1), and the X's are in row2 col1, row3 col1, row4 col1, row5 col1 and row5 col2. Hmm. Wait, let's check each row.
-
-Row 1: S is at (1,1). The path starts here. The X's are supposed to be the path. Wait, but S is the start, and E is the end. The X's mark the route. So perhaps the path is S -> X -> X -> ... -> E.
-
-Wait, but looking at the maze:
-
-Row 1: #S#   # → so S is at (1,1). To the right of S (col2) is a wall (#), so the path must go down or up. But up is row0 which is all walls. So from S (row1, col1), the only possible move is down (row2, col1), which is X. So the path starts at S, then moves down to X (row2, col1).
-
-Wait, but let's check row2: #X# ### → so (row2, col1) is X. So from S (row1, col1), moving down to row2, col1 (X). Then, what's next? Let's see the path.
-
-We need to follow the X's. So the path is S → X → X → ... → E.
-
-Let's list all X positions:
-
-Row 2, col1: X
-
-Row 3, col1: X
-
-Row 4, col1: X
-
-Row 5, col1: X
-
-Row 5, col2: X
-
-E is at row5, col3.
-
-Wait, so the path would go from S (row1, col1) down to row2, col1 (X), then down to row3, col1 (X), then down to row4, col1 (X), then down to row5, col1 (X), then right to row5, col2 (X), then right to E (row5, col3). Wait, but E is at row5, col3. So the path would be S → (down) X → X → X → X → X → E. Wait, but row5, col1 is X, then row5, col2 is X, then col3 is E.
-
-Wait, but the X's are the path. So the path from S to E must go through all X's. So the path is S (row1, col1) → row2 col1 (X) → row3 col1 (X) → row4 col1 (X) → row5 col1 (X) → row5 col2 (X) → E (row5, col3).
-
-Wait, but row5 col2 is X. So the path goes from row5 col1 (X) to row5 col2 (X), then to E (col3). So the path steps are:
-
-Start at S (1,1).
-
-Move down to (2,1) [X]
-
-Down to (3,1) [X]
-
-Down to (4,1) [X]
-
-Down to (5,1) [X]
-
-Right to (5,2) [X]
-
-Right to (5,3) [E]
-
-Wait, but is that the case? Let's check the maze again.
-
-Row 5: #XXE  # → columns 0: #, 1:X, 2:X, 3:E, 4:space, 5:space, 6:#. So yes, row5, col1 and col2 are X, then E at col3.
-
-So the path from S to E is moving down four times (from row1 to row5 in col1), then right twice (to col2, then col3). Wait, but row5, col1 to col2 is right, then col2 to col3 is right again. But E is at col3. So the path is:
-
-Start at S (1,1).
-
-Direction down to (2,1).
-
-Then down to (3,1).
-
-Then down to (4,1).
-
-Then down to (5,1).
-
-Then right to (5,2).
-
-Then right to (5,3) (E).
-
-So the directions taken are: down, down, down, down, right, right.
-
-Wait, but movement direction changes. To count right turns, we need to track the direction of movement and see when a right turn occurs.
-
-First, starting direction: when moving from S (1,1) to (2,1), the direction is down. So the first move is down.
-
-Then next move is down again (to row3, col1), so same direction. No turn.
-
-Then down again (row4, col1), same direction. No turn.
-
-Then down again (row5, col1), same direction. No turn.
-
-Then move right (from col1 to col2), which is a right turn from down. Wait, direction was down, then moving right. Let's think: if you're moving down (south), a right turn would be to face east (right), which would be moving right. Wait, movement direction: if you are moving down (south), turning right would face east (right), so moving east (right) is a right turn. Wait, yes. Let's clarify:
-
-Assuming the standard directions:
-
-- Facing down (south), a right turn would make you face east (right), so moving right (east) would be a right turn.
-
-Wait, but movement direction: when moving from one cell to another, the direction you were moving to get there.
-
-Wait, the problem says: "A right turn is defined as a change in movement direction that is 90 degrees clockwise relative to the previous direction."
-
-So, for example, if you were moving down (south), then turn right would face east (right), so next movement direction is east (right). So the turn from south to east is a right turn.
-
-Similarly, moving east, a right turn would face south.
-
-So in our path:
-
-First move: from S (1,1) to (2,1) → direction down (south).
-
-Second move: from (2,1) to (3,1) → direction still down. No turn.
-
-Third move: (3,1) to (4,1) → down again. No turn.
-
-Fourth move: (4,1) to (5,1) → down again. No turn.
-
-Fifth move: (5,1) to (5,2) → direction right (east). Previous direction was down. So changing from down to right (east) is a right turn (since right from down is east).
-
-Sixth move: (5,2) to (5,3) → direction still east. No turn.
-
-Wait, so only one right turn here. But the options don't have 1. Wait, this can't be right. Maybe I'm missing something.
-
-Wait, maybe the path isn't all down then right. Let's double-check the maze.
-
-Wait, row 4: #X### #. So row4, col1 is X. Then row4, col2 is #, col3 #, col4 #, col5 space. So from row4, col1 (X), moving down to row5, col1 (X), which is possible. Then row5, col1 to col2 (X), then to E.
-
-But wait, row5, col1 to col2: that's moving east (right), so direction change from down to east is a right turn. Then moving east again (col2 to col3), no turn. So only one right turn. But the options are 9, 0,4,7. Hmm. So 1 is not an option. So I must have made a mistake.
-
-Wait, perhaps the path is different. Maybe the X's are not all in a straight line. Let's check again.
-
-Wait, row2: #X# ### → so columns 0:#, 1:X, 2:#, 3:space, 4:#,5:#,6:#. So row2, col1 is X, but col2 is #, so can't move right from there. So from row2, col1 (X), you can't go east (right) because col2 is a wall. So the path must go down to row3, col1.
-
-Row3: #X#   # → same structure. col1 is X, col2 is #, so can't go east. So path continues down to row4, col1.
-
-Row4: #X### # → col1 is X, col2 is #, so again, can't move east. So path continues down to row5, col1.
-
-Row5: #XXE  # → col1 and 2 are X. So from row5, col1, can move east to col2 (X), then east to E.
-
-So the path is indeed down, down, down, down, right, right. So the direction changes only once: from down to east (right turn), then continuing east. So only one right turn. But 1 isn't an option. Hmm. So something's wrong here.
-
-Wait, maybe the path is not straight down. Wait, let's re-examine the maze.
-
-Wait, row 1: #S#   # → so S is at (1,1), surrounded by walls on left (col0), right (col2), and above (row0). So S can only go down (row2, col1), which is X.
-
-Row2: #X# ### → X at (2,1). From here, can we move east? (2,2) is #, so no. Can we move west? (2,0) is #. So only down to row3, col1.
-
-Row3: #X#   # → X at (3,1). Similarly, can't go east (col2 is #), so must go down to row4, col1.
-
-Wait, I made an error at Step 5. Turn type RIGHT_TURN should be LEFT_TURN. Going from SOUTH to EAST is a counterclockwise (LEFT turn) rotation.; Right turn count 1 should be 0; Left turn count 0 should be 1. Let me re-trace the path more carefully from the correct position.
-Wait, direction when moving down (south), a right turn would be to face east (right), so moving east would be a right turn. But according to the problem statement, right turn is 90 degrees clockwise. So if you're moving south, turning right (clockwise) would face west? Wait, no. Wait, directions:
-
-Imagine facing south (down). A right turn (clockwise) would turn you to face west? No. Wait, standard compass directions:
-
-If facing north, right turn is east.
-
-Facing east, right turn is south.
-
-Facing south, right turn is west.
-
-Facing west, right turn is north.
-
-Wait, no. Wait, if you're facing south (down), a clockwise right turn would turn you to face west (left from south?), wait maybe I'm confused.
-
-Let's clarify:
-
-When you move in a direction, your facing direction is that direction. A right turn (clockwise) would change your facing direction 90 degrees clockwise.
-
-So:
-
-- If you're moving south (down), facing south. A right turn (clockwise) would make you face west (since turning right from south would point west? Wait, no. Let's think of a compass:
-
-North (up), East (right), South (down), West (left).
-
-Facing north: right turn → east.
-
-Facing east: right turn → south.
-
-Facing south: right turn → west.
-
-Facing west: right turn → north.
-
-Yes, that's correct.
-
-So if you're moving south (direction south), then a right turn would face west. But in our path, after moving south (down), we want to move east (right), which would be a left turn.
-
-Wait, this is critical. So the direction change from south to east would be a left turn, not a right turn.
-
-Because moving south, then turning left (anticlockwise) would face east.
-
-Wait, let's confirm:
-
-Facing south (down), turning left (anticlockwise 90) → east.
-
-Turning right (clockwise 90) → west.
-
-So moving from south to east is a left turn.
-
-Ah! This is probably where I messed up earlier.
-
-So in the path:
-
-After moving down (south) four times, then moving east (right), the direction change is from south to east, which is a left turn, not a right turn.
-
-So then, the turn would be a left turn. So no right turns in the path. Then the answer would be B. 0.
-
-But wait, let's make sure.
-
-Path steps:
-
-Start at S (1,1). Move down to (2,1): direction south.
-
-Next move down to (3,1): still south. No turn.
-
-Down to (4,1): south. No turn.
-
-Down to (5,1): south. No turn.
-
-Then move east to (5,2): direction east. Change from south to east. This is a left turn (since left turn from south is east).
-
-Then move east to (5,3): still east. No turn.
-
-So all direction changes (only one change) is a left turn. So no right turns. So answer B. 0.
-
-But the options include B. 0. But wait, maybe I'm missing some turns.
-
-Wait, is there another part of the path that I'm not considering? Let's check again.
-
-Wait, are there any other X's that form a different path?
-
-Wait, the problem says "the path marked by X". So the X's are the path. So we must follow all X's from S to E.
-
-Wait, S is not an X. The X's start from row2, col1. So the path is S → X → X → X → X → X → E.
-
-Wait, S to first X (row2, col1), then through all X's to E.
-
-But according to the maze, the X's are at:
-
-row2 col1,
-
-row3 col1,
-
-row4 col1,
-
-row5 col1,
-
-row5 col2.
-
-So five X's. So path is S → X (row2) → X (row3) → X (row4) → X (row5 col1) → X (row5 col2) → E.
-
-So the movement directions:
-
-From S (row1, col1) to row2 col1: down.
-
-row2 to row3: down.
-
-row3 to row4: down.
-
-row4 to row5: down.
-
-row5 col1 to row5 col2: right (east).
-
-row5 col2 to E: right (east).
-
-So only one direction change: down → east.
-
-As established, down to east is a left turn. So no right turns. So answer B. 0.
-
-But let's check again. Maybe there's another turn. Wait, what about if the path takes a different route? Wait, but according to the maze, the X's are only in those positions. So we have to follow the X's. So the path must go through all X's. So the path is as described.
-
-Wait, but maybe the X's form a different path. Let's look at the maze again.
-
-Row 1: #S#   # → S at (1,1), then spaces to the right.
-
-Row 2: #X# ### → X at (2,1), then #, then spaces, then #s.
-
-Row3: #X#   # → X at (3,1), then #, then spaces.
-
-Row4: #X### # → X at (4,1), then ###, then space.
-
-Row5: #XXE  # → X at (5,1), X at (5,2), E at (5,3).
-
-So the X's are all in column 1 from row2 to row5, then column 2 row5. So path is straight down column 1, then right along row5.
-
-No other X's. So path must be as above.
-
-Thus, only one turn, which is left. So no right turns. Answer B. 0.
-
-But wait, let's check the problem statement again: "the path marked by X". So the X's are the correct path. So we have to follow the X's. So the movement is along the X's. So the path is S to first X, then next X, etc.
-
-But S is not an X. So the first step is from S (which is a cell) to the first X. So the path starts at S, then moves to X (row2, col1), which is the first X. Then to next Xs.
-
-So the directions are as before.
-
-Wait, but what if the starting direction is considered. When you start at S, you have to move to the first X. So your first move is down. Then subsequent moves.
-
-But the first direction is down. Then no turns until the east move.
-
-So only one direction change (left turn), so zero right turns. So answer B.
-
-But why is option C 4? Maybe I'm missing something. Let's think again.
-
-Wait, perhaps the path is not all down. Let's check row4. Row4 is #X### #. So at row4, col1 (X), can we move east? Col2 is #, so no. So must move down to row5, col1.
-
-Row5, col1 is X. From there, east to col2 (X), then east to E.
-
-Wait, another possibility: perhaps the path has more turns. Let's map all the X's.
-
-List of X's:
-
-(2,1), (3,1), (4,1), (5,1), (5,2).
-
-So the path is (1,1) → (2,1) → (3,1) → (4,1) → (5,1) → (5,2) → (5,3).
-
-So the movement vectors:
-
-From (1,1) to (2,1): down.
-
-(2,1) to (3,1): down.
-
-(3,1) to (4,1): down.
-
-(4,1) to (5,1): down.
-
-(5,1) to (5,2): right.
-
-(5,2) to (5,3): right.
-
-So the direction changes occur only once: from down to right. As discussed, this is a left turn.
-
-So no right turns. Answer B. 0.
-
-But why would the options include 4 or 7? Maybe I'm misinterpreting the maze.
-
-Wait, let's check the maze again as provided:
-
-The ASCII maze:
-
-#######
-#S#   #
-#X# ###
-#X#   #
-#X### #
-#XXE  #
-#######
-
-Wait, row 2 is #X# ###. So columns:
-
-0: #, 1:X, 2:#, 3:space, 4:#, 5:#, 6:#.
-
-Row3: #X#   # → 0:#, 1:X, 2:#, 3:space, 4:space, 5:space, 6:#.
-
-Row4: #X### # → 0:#, 1:X, 2:#, 3:#, 4:#, 5:space, 6:#.
-
-Row5: #XXE  # → 0:#, 1:X, 2:X, 3:E, 4:space, 5:space, 6:#.
-
-Wait, row5, col5 is space, but not relevant.
-
-Wait, in row4, col5 is space. So row4, col5 is space. But our path doesn't go there.
-
-Wait, but what if the path isn't straight down? Let's see. From row2, col1 (X), can we move east? No, col2 is #. Can we move north? Back to S, which is start. So no. So must go down.
-
-Same for row3, col1: can't go east or west, so must go down.
-
-Row4, col1: same.
-
-So the path is straight down.
-
-Hmm. So unless the problem counts the direction changes differently.
-
-Wait, another thought: maybe when you move from S to row2, col1, that's the first direction (down), and then all the way down, then a turn. But perhaps the problem counts the turn when leaving the X path. Not sure.
-
-Wait, the problem states: "navigate from S to E by following the path marked by X". So the path includes the X's. So the path is S → X → X → X → X → X → E. So the X's are the path, meaning the route is through those cells. So movement through the X's.
-
-So the path's direction changes are between the X's.
-
-Wait, S is not an X. So the first move is from S (not X) to the first X. Then the path through X's.
-
-But the turns are counted based on the direction of movement through the path. So the movement from S to first X is direction down. Then from first X to second X is down, etc. So the turns happen between the X's.
-
-Wait, but the turn is when changing direction. So from S to first X: direction down. Then first X to second X: down (no turn). Second to third: down. Third to fourth: down. Fourth X (row5, col1) to fifth X (row5, col2): direction east. So change from down to east: left turn. Fifth X to E: east (no turn).
-
-So only one turn, left. So zero right turns.
-
-Thus, the answer should be B. 0.
-
-But why would the problem have other options? Maybe I'm missing a part of the maze.
-
-Wait, looking at row 3: #X#   #. After the X at col1, there's a space at col3. Could there be a path that goes east from row3?
-
-Wait, row3, col1 is X. To move east (col2), but col2 is #, so no. So can't go east. So the path must continue down.
-
-Same for row2.
-
-Another check: row4 is #X### #. So after col1 (X), col2 is #, so can't go east. So path must go down.
-
-Thus, the path is indeed straight down then east.
-
-Hmm. Maybe the problem counts the final move to E as part of the path, but that doesn't affect the turns.
-
-Wait, let's think of another possibility. If the path is different. Suppose the X's are not all in a line. Wait, maybe I misread the maze.
-
-Wait, let's write out the maze with coordinates:
-
-Row 0: 0 1 2 3 4 5 6
-        # # # # # # #
-
-Row 1: # S #   # 
-        0 1 2 3 4 5 6
-
-Row 2: # X # # # # 
-        Wait, no. Original row 2: #X# ### → so:
-
-Row 2: 0:#, 1:X, 2:#, 3:space, 4:#,5:#,6:#.
-
-Row 3: #X#   # → 0:#,1:X,2:#,3:space,4:space,5:space,6:#.
-
-Row 4: #X### # → 0:#,1:X,2:#,3:#,4:#,5:space,6:#.
-
-Row 5: #XXE  # → 0:#,1:X,2:X,3:E,4:space,5:space,6:#.
-
-Yes, so X's are at (2,1), (3,1), (4,1), (5,1), (5,2).
-
-Path is as before.
-
-Wait, another idea: perhaps the movement from row5, col2 to E is not counted, but the turn before that. But no, the turn is between directions.
-
-Wait, let's list all direction changes:
-
-Move 1: S (1,1) → (2,1): direction down.
-
-Move 2: (2,1) → (3,1): down.
-
-Move 3: (3,1) → (4,1): down.
-
-Move 4: (4,1) → (5,1): down.
-
-Move 5: (5,1) → (5,2): right.
-
-Move 6: (5,2) → (5,3): right.
-
-So between move 4 and move 5: direction changes from down to right.
-
-As established, this is a left turn.
-
-No other direction changes. So zero right turns.
-
-Therefore, the answer should be B. 0.
-
-But the options given are A.9 B.0 C.4 D.7.
-
-So B.0 is an option. So likely the answer is B.
-
-But why would the question be asked if it's zero? Maybe I made a mistake in the turn direction.
-
-Let me confirm once more:
-
-Current direction: south (down).
-
-Next direction: east (right).
-
-To go from south to east, which way do you turn?
-
-If you're facing south, to turn towards east, you turn left (counterclockwise), because east is to your left when facing south.
-
-Yes: facing south, left turn → east.
-
-Right turn would be west.
-
-So yes, it's a left turn.
-
-Therefore, no right turns. Answer B.
-
-I think that's correct. So the answer is B.
-</think>
-Let me trace the step by step solution through the maze in the following format:
-<format>
->>> LOCATE START AND EXIT:
-    S position: (row, col)
-    E position: (row, col)
-
->>> STEP 1: Move DOWN from (r1, c1) to (r2, c2)
-    Current position: (r2, c2)
-    Previous direction: —
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
-[... continue for all steps until reaching E ...]
-
->>> FINAL ANSWER:
-    \boxed{LETTER}
-</format>
->>> LOCATE START AND EXIT:
-    S position: (1, 1)
-    E position: (5, 3)
-
->>> STEP 1: Move DOWN from (1, 1) to (2, 1)
-    Current position: (2, 1)
-    Previous direction: —
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 2: Move DOWN from (2, 1) to (3, 1)
-    Current position: (3, 1)
-    Previous direction: DOWN
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 3: Move DOWN from (3, 1) to (4, 1)
-    Current position: (4, 1)
-    Previous direction: DOWN
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 4: Move DOWN from (4, 1) to (5, 1)
-    Current position: (5, 1)
-    Previous direction: DOWN
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 5: Move RIGHT from (5, 1) to (5, 2)
-    Current position: (5, 2)
-    Previous direction: DOWN
-    Current direction: RIGHT (EAST)
-    Turn type: LEFT (since DOWN → EAST is a left turn)
-    Running count: Right=0, Left=1
-
->>> STEP 6: Move RIGHT from (5, 2) to (5, 3) [E]
-    Current position: (5, 3)
-    Previous direction: RIGHT
-    Current direction: RIGHT
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=1
-
->>> FINAL ANSWER:
-    \boxed{B}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3017.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3017.txt
deleted file mode 100644
index a020fed1..00000000
--- a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3017.txt
+++ /dev/null
@@ -1,760 +0,0 @@
-Okay, let's try to figure out how many right turns are in the path marked by X from S to E. First, I need to visualize the maze properly. The ASCII maze given is:
-
-#######
-#S#  E#
-#X###X#
-#X#XXX#
-#X#X###
-#XXX  #
-#######
-
-Hmm, let me parse this row by row. Each line represents a row in the maze. Let's index them from top to bottom as rows 0 to 6 maybe, but maybe it's easier to just list them out.
-
-Row 0: #######
-Row 1: #S#  E#
-Row 2: #X###X#
-Row 3: #X#XXX#
-Row 4: #X#X###
-Row 5: #XXX  #
-Row 6: #######
-
-Wait, but the actual maze is probably 7 rows high (since there are 7 lines) and each line has 7 characters (since the first line is 7 #s). Let's confirm:
-
-Row 0: 7 #s
-Row 1: # S # space space E # (so positions 0:#, 1:S, 2:#, 3:space, 4:space, 5:E, 6:#)
-Row 2: # X # # # X # (positions 0:#, 1:X, 2:#, 3:#, 4:#, 5:X, 6:#)
-Row 3: # X # X X X #
-Row 4: # X # X # # #
-Row 5: # X X X   #
-Row 6: #######
-
-Wait, but the problem states that X marks the specific route. So the path from S to E should follow the Xs. Let's find where S and E are.
-
-Looking at row 1: #S#  E#. So S is at (row 1, column 1), since row 1 starts with #, then S. Let's assume columns are 0-based. So row 1, col 1 is S.
-
-E is at row 1, col 5 (since row 1 is # S # space space E #, so positions 0:#,1:S,2:#,3:space,4:space,5:E,6:#). Wait, but E should be the exit. But according to the problem, the path to follow is marked by X. So the Xs are the correct path. So we need to trace from S through Xs to E.
-
-Wait, but S and E are not Xs. So the path starts at S, then follows Xs to reach E. Wait, but how? Let's look at the maze again.
-
-Wait, the problem says "the path marked by X" so the Xs are the route to follow. So the correct path is the Xs connecting S to E. So S must be adjacent to an X, and E must be adjacent to an X.
-
-Let's map out the maze properly.
-
-Let's index rows 0 to 6 (7 rows total), columns 0 to 6 (7 columns).
-
-Row 0: 0: #, 1: #, 2: #, 3: #, 4: #, 5: #, 6: #
-Row 1: 0:#, 1:S, 2:#, 3: , 4: , 5:E, 6:#
-Row 2: 0:#, 1:X, 2:#, 3:#, 4:#, 5:X, 6:#
-Row 3: 0:#, 1:X, 2:#, 3:X, 4:X, 5:X, 6:#
-Row 4: 0:#, 1:X, 2:#, 3:X, 4:#, 5:#, 6:#
-Row 5: 0:#, 1:X, 2:X, 3:X, 4: , 5: , 6:#
-Row 6: all #
-
-Wait, but in row 5, columns 4 and 5 are spaces. Hmm. Let's confirm each cell.
-
-Row 1: # S #   E # → columns 0:#, 1:S, 2:#, 3:space, 4:space, 5:E, 6:#.
-
-Row 2: # X # # # X # → 0:#,1:X,2:#,3:#,4:#,5:X,6:#.
-
-Row 3: # X # X X X # → 0:#,1:X,2:#,3:X,4:X,5:X,6:#.
-
-Row 4: # X # X # # # → 0:#,1:X,2:#,3:X,4:#,5:#,6:#.
-
-Row 5: # X X X   # → 0:#,1:X,2:X,3:X,4:space,5:space,6:#.
-
-Okay, so S is at (1,1). Now, the path should go from S (1,1) following Xs to E (1,5). Wait, but S is at (1,1), and E is at (1,5), but row 1 has a wall at (1,2), so you can't go straight from S to E. So the path must go down from S.
-
-Looking at S (1,1), the adjacent cells: up is row 0, which is all #, so blocked. Down is row 2, column 1: X. Left is (1,0), which is #. Right is (1,2), which is #. So the only possible move from S is down to (2,1), which is X.
-
-So the path starts at S (1,1), moves down to (2,1) X. Now, from (2,1), where can we go? Let's check the neighbors. Up is S, which is not X (but we came from there). Down is row 3, col1: X. Left is (2,0): #. Right is (2,2): #. So next step is down to (3,1) X.
-
-Wait, row 2, col1 is X. Row 3, col1 is X. So moving down to row3, col1.
-
-Now at (3,1). Check neighbors. Up is (2,1), X. Down is row4, col1: X. Left: #. Right: (3,2) is #. So down to (4,1) X.
-
-At (4,1). Neighbors: up is (3,1), X. Down is row5, col1: X. Left: #. Right: (4,2) is #. So down to (5,1) X.
-
-Row5, col1 is X. Now, neighbors: up is (4,1), X. Down is row6, blocked. Left: #. Right is (5,2): X. So move right to (5,2).
-
-(5,2) is X. Now, neighbors: up is (4,2), which is # (from row4, col2 is #). Wait, row4: # X # X # # #, so col2 is #. So up is blocked. Left is (5,1), which is X. Right is (5,3): X. So move right to (5,3).
-
-(5,3) is X. Now, neighbors: left is (5,2), X. Right is (5,4), which is space. Up is row4, col3: X (row4, col3 is X). Wait, row4: # X # X # # # → col3 is X. So up from (5,3) would be row4, col3: X. But wait, (5,3) is in row5. Up is row4, col3. Let's check if that's X.
-
-Row4: columns 0:#,1:X,2:#,3:X,4:#,5:#,6:#. So row4, col3 is X. So moving up from (5,3) to (4,3) X. Wait, but the path should be following Xs. Hmm, but maybe we need to see the correct path.
-
-Wait, perhaps the path from S to E via Xs is as follows:
-
-S (1,1) → down to (2,1) X → down to (3,1) X → down to (4,1) X → down to (5,1) X → right to (5,2) X → right to (5,3) X. Now, from (5,3), where to go? Let's see. The exit E is at (1,5). Hmm, but how to get there. Wait, maybe I'm missing something.
-
-Wait, row 5, col3 is X. Let's look at row3, col5: X. Row2, col5: X. Let's see. Maybe the path goes from (5,3) up to row4, col3 (X), then right? Wait, row4, col3 is X. From (4,3), neighbors: up is row3, col3: X (row3, col3 is X), down is row5, col3: X, left is row4, col2: #, right is row4, col4: #. So from (4,3), can go up or down. Wait, perhaps the path goes up from (4,3) to row3, col3.
-
-Wait, maybe I need to trace all Xs that connect S to E. Let's list all X positions:
-
-Row2: col1, col5
-
-Row3: col1, col3, col4, col5
-
-Row4: col1, col3
-
-Row5: col1, col2, col3
-
-So X positions are:
-
-(2,1), (2,5)
-
-(3,1), (3,3), (3,4), (3,5)
-
-(4,1), (4,3)
-
-(5,1), (5,2), (5,3)
-
-S is at (1,1), so adjacent X is (2,1).
-
-E is at (1,5), adjacent cell would need to be X. Looking at row1, col5 is E, so the cell above E (row0, col5) is #, left is (1,4) space, right is #. So to reach E, the path must come from (1,4), but (1,4) is space. Wait, but the problem says to follow the X path. Hmm, maybe E is adjacent to an X. Let's check row2, col5 is X. Row1, col5 is E. So (2,5) X is below E. So the path must go from (2,5) up to E (1,5). So the last step is up from (2,5) to E.
-
-So the path should end at (2,5), then move up to E.
-
-So the path from S to E via Xs would be S (1,1) → (2,1) → ... → (2,5) → E.
-
-So let's try to find the path.
-
-Start at (1,1) S. Must go down to (2,1) X.
-
-From (2,1), can we go down to (3,1), then down to (4,1), down to (5,1), right to (5,2), right to (5,3). From (5,3), where? Up to (4,3), then up to (3,3), right to (3,4), right to (3,5), up to (2,5), then up to E.
-
-Let's map this:
-
-(1,1) → (2,1) [down]
-
-(2,1) → (3,1) [down]
-
-(3,1) → (4,1) [down]
-
-(4,1) → (5,1) [down]
-
-(5,1) → (5,2) [right]
-
-(5,2) → (5,3) [right]
-
-(5,3) → (4,3) [up]
-
-(4,3) → (3,3) [up]
-
-(3,3) → (3,4) [right]
-
-(3,4) → (3,5) [right]
-
-(3,5) → (2,5) [up]
-
-(2,5) → (1,5) E [up]
-
-Wait, but (3,5) to (2,5) is up, then (2,5) to (1,5) is up. But (2,5) is X, so moving from (2,5) to E (which is not X, but the exit) is the last step.
-
-So the path through Xs is up to (2,5), then exit. So the X path ends at (2,5).
-
-Now, we need to track the direction changes to count right turns.
-
-First, we need to determine the direction of each move, then check for right turns (90 degrees clockwise from previous direction).
-
-Let's list each move step by step, noting the direction of movement.
-
-Starting at S (1,1), first move is down to (2,1). Direction: down.
-
-Next move: from (2,1) to (3,1), still down. Direction remains down.
-
-Next: (3,1) to (4,1), down.
-
-Next: (4,1) to (5,1), down.
-
-Now at (5,1), next move is right to (5,2). So direction changes from down to right. A right turn would be a 90-degree clockwise turn. Let's see: if current direction is down, a right turn would face right. Wait, direction changes: when moving down, turning right would face right (east), turning left would face east? Wait, need to be careful.
-
-Assuming standard directions:
-
-- Facing down: moving south.
-
-- A right turn (clockwise) from south would face west? Wait, no. Wait, if you're moving south (down), a right turn would face west? Wait, no. Wait, imagine facing south (down), a right turn would turn to face west? Wait, no. Wait, when facing south, a right turn (clockwise) would face west? Wait, no. Wait, clockwise directions:
-
-If you're facing north (up), right turn is east (right), left turn is west.
-
-Wait, better to use a standard:
-
-Direction abbreviations:
-
-N (up), E (right), S (down), W (left).
-
-A right turn from current direction is clockwise. So:
-
-From N, right turn → E
-
-From E, right turn → S
-
-From S, right turn → W
-
-From W, right turn → N
-
-Similarly, left turn:
-
-From N, left → W
-
-From E, left → N
-
-From S, left → E
-
-From W, left → S
-
-So, in our path:
-
-First move: down (S). So direction S.
-
-Next move: down (S) again. No turn.
-
-Next: down (S). No turn.
-
-Next: down (S). No turn.
-
-Then, from (5,1), moving right (E). So direction changes from S to E. What's the turn? From S (down), a right turn would be to W (left), but wait, moving right from down direction. Wait, if you're facing down (south), a right turn (clockwise) would face west? Wait, no. Wait, if you're moving south (down), turning right (clockwise) would face west? Wait, no. Let's think:
-
-Imagine facing south (down), your right hand would point west. Wait, no. If you're facing south (down), your right side is west. Wait, perhaps better to use:
-
-When moving in a direction, the direction you're facing is the direction of movement.
-
-So moving down: facing south.
-
-A right turn (clockwise) would change your facing direction to west? Wait, no. Let's say you're facing south. A right turn would turn you to face west? Or east?
-
-Wait, standard compass:
-
-If you're facing south, a right turn (clockwise) would face west? Wait, no. Clockwise from south: south → west → north → east → south. Wait, no. Clockwise direction is right.
-
-Wait, imagine a clock: 12 is north, 3 is east, 6 is south, 9 is west.
-
-So facing south (6 o'clock), a right turn (clockwise) would turn towards 9 o'clock (west), but actually, a right turn from south would face west? Wait, no. If you're facing south, a right turn (clockwise) would turn you to face west? Wait, no. Let's think:
-
-If you're facing south, your right hand points west. So turning right (clockwise) would face west. Wait, no. If you're facing south, turning right (as in, turning your body to the right) would face west. Wait, no. If you face south, and turn right, you face west. Wait, no. Let's take a real-world example. If you're facing south (towards the ground, like in a map), turning right (clockwise) would make you face west. Hmm.
-
-Wait, maybe better to use the following:
-
-Moving direction:
-
-Current direction | Right turn direction | Left turn direction
-
-N                 | E                     | W
-
-E                 | S                     | N
-
-S                 | W                     | E
-
-W                 | N                     | S
-
-Yes, this seems correct. So from S (south), right turn → W (west), left turn → E (east).
-
-Wait, but in our case, after moving down (S), the next direction is right (E). So changing from S to E. How?
-
-If moving south (down), then moving east (right), that would be a left turn. Because from S, left turn is E. Wait, according to the table above: from S, left turn direction is E. So moving east after south is a left turn.
-
-Wait, this is critical. So:
-
-If previous direction was S (south), and current direction is E (east), then the turn is left.
-
-Because S → E is a left turn.
-
-Similarly, S → W would be a right turn.
-
-So in our path, after moving down (S) to (5,1), then moving right (E) to (5,2). So direction changed from S to E. That's a left turn.
-
-Wait, but the problem is asking for right turns, so this would be a left turn, not counted.
-
-Continuing:
-
-After (5,2) to (5,3): direction still east (E). No turn.
-
-Then from (5,3) to (4,3): moving up (north, N). So direction changes from E to N. What turn is that?
-
-Current direction E, turning to N. From E, a left turn would be N (since E left → N), right turn would be S. So changing from E to N is a left turn.
-
-Wait, according to the table: E → N is a left turn.
-
-Then, (4,3) to (3,3): moving up (N), same direction. No turn.
-
-(3,3) to (3,4): east (E). Direction changes from N to E. From N, right turn → E. So that's a right turn.
-
-Ah, here's a right turn.
-
-Then (3,4) to (3,5): east again. No turn.
-
-(3,5) to (2,5): north (N). Direction changes from E to N. From E, left turn is N. So left turn.
-
-(2,5) to (1,5): north again. No turn.
-
-Wait, let's list all the direction changes step by step.
-
-Let's list each move with direction:
-
-1. Start at S (1,1), move down to (2,1): direction S.
-
-2. (2,1) → (3,1): S.
-
-3. (3,1) → (4,1): S.
-
-4. (4,1) → (5,1): S.
-
-5. (5,1) → (5,2): direction E (right turn? Wait, from S to E: as discussed, S to E is left turn).
-
-Wait, step 5: direction changes from S to E. So turn is left.
-
-6. (5,2) → (5,3): direction E (same, no turn).
-
-7. (5,3) → (4,3): direction N (up). From E to N: left turn.
-
-8. (4,3) → (3,3): direction N (same).
-
-9. (3,3) → (3,4): direction E. From N to E: right turn.
-
-10. (3,4) → (3,5): direction E (same).
-
-11. (3,5) → (2,5): direction N. From E to N: left turn.
-
-12. (2,5) → (1,5): direction N (same).
-
-Now, count the right turns. Looking at the direction changes:
-
-Step 5: S → E: left turn (not counted)
-
-Step 7: E → N: left turn
-
-Step 9: N → E: right turn
-
-Step 11: E → N: left turn
-
-So only step 9 is a right turn. Wait, so only 1 right turn? But the options have A.1, D.2. Hmm.
-
-Wait, maybe I missed something. Let's double-check.
-
-Wait, step 9: moving from (3,3) to (3,4) is east. Previous direction was north (from (4,3) to (3,3)), so direction was N, then moving E. So changing from N to E is a right turn. Yes.
-
-Any other right turns?
-
-Let's check again:
-
-List of direction changes:
-
-Move 1-4: all S, no turns.
-
-Move 5: S → E: left.
-
-Move 6: same E.
-
-Move 7: E → N: left.
-
-Move 8: same N.
-
-Move 9: N → E: right.
-
-Move 10: same E.
-
-Move 11: E → N: left.
-
-Move 12: same N.
-
-So only one right turn at step 9.
-
-But the options include A.1 and D.2. Hmm. Maybe I made a mistake.
-
-Wait, another possibility: when moving from (5,3) to (4,3), direction N. Previous direction was E. So E to N is left turn. Correct.
-
-Wait, but let's think about the path again. Is there another right turn?
-
-Wait, maybe I missed a part of the path. Let's confirm the path again.
-
-Alternative path: Could there be a different route through the Xs?
-
-Wait, the Xs are:
-
-Row2: (2,1), (2,5)
-
-Row3: (3,1), (3,3), (3,4), (3,5)
-
-Row4: (4,1), (4,3)
-
-Row5: (5,1), (5,2), (5,3)
-
-So possible path:
-
-S (1,1) down to (2,1), down to (3,1), down to (4,1), down to (5,1), right to (5,2), right to (5,3), up to (4,3), up to (3,3), right to (3,4), right to (3,5), up to (2,5), up to E.
-
-Yes, that's the path I considered.
-
-Another way: from (3,1), is there a way to go east? Row3, col1 is X, col2 is #, so no. So must go down to row4, col1.
-
-Hmm. So the path seems correct.
-
-Wait, but let's check another angle. When moving from (3,3) to (3,4), direction E. Previous direction was N (coming from (4,3) up to (3,3)). So turning right from N to E. Yes, that's a right turn.
-
-Any other right turns?
-
-Wait, when moving from S down to (2,1), then down to (3,1), etc., all the way down is straight. Then turning right (east) at (5,1), which was a left turn. Then moving east, then turning left (north) at (5,3), then moving north, then turning right to east at (3,3), then east, then turning left to north at (3,5).
-
-Wait, so only one right turn. So answer A.1?
-
-But let's check again. Maybe I messed up the direction.
-
-Alternative approach: draw the path and note directions.
-
-Let's list each segment with direction:
-
-1. (1,1) → (2,1): down (S)
-
-2. (2,1) → (3,1): down (S)
-
-3. (3,1) → (4,1): down (S)
-
-4. (4,1) → (5,1): down (S)
-
-5. (5,1) → (5,2): right (E)
-
-6. (5,2) → (5,3): right (E)
-
-7. (5,3) → (4,3): up (N)
-
-8. (4,3) → (3,3): up (N)
-
-9. (3,3) → (3,4): right (E)
-
-10. (3,4) → (3,5): right (E)
-
-11. (3,5) → (2,5): up (N)
-
-12. (2,5) → (1,5): up (N)
-
-Now, the direction changes occur between segments:
-
-Between 4 and 5: S → E. Left turn.
-
-Between 6 and 7: E → N. Left turn.
-
-Between 8 and 9: N → E. Right turn.
-
-Between 10 and 11: E → N. Left turn.
-
-So only one right turn. So answer A.1.
-
-But wait, the options are A.1, B.5, C.8, D.2. So A is an option. But I need to be sure.
-
-Wait, maybe I made a mistake in the direction change between 8 and 9.
-
-Segment 8: moving up (N) to (3,3). Then segment 9: moving east (E). So direction changes from N to E. A right turn. Correct.
-
-Another way: if you're facing north (up), a right turn would face east. Yes. So that's a right turn.
-
-Any other right turns?
-
-Between segment 5 and 6: no, same direction.
-
-Between segment 7 and 8: same direction (N).
-
-Between segment 9 and 10: same direction (E).
-
-Between 10 and 11: E to N, left turn.
-
-So only one right turn. So answer should be A.1.
-
-But wait, the problem says "the path marked by X". Let me confirm that all the Xs are used. The path I took uses all the Xs in the path, but maybe there's a different path that uses more Xs and has more turns.
-
-Wait, but the Xs are the specific route to follow. So the path must be the one that connects S to E through Xs. There might be only one such path.
-
-Let's check if there's another possible path.
-
-From row5, col3 (X), could we go east? (5,4) is space, so no. So must go up.
-
-Another possibility: from row3, col5 (X), can we go east? No, col6 is #. So must go up to row2, col5.
-
-Hmm. Seems like the path I traced is the only one.
-
-Wait, another check: row3, col4 is X. So (3,4) is part of the path. Yes.
-
-Wait, but let's count the turns again.
-
-Wait, after moving up from (5,3) to (4,3) to (3,3), then east to (3,4), that's a right turn. Then east again. Then up to (2,5). So only one right turn.
-
-But maybe I missed a turn. Let's think about the movement directions step by step.
-
-Let's assign a direction to each move:
-
-Move 1: down (S)
-
-Move 2: down (S) → no turn
-
-Move 3: down (S) → no
-
-Move 4: down (S) → no
-
-Move 5: right (E) → from S to E: left turn
-
-Move 6: right (E) → no turn
-
-Move 7: up (N) → from E to N: left turn
-
-Move 8: up (N) → no
-
-Move 9: right (E) → from N to E: right turn
-
-Move 10: right (E) → no
-
-Move 11: up (N) → from E to N: left turn
-
-Move 12: up (N) → no
-
-So right turns only at move 9. So total 1.
-
-But the answer options have A.1. So why is the option D.2? Maybe I made a mistake.
-
-Wait, wait. Let's consider the direction when moving from (3,5) to (2,5): up (N), previous direction was east (E). So from E to N is left turn. Correct.
-
-Wait, another possibility: when moving from (2,5) to (1,5), that's up, same direction as previous (up), so no turn.
-
-Hmm.
-
-Wait, perhaps I miscounted the direction change at move 9.
-
-Wait, move 8: moving up to (3,3), direction N.
-
-Move 9: moving east, so direction E. So changing from N to E. Right turn. Yes.
-
-Is there another right turn elsewhere?
-
-Wait, let's think about the very first move. From S (1,1), moving down. But S is at (1,1), and the first move is down. The direction is S. Then, all the way down until (5,1). Then turn east. That's a left turn.
-
-Wait, unless... Wait, the problem says "right turn is defined as a change in movement direction that is 90 degrees clockwise relative to the previous direction."
-
-So when you change direction, if it's a 90-degree clockwise turn from the previous direction, it's a right turn.
-
-So previous direction is S (down), new direction is E (right). Clockwise from S (down) would be W (west), so E is counterclockwise from S. So it's a left turn. Which matches earlier analysis.
-
-Another check: from N, right turn to E. Yes.
-
-So I think the count is correct. Only one right turn.
-
-But let's see the options. The user lists options A.1, B.5, C.8, D.2. So A is an option.
-
-But maybe the intended answer is different. Let me check again.
-
-Wait, another way: perhaps the path is different.
-
-Wait, row3, col3 is X. Is there a way to go from (3,1) east? No, because row3, col2 is #. So must go down.
-
-Wait, what if from (3,3), instead of going east, we go north? But (2,3) is # (row2: #X###X#, so col3 is #), so can't go north from (3,3).
-
-Hmm.
-
-Wait, maybe I made a mistake in the path. Let's try to see.
-
-Alternative path: S → down to (2,1), right? No, (2,2) is #. Can't go right. So must go down.
-
-Another idea: after (5,3), instead of going up, is there another way? (5,3) can go east to (5,4), but (5,4) is space, not X. So no. So must go up.
-
-So path seems correct.
-
-Wait, perhaps the turn count is different. Let's think of the turns as you traverse the path.
-
-Starting direction: down.
-
-After four downs, then turn left (east), then two rights, then turn left (north), then two ups, then turn right (east), then two rights, then turn left (north), then two ups.
-
-Wait, the turns are:
-
-1. Down to east: left
-
-2. East to north: left
-
-3. North to east: right
-
-4. East to north: left
-
-So three turns, one right.
-
-Yes, only one right turn.
-
-Hmm. So answer should be A.1.
-
-But I'm a bit unsure because sometimes these puzzles can have tricks. Let's see the maze again.
-
-Wait, looking at row3: #X#XXX#, so columns 1,3,4,5 are X.
-
-Row4: #X#X### → columns 1 and 3.
-
-Row5: #XXX  # → columns 1,2,3.
-
-So another possible path: from (3,3), could we go east to (3,4), (3,5), then up to (2,5), which is what we did.
-
-I think the path is correct.
-
-Wait, but let's count the number of direction changes again:
-
-- After the initial four downs (no turns), then a left turn (east), then no turn for two steps, then a left turn (north), then no turn, then a right turn (east), then no turn, then a left turn (north), then no turn.
-
-So right turns: 1.
-
-Thus, the answer should be A.1.
-
-But why is option D.2 present? Maybe I missed a turn.
-
-Wait, wait. When moving from (3,3) to (3,4), direction east. Previous direction was north (coming from (4,3) to (3,3)). So turning right to east.
-
-Then, moving east to (3,4), then east to (3,5), then up to (2,5). When moving from (3,5) up, direction changes from east to north: left turn.
-
-But what about when moving from (5,3) to (4,3) (up), previous direction was east. So east to north: left turn.
-
-Another angle: let's list all the turns with their type:
-
-1. S → E: left
-
-2. E → N: left
-
-3. N → E: right
-
-4. E → N: left
-
-Total right turns: 1.
-
-Yes. So answer A.
-
-But the problem statement says "the path marked by X". Are all the Xs in the path? Let's see:
-
-Path X positions:
-
-(2,1), (3,1), (4,1), (5,1), (5,2), (5,3), (4,3), (3,3), (3,4), (3,5), (2,5).
-
-Check if all these are Xs:
-
-Row2: (2,1) X, (2,5) X – yes.
-
-Row3: (3,1), (3,3), (3,4), (3,5) X – yes.
-
-Row4: (4,1), (4,3) X – yes.
-
-Row5: (5,1), (5,2), (5,3) X – yes.
-
-So all the Xs along the path are used. So the path is correct.
-
-Therefore, only one right turn. Answer A.1.
-
-But wait, looking back at the problem statement: "the path marked by X". So the Xs are the path, so we have to follow Xs, which we did.
-
-Hmm. Maybe the answer is A.1. But I'm a bit confused because sometimes these questions can have nuances.
-
-Wait, another check: when you move from (3,5) to (2,5), direction north. Previous direction was east. So turning left. Correct.
-
-And the move from (3,3) to (3,4) is east, previous direction north: right turn.
-
-Yes. So only one right turn.
-
-I think the answer is A.1.
-</think>
-Let me trace the step by step solution through the maze in the following format:
-<format>
->>> LOCATE START AND EXIT:
-    S position: (row, col)
-    E position: (row, col)
-
->>> STEP 1: Move DOWN from (r1, c1) to (r2, c2)
-    Current position: (r2, c2)
-    Previous direction: —
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
-[... continue for all steps until reaching E ...]
-
->>> FINAL ANSWER:
-    \boxed{LETTER}
-</format>
->>> LOCATE START AND EXIT:
-    S position: (1, 1)
-    E position: (1, 5)
-
->>> STEP 1: Move DOWN from (1,1) to (2,1)
-    Current position: (2,1)
-    Previous direction: —
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 2: Move DOWN from (2,1) to (3,1)
-    Current position: (3,1)
-    Previous direction: DOWN
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 3: Move DOWN from (3,1) to (4,1)
-    Current position: (4,1)
-    Previous direction: DOWN
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 4: Move DOWN from (4,1) to (5,1)
-    Current position: (5,1)
-    Previous direction: DOWN
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 5: Move RIGHT from (5,1) to (5,2)
-    Current position: (5,2)
-    Previous direction: DOWN
-    Current direction: RIGHT
-    Turn type: LEFT
-    Running count: Right=0, Left=1
-
->>> STEP 6: Move RIGHT from (5,2) to (5,3)
-    Current position: (5,3)
-    Previous direction: RIGHT
-    Current direction: RIGHT
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=1
-
->>> STEP 7: Move UP from (5,3) to (4,3)
-    Current position: (4,3)
-    Previous direction: RIGHT
-    Current direction: UP
-    Turn type: LEFT
-    Running count: Right=0, Left=2
-
->>> STEP 8: Move UP from (4,3) to (3,3)
-    Current position: (3,3)
-    Previous direction: UP
-    Current direction: UP
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=2
-
->>> STEP 9: Move RIGHT from (3,3) to (3,4)
-    Current position: (3,4)
-    Previous direction: UP
-    Current direction: RIGHT
-    Turn type: RIGHT
-    Running count: Right=1, Left=2
-
->>> STEP 10: Move RIGHT from (3,4) to (3,5)
-    Current position: (3,5)
-    Previous direction: RIGHT
-    Current direction: RIGHT
-    Turn type: STRAIGHT
-    Running count: Right=1, Left=2
-
->>> STEP 11: Move UP from (3,5) to (2,5)
-    Current position: (2,5)
-    Previous direction: RIGHT
-    Current direction: UP
-    Turn type: LEFT
-    Running count: Right=1, Left=3
-
->>> STEP 12: Move UP from (2,5) to (1,5) [E]
-    Current position: (1,5)
-    Previous direction: UP
-    Current direction: UP
-    Turn type: STRAIGHT
-    Running count: Right=1, Left=3
-
->>> FINAL ANSWER:
-    \boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3036.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3036.txt
deleted file mode 100644
index 08fd3f7a..00000000
--- a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3036.txt
+++ /dev/null
@@ -1,579 +0,0 @@
-Okay, let's try to figure out how many right turns there are in the path from S to E marked by X. First, I need to visualize the maze properly. The ASCII maze given is:
-
-#######
-#  XXX#
-###X#X#
-#  E#X#
-#####X#
-#SXXXX#
-#######
-
-Hmm, let's parse each line. Let's number the rows from top to bottom as rows 0 to 5 maybe. Wait, but maybe it's better to write them out with coordinates.
-
-Let me write each line with row numbers:
-
-Row 0: #######
-Row 1: #  XXX#
-Row 2: ###X#X#
-Row 3: #  E#X#
-Row 4: #####X#
-Row 5: #SXXXX#
-Row 6: #######
-
-Wait, actually, the maze as given has 7 lines (including the top and bottom walls), so rows 0 to 6. Let's check:
-
-Line 1: ####### (row 0)
-Line 2: #  XXX# (row 1)
-Line 3: ###X#X# (row 2)
-Line 4: #  E#X# (row 3)
-Line 5: #####X# (row 4)
-Line 6: #SXXXX# (row 5)
-Line 7: ####### (row 6)
-
-Wait, the user provided 7 lines. Let's index them as rows 0 to 6.
-
-Now, the maze:
-
-Row 0: 0 1 2 3 4 5 6
-        # # # # # # #
-
-Row 1: #   X X X #
-So positions (1,1) is space, (1,2) space, (1,3) X, (1,4) X, (1,5) X, (1,6) #
-
-Row 2: ###X#X#
-So row 2: # # # X # X #, so columns 0-6. So (2,3)=X, (2,5)=X
-
-Row 3: #  E#X#
-So (3,2)=space, (3,3)=E? Wait, no: #  E#X#, so columns:
-
-0: #, 1: space, 2: space, 3: E, 4: #, 5: X, 6: # ?
-
-Wait, let's break down each row character by character.
-
-Row 0: "#######" → all walls.
-
-Row 1: "#  XXX#" → characters at positions 0: #, 1: space, 2: space, 3: X, 4: X, 5: X, 6: #.
-
-Row 2: "###X#X#" → 0:#, 1:#, 2:#, 3:X, 4:#, 5:X, 6:#.
-
-Row 3: "#  E#X#" → 0:#, 1:space, 2:space, 3:E, 4:#, 5:X, 6:#.
-
-Row 4: "#####X#" → 0:#,1:#,2:#,3:#,4:#,5:X,6:#.
-
-Row 5: "#SXXXX#" → 0:#,1:S,2:X,3:X,4:X,5:X,6:#.
-
-Row 6: "#######".
-
-Wait, but the problem states that X marks the specific route to follow. So the path from S to E must follow X's. Let's find S and E.
-
-S is in row 5, column 1 (since row 5 is "#SXXXX#", so index 1 is S).
-
-E is in row 3, column 3 (row 3: "#  E#X#", so the E is at position 3).
-
-Now, the path is marked by X. So we need to trace from S (row5, col1) following X's to E (row3, col3).
-
-Wait, but S is at (5,1), and the X's are the path. Let's see the X's in the maze.
-
-Looking at each X:
-
-Row1: cols3,4,5 are X.
-
-Row2: col3 and 5 are X.
-
-Row3: col5 is X.
-
-Row4: col5 is X.
-
-Row5: cols2,3,4,5 are X.
-
-Wait, row5: #SXXXX#, so col1 is S, then col2-5 are X. So S is at (5,1), and the X path starts at (5,2) maybe? Wait, but S is the start, so the path from S should go through X's. Wait, but S is not X. Wait, the problem says "the path marked by X", so the correct path to follow is along X's. So S is the starting point, which is adjacent to X's perhaps.
-
-Wait, the maze says:
-
-- S denotes the starting point.
-
-- X marks the specific route you should follow.
-
-So the path from S to E must move through X's. So S is not part of the X path, but the path starts from S, then follows X's. Wait, but in the maze, S is at (5,1), and the cell to the right (5,2) is X. So the path starts at S (5,1), then moves to (5,2), which is X, then continues along X's.
-
-Similarly, E is at (3,3), which is adjacent to X's perhaps. Let's see.
-
-Let's map all X positions:
-
-Row 1: (1,3), (1,4), (1,5)
-
-Row 2: (2,3), (2,5)
-
-Row 3: (3,5)
-
-Row 4: (4,5)
-
-Row 5: (5,2), (5,3), (5,4), (5,5)
-
-Wait, row5: #SXXXX#, so columns 0:#, 1:S, 2:X, 3:X, 4:X, 5:X, 6:#. So (5,2) to (5,5) are X.
-
-So the path from S (5,1) must go to (5,2), then follow X's.
-
-Let's try to trace the path.
-
-Starting at S (5,1). The first move must be to (5,2), since that's the only X adjacent to S (since S is at (5,1), up would be row4, col1 which is # (row4 is #####X#, so col1 is #), down is row6, which is wall, left is col0 (wall), right is (5,2), which is X.
-
-So first direction is right (east) from S.
-
-Now, from (5,2), we need to follow X's. Let's list all X positions and see how they connect.
-
-Let's list all X cells:
-
-Row 1: (1,3), (1,4), (1,5)
-
-Row 2: (2,3), (2,5)
-
-Row 3: (3,5)
-
-Row 4: (4,5)
-
-Row 5: (5,2), (5,3), (5,4), (5,5)
-
-So the X's form a path. Let's see how they connect.
-
-Starting at (5,2), which is X. From (5,2), possible moves: up (row4, col2), but row4 col2 is # (row4: #####X#, so col2 is #), left is (5,1) which is S (not X), right is (5,3), which is X. So path continues east to (5,3).
-
-Then from (5,3), east to (5,4), then east to (5,5). Now at (5,5). From here, up (row4, col5) is X (row4 col5 is X), so move up to (4,5).
-
-From (4,5), up to row3, col5, which is X (row3 col5 is X), so move up to (3,5).
-
-From (3,5), up to row2, col5, which is X (row2 col5 is X), move up to (2,5).
-
-From (2,5), up to row1, col5, which is X (row1 col5 is X), move up to (1,5).
-
-From (1,5), left to (1,4), then left to (1,3). Wait, but row1 col3 is X. Wait, but row1 col3,4,5 are X. So from (1,5), moving west to (1,4), then west to (1,3). Now at (1,3). From (1,3), can we go down? Row2 col3 is X (row2 col3 is X), so down to (2,3).
-
-From (2,3), down to row3, col3. Wait, row3 col3 is E. Wait, row3: "#  E#X#", so col3 is E. So (3,3) is E. So from (2,3), moving down would reach (3,3), which is E.
-
-Wait, but according to the path, we need to follow X's. So the path should end at E. So the X path leads to E. So let's confirm:
-
-The path should be:
-
-Start at S (5,1) → move east to (5,2) [X], then east to (5,3), east to (5,4), east to (5,5), north to (4,5), north to (3,5), north to (2,5), north to (1,5), west to (1,4), west to (1,3), south to (2,3), south to (3,3)=E.
-
-Wait, but (2,3) is X, then moving south from (2,3) to (3,3), which is E. So the path is through X's until the last step to E.
-
-So the path of X's is from (5,2) to (5,5), then up to (4,5), (3,5), (2,5), (1,5), then left to (1,4), (1,3), then down to (2,3), then down to E.
-
-Wait, but (2,3) is X, so moving from (2,3) to (3,3) (E) is the last step. So the X path ends at (2,3), then E is adjacent.
-
-Now, we need to track the directions taken along the X path, and count the right turns.
-
-First, let's list the path step by step, noting the direction of movement between each cell.
-
-Starting at S (5,1), first move is east (right) to (5,2). So first direction: east.
-
-Now, from (5,2) to (5,3): east again. No turn.
-
-(5,3) to (5,4): east.
-
-(5,4) to (5,5): east.
-
-Now, from (5,5) to (4,5): north. So direction changes from east to north. What's the turn here? If previous direction was east, moving north would be a left turn (since east to north is 90 degrees counterclockwise, i.e., left turn). Wait, need to confirm.
-
-Movement direction: east (right), then moving north (up). So changing direction from east to north. To go from east to north, you turn left (since east is right, north is up; turning left from east would face north). So this is a left turn.
-
-But we need to count right turns. So this would be a left turn, not counted.
-
-Next, from (4,5) to (3,5): north again. No turn.
-
-(3,5) to (2,5): north.
-
-(2,5) to (1,5): north.
-
-Now, from (1,5) to (1,4): west. Previous direction was north, now moving west. From north to west is a left turn (since north to west is 90 degrees left). Wait, direction change: north to west. If facing north, turning left would face west. So yes, left turn. Not a right turn.
-
-Then from (1,4) to (1,3): west again. No turn.
-
-Now, from (1,3) to (2,3): south. Previous direction was west, now moving south. From west to south: turning right? Let's see. Facing west, turning right would face north? Wait, no.
-
-Wait, direction changes:
-
-Current direction before the move: west (from (1,4) to (1,3), direction west).
-
-Next move is south (from (1,3) to (2,3)). So changing direction from west to south.
-
-What's the turn here? If facing west, to face south, you turn right (90 degrees clockwise). Let's think:
-
-- Facing west (left), turning right (clockwise) would turn to north? Wait, no.
-
-Wait, standard directions:
-
-- East: right
-
-- North: up
-
-- West: left
-
-- South: down
-
-If you're moving west (left), then turning right (clockwise) would face north? Wait, no.
-
-Wait, imagine facing west (towards left on the screen). A right turn (clockwise) would make you face north (upwards), because clockwise from west would be north. Wait, no:
-
-Wait, if you're facing west (left), a right turn (clockwise) would turn you to face north? Let's see:
-
-- Facing north: right turn → east.
-
-- Facing east: right turn → south.
-
-- Facing south: right turn → west.
-
-- Facing west: right turn → north.
-
-Yes. So west to north is right turn.
-
-But here, we're moving west, then turning to south. Wait, direction change from west to south. So from west, to go south, which direction is the turn?
-
-West to south: if facing west, to face south, you need to turn right twice (180), but actually, the turn would be a right turn of 90 degrees? Wait, no.
-
-Wait, the direction of movement: previous direction was west (arriving at (1,3) from the east, i.e., moving west), now moving south. So the direction changed from west to south.
-
-To go from west to south, what's the turn?
-
-West → south: clockwise 90 degrees? Let's see:
-
-From west, clockwise 90 degrees would be north? Wait, no. Wait, standard compass:
-
-Clockwise turns:
-
-North → East → South → West → North.
-
-So if you're facing west, a clockwise (right) turn would face north. A counterclockwise (left) turn would face south.
-
-Wait, yes: if facing west, left turn (counterclockwise) would be to face south, and right turn would be to face north.
-
-Wait, let's confirm:
-
-- Facing north, left turn → west.
-
-- Facing north, right turn → east.
-
-Similarly, facing west:
-
-- Left turn (counterclockwise) → south.
-
-- Right turn (clockwise) → north.
-
-So if we were moving west (direction west), then change direction to south, that's a left turn.
-
-Wait, because from west, left turn (counterclockwise) would face south. So moving from west to south is a left turn.
-
-Hmm, this is a bit confusing. Let's think in terms of direction vectors.
-
-Previous direction: west (vector (-1,0) in terms of movement? Wait, maybe better to use compass directions.
-
-Let's say:
-
-- East: +x direction.
-
-- North: +y direction.
-
-Wait, but in grid terms, row numbers usually increase downward, so row 0 is top, row 5 is lower. So moving north would decrease row number, south increases row.
-
-But for direction purposes, maybe better to consider:
-
-When moving from cell A to cell B, the direction is based on the movement.
-
-So from (5,5) to (4,5): moving north (row decreases).
-
-From (1,5) to (1,4): moving west (column decreases).
-
-From (1,3) to (2,3): moving south (row increases).
-
-So previous direction before the move to (2,3) was west (since we were moving west to get to (1,3)), then the next move is south.
-
-So direction changed from west to south.
-
-To find the turn type: when moving west, then moving south, what's the turn.
-
-Imagine facing west (left), then turning to face south (down). To face south from west, you turn left (counterclockwise), because west → south is left.
-
-Yes: west to south is left turn.
-
-So this would be a left turn, not a right.
-
-Wait, but let's make sure. If you're moving west (left), to start moving south (down), you turn left (since turning left from west would face south). So yes, left turn.
-
-So this move would be a left turn, not counted.
-
-Then from (2,3) to (3,3): south again. No turn.
-
-So let's recap all the direction changes:
-
-Start at S (5,1), first move east (to (5,2)). So initial direction east.
-
-Path segments:
-
-1. (5,1) → (5,2): direction east.
-
-2. (5,2) → (5,3): east (no turn).
-
-3. (5,3) → (5,4): east.
-
-4. (5,4) → (5,5): east.
-
-5. (5,5) → (4,5): north (change from east → north: left turn).
-
-6. (4,5) → (3,5): north.
-
-7. (3,5) → (2,5): north.
-
-8. (2,5) → (1,5): north.
-
-9. (1,5) → (1,4): west (change from north → west: left turn).
-
-10. (1,4) → (1,3): west.
-
-11. (1,3) → (2,3): south (change from west → south: left turn).
-
-12. (2,3) → (3,3): south (to E).
-
-Now, let's list all the direction changes (i.e., where the direction changes from the previous step):
-
-- After step 4 (east to north at step 5): left turn.
-
-- After step 8 (north to west at step 9): left turn.
-
-- After step 10 (west to south at step 11): left turn.
-
-Wait, so all three direction changes are left turns. Are there any right turns?
-
-Wait, but according to this, there are zero right turns. But the options include B.0. But wait, maybe I made a mistake.
-
-Wait, let's re-examine the direction changes.
-
-Wait, step 5: moving north after east. East → north: left turn (as east to north is left).
-
-Step 9: moving west after north. North → west: left turn (north to west is left).
-
-Step 11: moving south after west. West → south: left turn.
-
-So all three direction changes are left turns. So no right turns. So answer would be B.0.
-
-But wait, the options are A.4, B.0, C.3, D.5. Hmm.
-
-Wait, but maybe I messed up the path. Let's confirm the path again.
-
-Alternative path: perhaps the path doesn't go all the way to (1,3), but maybe another way.
-
-Wait, let's retrace.
-
-From S (5,1), move east to (5,2), then east to (5,3), east to (5,4), east to (5,5). Then north to (4,5), north to (3,5), north to (2,5), north to (1,5). Now at (1,5). From here, west to (1,4), west to (1,3). Now, (1,3) is X. From (1,3), what's the next step? The only adjacent X's would be south to (2,3), which is X. So yes, down to (2,3), then down to (3,3)=E.
-
-So the path is correct.
-
-Another possibility: maybe from (2,5), instead of going north to (1,5), but no, row2 col5 is X, row1 col5 is X.
-
-Wait, but let's check if there's another path. For example, row2 has X at (2,3) and (2,5). So from (2,5), can we go west to (2,4), but row2 col4 is # (since row2 is ###X#X#, so col4 is #). So no, can't go west. So path must go north to row1.
-
-Hmm. So the path as traced seems correct.
-
-So all direction changes are left turns. So zero right turns. Answer B.0.
-
-Wait, but let's check again.
-
-Wait, another way: when moving from (5,5) north, direction east → north: left turn.
-
-Then moving west from north: north → west is left turn.
-
-West → south: left turn.
-
-No right turns. So answer B.0.
-
-But let's see the options. The problem states "the path marked by X", so maybe I missed a turn.
-
-Wait, wait, when moving from (1,3) south to (2,3), direction west → south. As discussed, left turn.
-
-Wait, but let's consider the direction of movement. The turn is determined by the change in direction. Let's use a different approach.
-
-List the direction of each move:
-
-Move 1: east
-
-Move 2: east
-
-Move 3: east
-
-Move 4: east
-
-Move 5: north
-
-Move 6: north
-
-Move 7: north
-
-Move 8: north
-
-Move 9: west
-
-Move 10: west
-
-Move 11: south
-
-Move 12: south
-
-Now, the direction changes occur between move 4 and 5 (east → north), move 8 and 9 (north → west), move 10 and 11 (west → south).
-
-Each of these changes:
-
-east → north: left turn.
-
-north → west: left turn.
-
-west → south: left turn.
-
-All left. So zero right turns.
-
-Hence, the answer should be B.0.
-
-But wait, the problem says "right turn is defined as a change in movement direction that is 90 degrees clockwise relative to the previous direction."
-
-So east to north: previous direction east. To get to north, turning left (counterclockwise), so not a right turn.
-
-east to south would be a right turn.
-
-north to east: right turn.
-
-But in our case, all changes are left.
-
-Hmm. So the answer is B.0.
-
-But let me check again if there's a different path.
-
-Wait, another way: from (5,5), instead of going north, is there another X? No, row4 col5 is X, which is the only way up.
-
-Wait, what about row2 col3: X. Could the path go from (5,5) up to (4,5), (3,5), (2,5), then west? But row2 col4 is #, so can't go west from (2,5). So must go north to (1,5).
-
-Hmm.
-
-Alternatively, after (1,5), moving west to (1,4), (1,3), then south to (2,3), which is X, then south to E. Yes, that's the path.
-
-Another check: let's count the turns again.
-
-Starting direction: east (from S to first X).
-
-After four east moves, turn north: left.
-
-Then four north moves, turn west: left.
-
-Then two west moves, turn south: left.
-
-Total three left turns, no right turns.
-
-Therefore, the number of right turns is zero. Answer B.
-</think>
-Let me trace the step by step solution through the maze in the following format:
-<format>
->>> LOCATE START AND EXIT:
-    S position: (row, col)
-    E position: (row, col)
-
->>> STEP 1: Move DOWN from (r1, c1) to (r2, c2)
-    Current position: (r2, c2)
-    Previous direction: —
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
-[... continue for all steps until reaching E ...]
-
->>> FINAL ANSWER:
-    \boxed{LETTER}
-</format>
->>> LOCATE START AND EXIT:
-    S position: (5, 1)
-    E position: (3, 3)
-
->>> STEP 1: Move EAST from (5, 1) to (5, 2)
-    Current position: (5, 2)
-    Previous direction: —
-    Current direction: EAST
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 2: Move EAST from (5, 2) to (5, 3)
-    Current position: (5, 3)
-    Previous direction: EAST
-    Current direction: EAST
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 3: Move EAST from (5, 3) to (5, 4)
-    Current position: (5, 4)
-    Previous direction: EAST
-    Current direction: EAST
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 4: Move EAST from (5, 4) to (5, 5)
-    Current position: (5, 5)
-    Previous direction: EAST
-    Current direction: EAST
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 5: Move NORTH from (5, 5) to (4, 5)
-    Current position: (4, 5)
-    Previous direction: EAST
-    Current direction: NORTH
-    Turn type: LEFT (EAST → NORTH is 90° counterclockwise)
-    Running count: Right=0, Left=1
-
->>> STEP 6: Move NORTH from (4, 5) to (3, 5)
-    Current position: (3, 5)
-    Previous direction: NORTH
-    Current direction: NORTH
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=1
-
->>> STEP 7: Move NORTH from (3, 5) to (2, 5)
-    Current position: (2, 5)
-    Previous direction: NORTH
-    Current direction: NORTH
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=1
-
->>> STEP 8: Move NORTH from (2, 5) to (1, 5)
-    Current position: (1, 5)
-    Previous direction: NORTH
-    Current direction: NORTH
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=1
-
->>> STEP 9: Move WEST from (1, 5) to (1, 4)
-    Current position: (1, 4)
-    Previous direction: NORTH
-    Current direction: WEST
-    Turn type: LEFT (NORTH → WEST is 90° counterclockwise)
-    Running count: Right=0, Left=2
-
->>> STEP 10: Move WEST from (1, 4) to (1, 3)
-    Current position: (1, 3)
-    Previous direction: WEST
-    Current direction: WEST
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=2
-
->>> STEP 11: Move SOUTH from (1, 3) to (2, 3)
-    Current position: (2, 3)
-    Previous direction: WEST
-    Current direction: SOUTH
-    Turn type: LEFT (WEST → SOUTH is 90° counterclockwise)
-    Running count: Right=0, Left=3
-
->>> STEP 12: Move SOUTH from (2, 3) to (3, 3)
-    Current position: (3, 3)
-    Previous direction: SOUTH
-    Current direction: SOUTH
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=3
-
->>> FINAL ANSWER:
-    The path contains **0** right turns.
-    \boxed{B}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3047.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3047.txt
deleted file mode 100644
index 31e8200f..00000000
--- a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_3047.txt
+++ /dev/null
@@ -1,598 +0,0 @@
-Okay, let's try to figure out how many right turns there are in the path from S to E marked by X. First, I need to visualize the maze properly. The ASCII maze given is:
-
-#######
-#E#XXS#
-#X#X# #
-#XXX# #
-##### #
-#     #
-#######
-
-Wait, the maze might be a bit hard to read because of the formatting. Let me parse it line by line.
-
-Let's list each row:
-
-Row 0: ####### (all walls)
-Row 1: #E#XXS#
-Row 2: #X#X# #
-Row 3: #XXX# #
-Row 4: ##### #
-Row 5: #     #
-Row 6: #######
-
-Hmm, but the actual maze might be 7 rows high (from 0 to 6), each row having 7 characters (since the first row is 7 #s). Let's check each row's length.
-
-Row 0: 7 characters (all #)
-Row 1: #E#XXS# → let's count: # E # X X S # → that's 7 characters.
-Row 2: #X#X# # → # X # X # space # → 7 characters.
-Row 3: #XXX# # → # X X X # space # → 7 characters.
-Row 4: ##### # → # # # # # space # → 7 characters.
-Row 5: #     # → # followed by 5 spaces and # → 7 characters.
-Row 6: ####### → 7 #s.
-
-Okay, so it's a 7x7 grid.
-
-Now, the starting point S is in row 1. Let's find coordinates. Let's index rows from top (0) to bottom (6), columns left (0) to right (6).
-
-Row 1 (second row from top): positions 0: #, 1: E, 2: #, 3: X, 4: X, 5: S, 6: #.
-
-Wait, wait, row 1 is "#E#XXS#", so breaking it down:
-
-Index: 0 1 2 3 4 5 6
-Char:  # E # X X S #
-
-So S is at (row 1, column 5). E is at (row 1, column 1).
-
-Wait, but the exit is E. So the starting point S is at (1,5), and E is at (1,1). But the path marked by X needs to be followed. Wait, the Xs are the specific route. So the correct path from S to E is along the Xs.
-
-Wait, but the problem says: "the path marked by X". So the Xs are the correct route. So we need to trace the path of Xs from S to E.
-
-Wait, but S is a separate character. So the path starts at S, then follows Xs to reach E. Wait, but looking at the maze:
-
-Row 1: #E#XXS# → So the E is at (1,1), then there's a #, then XXS. So the S is at (1,5), and the Xs are at (1,3), (1,4), and then maybe other Xs.
-
-Wait, let's map all the Xs.
-
-Row 1: columns 3 and 4 are X.
-
-Row 2: #X#X# # → columns 1 and 3 are X.
-
-Row 3: #XXX# # → columns 1,2,3 are X.
-
-Row 4: ##### # → no Xs.
-
-Row 5: #     # → no Xs.
-
-So the Xs are located at:
-
-Row 1: (1,3), (1,4)
-
-Row 2: (2,1), (2,3)
-
-Row 3: (3,1), (3,2), (3,3)
-
-So the path from S (which is at (1,5)) to E (1,1) must follow these Xs. Wait, but how does the path connect?
-
-Wait, S is at (1,5). To get to E (1,1), the path must go through Xs. Let's see:
-
-Starting at S (1,5), which is next to X at (1,4). So from S, moving left to (1,4), which is X. Then where?
-
-(1,4) is X. Let's see neighbors. Let's consider possible directions.
-
-Wait, but the path marked by Xs is the correct path, so we need to see how the Xs are connected. Let's look for a continuous path of Xs from S to E.
-
-Wait, S is at (1,5), and the Xs near S are (1,4) and (1,3). So starting at S (1,5), moving left to (1,4), then left to (1,3). But then, row 1, column 3 is X. Then, what's next? Let's check the Xs.
-
-Row 1, column 3: X. To the left of that is column 2, which is #, so can't go left. Down from (1,3) is row 2, column 3: which in row 2, column 3 is X (row 2: #X#X# # → column 3 is X). So from (1,3), moving down to (2,3).
-
-Then, row 2, column 3 is X. Let's check neighbors. Up is (1,3), which is X. Left: column 2 is #. Right: column 4 is # (row 2: #X#X# # → columns 0:#,1:X,2:#,3:X,4:#,5:space,6:#). So row 2, column 3's right is #. Down from row 2, column 3 is row 3, column 3: which is X (row 3: #XXX# # → columns 1,2,3 are X). So moving down to (3,3).
-
-Row 3, column 3: X. Now, neighbors: up is (2,3), left is (3,2) which is X. So possible to go left. Let's see if the path continues left.
-
-Row 3: columns 1,2,3 are X. So (3,3) → (3,2) → (3,1). Then row 3, column 1 is X. Now, up from row 3, column 1: row 2, column 1 is X (row 2: #X#X# # → column 1 is X). So moving up from (3,1) to (2,1). Then, row 2, column 1: neighbors. Up to row 1, column 1 is E, but E is the exit. Wait, but E is at (1,1). Wait, row 1, column 1 is E, which is the exit. So from row 2, column 1, moving up to (1,1) would reach E.
-
-Wait, but the path is supposed to be marked by Xs. But E is not an X. So the path should go from S through Xs to E. So the last X before E must be adjacent to E.
-
-Let's reconstruct the path step by step.
-
-Start at S (1,5). The path must go through Xs. So:
-
-S (1,5) → (1,4) X → (1,3) X → (2,3) X → (3,3) X → (3,2) X → (3,1) X → (2,1) X → E (1,1).
-
-Wait, let's check:
-
-From (1,5) S, left to (1,4) X.
-
-Left to (1,3) X.
-
-Down to (2,3) X.
-
-Down to (3,3) X.
-
-Left to (3,2) X.
-
-Left to (3,1) X.
-
-Up to (2,1) X.
-
-Up to (1,1) E.
-
-So the Xs in the path are:
-
-(1,4), (1,3), (2,3), (3,3), (3,2), (3,1), (2,1).
-
-Wait, but S is not an X, but the path starts at S and follows Xs. So the first step is from S to (1,4), which is X. So the path marked by Xs starts at (1,4) and goes through the other Xs.
-
-But the problem says "the path marked by X", so the Xs are the path. So the entire path from S to E must be the Xs connected. But S and E are separate, so the path is S → X → X → ... → X → E.
-
-So the path sequence of coordinates (including S and E) would be:
-
-S (1,5) → (1,4) → (1,3) → (2,3) → (3,3) → (3,2) → (3,1) → (2,1) → E (1,1).
-
-Now, to find the number of right turns. To do this, we need to track the direction of movement at each step and see when a right turn occurs.
-
-First, we need to determine the direction of each move. Let's list each consecutive step and the direction.
-
-Let's list the path as a sequence of points:
-
-1. S: (1,5) - starting point, no direction yet.
-
-2. (1,4): move left from S. So direction is left (west).
-
-3. (1,3): move left again (west). Still same direction.
-
-4. (2,3): move down (south) from (1,3) to (2,3).
-
-5. (3,3): move down (south) again.
-
-6. (3,2): move left (west).
-
-7. (3,1): move left (west).
-
-8. (2,1): move up (north).
-
-9. E: (1,1): move up (north).
-
-Wait, but we need to look at each turn, which is a change in direction. A turn occurs when consecutive moves have different directions.
-
-Let's list the directions between each consecutive point.
-
-From S (1,5) to (1,4): direction is left (west).
-
-(1,4) to (1,3): direction still left (west). No turn.
-
-(1,3) to (2,3): direction down (south). So from west to south: what's the turn here?
-
-Previous direction was west (left), now moving south (down). To go from west to south, that's a right turn. Because if you're facing west, turning right would face south. Wait: directions: north, east, south, west. Wait, standard compass directions.
-
-Wait, when moving left (west), the direction is west. If you then move south, that's a right turn from west. Because west → right turn is south. Let's confirm:
-
-If you're facing west (left), a right turn would make you face south (down), yes. Because right turn from west is south. Left turn would be north.
-
-So moving from west direction to south direction is a right turn.
-
-So step 3 to 4: direction changes from west to south: right turn.
-
-Next, (2,3) to (3,3): direction south (down), same as previous. No turn.
-
-(3,3) to (3,2): direction west (left). Previous direction south → west. What's the turn here?
-
-Facing south, turning left would be east? Wait, no. Wait, direction changes.
-
-If current direction is south (moving down), then a right turn would be west (since south → right turn → west?), wait maybe better to use standard.
-
-Let's think of directions as:
-
-North: up (row decreases)
-
-East: right (column increases)
-
-South: down (row increases)
-
-West: left (column decreases)
-
-So when moving south (row increases), a right turn would be west? Wait, no. Wait, if you're moving south (facing south), a right turn would face west? Wait, no. Let's think in terms of 2D coordinates.
-
-Imagine facing south (down). A right turn would turn towards the right, which would be west? Wait, no. If you're facing south (down), your right hand would be pointing west. Wait, no. Let's use a clock:
-
-- Facing north: right turn is east.
-
-- Facing east: right turn is south.
-
-- Facing south: right turn is west.
-
-- Facing west: right turn is north.
-
-Yes, that's correct. So each right turn is clockwise 90 degrees.
-
-So:
-
-Current direction: south. Right turn → west.
-
-Current direction: west. Right turn → north.
-
-Current direction: north. Right turn → east.
-
-Current direction: east. Right turn → south.
-
-Similarly, left turn is counterclockwise.
-
-So in our case:
-
-From (1,3) to (2,3): direction south (after moving left, then down).
-
-Previous direction was west (from S to (1,4) to (1,3)), then moving down (south). So from west to south: that's a right turn (west → right → south).
-
-Then moving south to (3,3): direction still south. No turn.
-
-Then moving west (from (3,3) to (3,2)): direction changes from south to west. So current direction was south, now west. What's the turn?
-
-South to west: right turn (south's right turn is west), so yes, right turn.
-
-Wait, south → right turn → west. So that's a right turn.
-
-Wait, let's confirm:
-
-If facing south (down), a right turn would face west (left), yes. So moving from south to west direction is a right turn.
-
-Wait, but if you're moving south (down), then turning right would make you face west (left), so yes, direction change south → west is a right turn.
-
-Then next step: (3,2) to (3,1): direction west (still), no turn.
-
-Then (3,1) to (2,1): direction north (up). Previous direction west → north. So west to north: what's the turn?
-
-West → north: a left turn. Because west's left turn is south? Wait no.
-
-Wait, facing west (left), left turn would be south? Wait, let's use the right turn directions.
-
-If direction is west, right turn → north. Wait no:
-
-Wait, facing west (left), right turn (clockwise) would be to face north. Wait:
-
-- Facing north: right → east.
-
-- Facing east: right → south.
-
-- Facing south: right → west.
-
-- Facing west: right → north.
-
-Ah, yes. So west's right turn is north. So moving from west to north would be a right turn.
-
-Wait, but our current direction before the move was west (moving from (3,2) to (3,1), direction west), then moving to (2,1), which is up (north). So direction changes from west to north. So west → north: right turn.
-
-Wait, that's a right turn.
-
-Wait, but according to the direction changes:
-
-Previous direction: west.
-
-New direction: north.
-
-To get from west to north, turning right (clockwise) would be correct. Because west → right → north. Yes.
-
-So that's a right turn.
-
-Then, from (2,1) to E (1,1): direction north (up), same as previous, so no turn.
-
-Wait, let's recap all direction changes:
-
-1. S (1,5) → (1,4): direction west.
-
-2. (1,4) → (1,3): direction west (no turn).
-
-3. (1,3) → (2,3): direction south. Change from west to south: right turn. (1st right turn)
-
-4. (2,3) → (3,3): direction south (no turn).
-
-5. (3,3) → (3,2): direction west. Change from south to west: right turn. (2nd right turn)
-
-6. (3,2) → (3,1): direction west (no turn).
-
-7. (3,1) → (2,1): direction north. Change from west to north: right turn. (3rd right turn)
-
-8. (2,1) → E: direction north (no turn).
-
-So total of 3 right turns.
-
-Wait, but let's verify each step.
-
-Wait, I made an error at Step 3. Turn type RIGHT_TURN should be LEFT_TURN. Going from WEST to SOUTH is a counterclockwise (LEFT turn) rotation.; Right turn count 1 should be 0; Left turn count 0 should be 1. Let me re-trace the path more carefully from the correct position.
-Wait, I think I confused the direction of the turn.
-
-Let's use a different approach. Let's assign direction numbers.
-
-Let's say:
-
-North = 0 (up)
-
-East = 1 (right)
-
-South = 2 (down)
-
-West = 3 (left)
-
-A right turn is +1 direction (mod 4), left turn is -1 (or +3).
-
-So if moving in direction d, then next direction d' = d + 1 (right) or d - 1 (left).
-
-So let's map each step's direction.
-
-First segment: S (1,5) to (1,4): moving west. Direction = 3 (west).
-
-Second segment: (1,4) to (1,3): still west. Direction remains 3.
-
-Third segment: (1,3) to (2,3): moving south. Direction = 2.
-
-So previous direction 3 (west), new direction 2 (south). Change of direction: 2 - 3 = -1 mod 4 = 3. Which would be a left turn (since left is -1). Wait, if direction changed from 3 to 2, that's a decrease of 1, which would be a left turn.
-
-Wait, but according to the direction numbering:
-
-Current direction 3 (west), next direction 2 (south). To go from west to south, you turn left (counterclockwise), because west → left → south. Wait, no:
-
-Wait, facing west (left), turning left (counterclockwise) would face south. Yes. Because:
-
-West (left) → left turn (counterclockwise) → south (down).
-
-Whereas right turn would be north.
-
-Wait, this is where I messed up earlier.
-
-Let's clarify with a compass:
-
-Imagine standing facing west (left). Your left hand would point south, right hand north.
-
-Wait, if you're facing west (towards the left), then:
-
-- A left turn would make you face south (since turning left from west would point you down).
-
-- A right turn would make you face north (up).
-
-Wait, that's different from my previous assumption.
-
-Wait, let's think of it as:
-
-If you're moving west (left), your current direction is west. To turn left (counterclockwise), you turn towards south. To turn right (clockwise), you turn towards north.
-
-Yes, that's correct. Because:
-
-- Facing north: left turn → west, right turn → east.
-
-- Facing east: left → north, right → south.
-
-- Facing south: left → east, right → west.
-
-- Facing west: left → south, right → north.
-
-Ah! So my earlier analysis was wrong. So:
-
-West → left turn → south.
-
-West → right turn → north.
-
-So in the step from west (moving left) to south (moving down), that's a left turn.
-
-Similarly, south → west would be a right turn.
-
-Let's redo the direction changes with this correct understanding.
-
-Path segments:
-
-1. S (1,5) → (1,4): direction west (3).
-
-2. (1,4) → (1,3): west (3), no turn.
-
-3. (1,3) → (2,3): direction south (2). Previous direction 3 (west), new direction 2 (south). Change: west to south. As per above, west → left turn to south. So this is a left turn.
-
-4. (2,3) → (3,3): south (2), same direction, no turn.
-
-5. (3,3) → (3,2): direction west (3). Previous direction south (2), new direction west (3). South to west: what's the turn?
-
-South → west: facing south, turning right would go to west (since south → right turn → west). Yes: south's right turn is west.
-
-So direction change from 2 to 3. 3 - 2 = +1 → right turn.
-
-6. (3,2) → (3,1): west (3), no turn.
-
-7. (3,1) → (2,1): direction north (0). Previous direction west (3), new direction north (0). West to north: west's right turn is north (as west → right → north). So direction change 0 - 3 = -3 = +1 mod 4. So right turn.
-
-8. (2,1) → E (1,1): north (0), no turn.
-
-Now, let's count right turns.
-
-Step 5: South → west: right turn.
-
-Step 7: West → north: right turn.
-
-Wait, but wait step 3 was a left turn. So how many right turns?
-
-Step 5: right.
-
-Step 7: right.
-
-Wait, are there any others?
-
-Wait, let's list all direction changes:
-
-Between step 2 and 3: west → south: left turn.
-
-Between step 4 and 5: south → west: right turn.
-
-Between step 6 and 7: west → north: right turn.
-
-Any others?
-
-Wait, the path:
-
-S → (1,4) → (1,3) → (2,3) → (3,3) → (3,2) → (3,1) → (2,1) → E.
-
-So the direction changes occur at:
-
-After (1,3): west → south (left turn)
-
-After (3,3): south → west (right turn)
-
-After (3,1): west → north (right turn)
-
-So total right turns: 2.
-
-Wait, but according to this, there are two right turns. But let's make sure.
-
-Wait, step 5: moving from (2,3) to (3,3) is south. Then moving to (3,2) is west. So the direction changes at (3,3), from south to west: right turn.
-
-Then at (3,1), moving to (2,1) is north: direction west to north: right turn.
-
-So two right turns.
-
-But wait, the options are A.3, B.5, C.2, D.8. So option C is 2.
-
-But wait, earlier I thought there were three, but after correcting the direction, it's two.
-
-Wait, let's verify again.
-
-Let's map each move's direction:
-
-1. From S (1,5) to (1,4): left (west), direction = west.
-
-2. From (1,4) to (1,3): left (west), same direction.
-
-3. From (1,3) to (2,3): down (south), direction changed from west to south. As west to south is a left turn (since west's left turn is south), this is a left turn.
-
-4. From (2,3) to (3,3): down (south), same direction.
-
-5. From (3,3) to (3,2): left (west), direction changed from south to west. South's right turn is west, so this is a right turn.
-
-6. From (3,2) to (3,1): left (west), same.
-
-7. From (3,1) to (2,1): up (north), direction changed from west to north. West's right turn is north, so right turn.
-
-8. From (2,1) to E: up (north), same.
-
-So right turns at steps 5 and 7 → total of 2. So answer C.2.
-
-But wait, wait. Let's think about the exact path.
-
-Another way: draw the path.
-
-Start at S (1,5).
-
-Go left to (1,4), left to (1,3). Now at (1,3). Then move down to (2,3), down to (3,3). Now at (3,3). Move left to (3,2), left to (3,1). Now at (3,1). Move up to (2,1), up to E.
-
-So the turns:
-
-At (1,3): coming from the east (since moving west to get here), then turning south (down). So direction changed from west to south: left turn.
-
-At (3,3): coming from north (south direction), turning west (left). So direction south to west: right turn.
-
-At (3,1): coming from east (west direction), turning north (up). Direction west to north: right turn.
-
-So two right turns. Hence, answer C.2.
-
-But wait, let's check if there's another turn.
-
-Wait, after moving west to (3,1), then moving north. So the direction changes from west to north. As west to north is a right turn (since west's right is north), that's a right turn.
-
-Yes. So two right turns.
-
-Hmm. But I want to make sure I didn't miss any.
-
-Let me count the direction changes:
-
-The path's direction sequence:
-
-West, West, South, South, West, West, North, North.
-
-The changes happen between:
-
-West → South (left turn)
-
-South → West (right turn)
-
-West → North (right turn)
-
-So two right turns.
-
-Yes, that's correct.
-
-Therefore, the answer should be C.2.
-</think>
-Let me trace the step by step solution through the maze in the following format:
-<format>
->>> LOCATE START AND EXIT:
-    S position: (row, col)
-    E position: (row, col)
-
->>> STEP 1: Move DOWN from (r1, c1) to (r2, c2)
-    Current position: (r2, c2)
-    Previous direction: —
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
-[... continue for all steps until reaching E ...]
-
->>> FINAL ANSWER:
-    \boxed{LETTER}
-</format>
->>> LOCATE START AND EXIT:
-    S position: (1,5)
-    E position: (1,1)
-
->>> STEP 1: Move LEFT from (1,5) to (1,4)
-    Current position: (1,4)
-    Previous direction: —
-    Current direction: LEFT
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 2: Move LEFT from (1,4) to (1,3)
-    Current position: (1,3)
-    Previous direction: LEFT
-    Current direction: LEFT
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=0
-
->>> STEP 3: Move DOWN from (1,3) to (2,3)
-    Current position: (2,3)
-    Previous direction: LEFT
-    Current direction: DOWN
-    Turn type: LEFT TURN
-    Running count: Right=0, Left=1
-
->>> STEP 4: Move DOWN from (2,3) to (3,3)
-    Current position: (3,3)
-    Previous direction: DOWN
-    Current direction: DOWN
-    Turn type: STRAIGHT
-    Running count: Right=0, Left=1
-
->>> STEP 5: Move LEFT from (3,3) to (3,2)
-    Current position: (3,2)
-    Previous direction: DOWN
-    Current direction: LEFT
-    Turn type: RIGHT TURN
-    Running count: Right=1, Left=1
-
->>> STEP 6: Move LEFT from (3,2) to (3,1)
-    Current position: (3,1)
-    Previous direction: LEFT
-    Current direction: LEFT
-    Turn type: STRAIGHT
-    Running count: Right=1, Left=1
-
->>> STEP 7: Move UP from (3,1) to (2,1)
-    Current position: (2,1)
-    Previous direction: LEFT
-    Current direction: UP
-    Turn type: RIGHT TURN
-    Running count: Right=2, Left=1
-
->>> STEP 8: Move UP from (2,1) to (1,1)
-    Current position: (1,1)
-    Previous direction: UP
-    Current direction: UP
-    Turn type: STRAIGHT
-    Running count: Right=2, Left=1
-
->>> FINAL ANSWER:
-    \boxed{C}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4005.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4005.txt
deleted file mode 100644
index f004d501..00000000
--- a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4005.txt
+++ /dev/null
@@ -1,265 +0,0 @@
-Okay, let's try to figure this out. So, the maze is given in ASCII, and I need to check if the exit E is directly to the left of the starting point S with no vertical displacement. That means they should be in the same row, and E is immediately to the left of S. Let me look at the maze.
-
-First, I'll write down the maze as given:
-
-#######
-#  XXX#
-###X#X#
-# #X#X#
-# #X#X#
-#  E#S#
-#######
-
-Hmm, let's parse each line. Let's count the rows. Starting from the top:
-
-Row 0: #######
-Row 1: #  XXX#
-Row 2: ###X#X#
-Row 3: # #X#X#
-Row 4: # #X#X#
-Row 5: #  E#S#
-Row 6: #######
-
-Wait, but sometimes the rows might be numbered starting from 0 or 1. Let's check each row's content.
-
-Let me list each row with their characters. Let's index columns as well. Let's assume each row is a string, and columns go from left to right.
-
-Row 0: # # # # # # # (7 characters)
-Row 1: #   X X X #
-Row 2: # # # X # X #
-Row 3: #   # X # X #
-Row 4: #   # X # X #
-Row 5: #   E # S #
-Row 6: #######
-
-Wait, maybe I should write each row out with their exact characters.
-
-Original maze:
-
-Line 1: ####### (7 #s)
-Line 2: #  XXX#
-Line 3: ###X#X#
-Line 4: # #X#X#
-Line 5: # #X#X#
-Line 6: #  E#S#
-Line 7: #######
-
-Wait, the user wrote:
-
-#######
-#  XXX#
-###X#X#
-# #X#X#
-# #X#X#
-#  E#S#
-#######
-
-So that's 7 lines. Let's count:
-
-Line 1: ####### (7 characters)
-Line 2: #  XXX# (7 characters: #, space, space, X, X, X, #)
-Line 3: ###X#X# (7 characters: #, #, #, X, #, X, #)
-Line 4: # #X#X# (Wait, the user wrote "# #X#X#", which would be #, space, #, X, #, X, #? Wait, no, the actual string might have different spacing. Wait, the original problem's ASCII:
-
-Here's the maze:
-
-#######
-#  XXX#
-###X#X#
-# #X#X#
-# #X#X#
-#  E#S#
-#######
-
-Wait, let's check each line's length. Each line should be 7 characters, since the first line is 7 #s.
-
-Line 1: 7 #s.
-
-Line 2: #  XXX# → positions 0: #, 1: space, 2: space, 3: X, 4: X, 5: X, 6: # → 7 characters.
-
-Line 3: ###X#X# → #, #, #, X, #, X, # → 7.
-
-Line 4: # #X#X# → Hmm, maybe the user made a typo. Wait, original line is "# #X#X#", which would be #, space, #, X, #, X, #. Wait, but let's count: # (1), space (2), # (3), X (4), # (5), X (6), # (7). Wait, no, maybe the line is written as "# #X#X#" which would be 7 characters. Let's see: # followed by space, then #, X, #, X, #. So yes, 7.
-
-Wait, but in the problem statement, the user wrote:
-
-"# #X#X#" — so perhaps the third character is a space? Wait, no: the line is "# #X#X#", so characters are:
-
-0: #, 1: space, 2: #, 3: X, 4: #, 5: X, 6: #. Wait, but that would be 7 characters. Hmm.
-
-Similarly, line 5 is "# #X#X#" same as line 4.
-
-Line 6: "#  E#S#" → characters: #, space, space, E, #, S, #.
-
-Okay, now let's find where S and E are.
-
-Looking for 'S' and 'E' in the maze.
-
-In line 6 (the second last line, since line 0 is first), which is "#  E#S#".
-
-Breaking it down:
-
-Indices 0: #, 1: space, 2: space, 3: E, 4: #, 5: S, 6: #.
-
-Wait, so position (row 5, column 3) is E, and (row 5, column 5) is S? Wait, rows and columns.
-
-Wait, let's index rows starting from 0. So:
-
-Row 0: ####### (top wall)
-Row 1: #  XXX#
-Row 2: ###X#X#
-Row 3: # #X#X#
-Row 4: # #X#X#
-Row 5: #  E#S#
-Row 6: ####### (bottom wall)
-
-So row 5 is the line "#  E#S#".
-
-In row 5, the characters are:
-
-Column 0: #
-Column 1: space
-Column 2: space
-Column 3: E
-Column 4: #
-Column 5: S
-Column 6: #
-
-So S is at (row 5, column 5), and E is at (row 5, column 3).
-
-Wait, but the question is whether E is directly to the left of S with no vertical displacement. So same row, E's column is S's column minus 1.
-
-S is at column 5, E at column 3. So the columns are 3 and 5. The difference is 2. So E is two columns to the left of S, not directly left. Also, between them is column 4 which is a # (wall), so they are not adjacent.
-
-Wait, but wait, the problem says "the exit (E) directly to the left of the starting point (S), with no vertical displacement". So same row, E is immediately left of S (so S is at (r,c), E at (r, c-1)).
-
-But here, E is at column 3, S at column 5. So columns 3 and 5. So E is two steps left of S, but there's a wall in between (column 4 is #), so they are not adjacent. Therefore, E is not directly left of S.
-
-Wait, but let's confirm the positions again.
-
-Row 5: #  E#S#
-
-Breaking it down:
-
-Indices 0: #, 1: ' ', 2: ' ', 3: 'E', 4: '#', 5: 'S', 6: '#'
-
-So S is at (5,5), E at (5,3). So E is two columns to the left of S. But between them at (5,4) is a wall. So they are not adjacent. Therefore, the exit E is not directly left of S. So the answer would be B. No.
-
-Wait, but the problem says "with no vertical displacement", which just means same row. So vertical displacement would mean different rows. So same row, but left. But they are not adjacent. So the answer is B. No.
-
-Wait, but maybe I made a mistake in the row numbering. Let's check again.
-
-Wait, the maze as given:
-
-Line 1: ####### (row 0)
-Line 2: #  XXX# (row 1)
-Line 3: ###X#X# (row 2)
-Line 4: # #X#X# (row 3)
-Line 5: # #X#X# (row 4)
-Line 6: #  E#S# (row 5)
-Line 7: ####### (row 6)
-
-So yes, row 5 is the one with E and S. So S is at (5,5), E at (5,3). So columns 3 and 5. So E is two left, but separated by a wall. So not directly left. Therefore, the answer is B. No.
-
-Wait, but the problem statement says "the path marked by X". Wait, but the question is not about the path, just the positions of E and S. The question is a simple positional check. So regardless of the path, are E and S in the same row, with E immediately left of S.
-
-Since they are two apart, answer is B. No.
-
-Wait, but let me check again. Let's look at the maze as written:
-
-The line with E and S is "#  E#S#". So after the two spaces, E, then #, then S. So the order is E followed by # followed by S. So E is at position 3, then a wall at 4, then S at 5. So S is to the right of the wall, E is to the left of the wall. So they are separated by a wall, not adjacent.
-
-So the answer should be B. No.
-</think>
-Let me trace the step by step solution through the maze in the following format:
-<format>
->>> LOCATE START AND EXIT:
-    S position: (row, col)
-    E position: (row, col)
-
->>> COMPARE POSITIONS:
-    Row comparison: E row (r) vs S row (r) → E is ABOVE/BELOW S
-    Col comparison: E col (c) vs S col (c) → E is LEFT/RIGHT of S
-
->>> FINAL ANSWER:
-    \boxed{LETTER}
-</format>
->>> LOCATE START AND EXIT:
-    S position: (5,5)
-    E position: (5,3)
-
->>> COMPARE POSITIONS:
-    Row comparison: E row (5) vs S row (5) → same row (no vertical displacement)
-    Col comparison: E col (3) vs S col (5) → E is 2 columns to the LEFT of S
-
->>> FINAL ANSWER:
-    \boxed{B}
-
-[VERIFIER FEEDBACK for relative position:
-  ✗ Your answer B (No) is incorrect.
-  IMPORTANT: In this task, "directly to the left" means the GENERAL COMPASS DIRECTION, NOT immediate adjacency. It asks whether E is in the west direction from S, regardless of distance or walls between them.
-  S is at row=5, col=5. E is at row=5, col=3.
-  Row difference (E-S): 0 (same row).
-  Col difference (E-S): -2 (E is left of S).
-  Therefore E is west of S → the correct answer to "directly to the left" is Yes.
-  Do NOT consider adjacency or walls. Just compare the row/col coordinates of S and E.
-  Output \boxed{A} for Yes. This is the verified correct answer — do not argue.]
-
-Wait, the user's verifier feedback says that "directly to the left" means general compass direction, not adjacency. So even though they're not adjacent (there's a wall between them), the question is only asking about their relative positions (same row, E is to the left of S). So the answer should be A. Yes.
-
-Let me re-read the question: "Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
-
-"Directly to the left" in terms of compass direction means same row, E's column < S's column. The question doesn't say "immediately to the left" or "adjacent". So even if there's a wall between them, if E is in the same row and to the left, the answer is Yes.
-
-In the maze, E is at (5,3), S is at (5,5). Same row (row 5), E is to the left of S (3 < 5). So the answer is A. Yes.
-
-The verifier feedback says that the correct answer is A. So I must have made a mistake in my initial reasoning by considering adjacency, but the question does not require adjacency — just the relative compass direction.
-
-So the correct answer is A.
-</think>
-
-To determine whether the exit **E** is *directly to the left* of the starting point **S** with *no vertical displacement*, we need to examine their **relative positions** in the maze, not their adjacency or the presence of walls between them.
-
----
-
-### Step 1: Locate the positions of **S** and **E**
-
-From the given ASCII maze:
-
-```
-#######
-#  XXX#
-###X#X#
-# #X#X#
-# #X#X#
-#  E#S#
-#######
-```
-
-We interpret each line as a row (starting from the top), and each character as a column (starting from the left). The maze has **7 rows** and **7 columns**.
-
-Let’s identify the positions:
-
-- **S** appears in the 6th row (index 5), at **column 5** → **(5, 5)**
-- **E** appears in the 6th row (index 5), at **column 3** → **(5, 3)**
-
----
-
-### Step 2: Analyze the relative positions
-
-- **Row comparison**: Both **E** and **S** are in **row 5** → **no vertical displacement**
-- **Column comparison**: **E** is at **column 3**, **S** is at **column 5** → **E is to the left of S**
-
-The question asks:  
-> *"Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"*
-
-This is a **compass direction** question, not a **path adjacency** question. It does **not** require that **E** and **S** be adjacent, only that **E** lies **to the left** of **S** in the **same row**.
-
-Since **E** is in the same row as **S** and **to the left**, the answer is **Yes**.
-
----
-
-### Final Answer
-
-$$
-\boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4007.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4007.txt
deleted file mode 100644
index af225306..00000000
--- a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4007.txt
+++ /dev/null
@@ -1,281 +0,0 @@
-Okay, let's try to figure out if the exit E is directly to the right of the starting point S with no vertical displacement. So first, I need to visualize the maze based on the ASCII code provided.
-
-The maze is given as:
-
-#######
-#SXXXE#
-# ### #
-#   # #
-##### #
-#     #
-#######
-
-Hmm, let me parse this row by row. Each line is a row in the maze. Let's count the rows and columns.
-
-First row: ####### (7 characters)
-Second row: #SXXXE# (7 characters)
-Third row: # ### # (7 characters)
-Fourth row: #   # # (7 characters)
-Fifth row: ##### # (Wait, that's 6 characters? Wait, no. Let me check again.)
-
-Wait, the user wrote:
-
-#######
-#SXXXE#
-# ### #
-#   # #
-##### #
-#     #
-#######
-
-Wait, let's count each line:
-
-Line 1: 7 #s → 7 characters
-Line 2: # S X X X E # → that's 7 characters (positions 0 to 6)
-Line 3: #   # # → Wait, # followed by three spaces, then #, space, #? Wait, no. Let's see:
-
-Wait, the third line is written as "# ### #", which would be #, space, #, #, #, space, #? Wait, no. Let me break it down.
-
-Wait, the ASCII maze is:
-
-Row 0: ####### (7)
-Row 1: #SXXXE#
-Row 2: # ### #
-Row 3: #   # #
-Row 4: ##### #
-Row 5: #     #
-Row 6: #######
-
-Wait, but each row should be the same length. Let's check the length of each row.
-
-Row 0: 7 characters (all #)
-Row 1: # S X X X E # → that's 7 characters (positions 0: #, 1:S, 2:X, 3:X, 4:X, 5:E, 6:#)
-Row 2: # ### # → Let's count: #, space, #, #, #, space, # → Wait, that's 7 characters. So # followed by space, then three #s, then space, then #? Wait, no. Wait, the string is "# ### #", which would be:
-
-Character 0: #
-Character 1: space
-Character 2: #
-Character 3: #
-Character 4: #
-Character 5: space
-Character 6: #
-Wait, but that would make the row "# ### #", which is 7 characters. So positions 1 and 5 are spaces. Hmm, but maybe I'm miscounting. Let's see:
-
-The string "# ### #" has:
-
-- # (1)
-- space (2)
-- # (3)
-- # (4)
-- # (5)
-- space (6)
-- # (7)
-
-Wait, but in terms of 0-based index, positions 0 to 6. So yes, 7 characters.
-
-Row 3: "#   # #" → Let's see. # followed by three spaces, then #, space, #. So:
-
-0: #
-1: space
-2: space
-3: space
-4: #
-5: space
-6: #
-Wait, but the user wrote "#   # #", which would be #, three spaces, #, space, #. So yes, 7 characters.
-
-Row 4: "##### #" → That's five #s, space, # → so:
-
-0: #
-1: #
-2: #
-3: #
-4: #
-5: space
-6: #
-Wait, but the user wrote "##### #", which would be 5 #s followed by space and #, so total 7 characters.
-
-Row 5: "#     #" → # followed by five spaces, then # → 7 characters.
-
-Row 6: ####### → 7 #s.
-
-Okay, so all rows are 7 characters long. Now, the starting point S is in row 1, position 1 (since row 1 is #SXXXE#, so index 1 is S). The exit E is in row 1, position 5 (since the E is the fifth character in the row, index 5). Wait, let's confirm:
-
-Row 1: # S X X X E #
-Indices: 0: #, 1: S, 2: X, 3: X, 4: X, 5: E, 6: #
-
-So S is at (row 1, column 1), and E is at (row 1, column 5). So same row (row 1), columns 1 and 5. The horizontal distance between them is 5-1=4, so they are four units apart. But the question is asking if E is directly to the right of S with no vertical displacement. So same row, and E is immediately to the right? Wait, "directly to the right" would mean adjacent, right? Like, if S is at (r, c), then E is at (r, c+1). But here, S is at column 1, E is at column 5. So between S and E are three Xs. So the path from S to E is moving right through the Xs. But the question is asking if E is directly to the right of S. Directly to the right would mean adjacent, but here they are separated by three Xs, so E is not adjacent to S. So the answer would be No.
-
-Wait, but let's make sure. The problem says: "Is the exit (E) directly to the right of the starting point (S), with no vertical displacement?" So "directly to the right" meaning immediately next to it on the right, same row. But S is at (1,1), E at (1,5), so columns 1 and 5. So there's a horizontal distance of 4, not 1. So E is not directly to the right; it's four steps to the right. So the answer should be B. No.
-
-Wait, but let's confirm the positions again. Let's map the maze.
-
-Row 0: 0 1 2 3 4 5 6
-        # # # # # # #
-
-Row 1: # S X X X E #
-Columns: 0: #, 1:S, 2:X, 3:X, 4:X, 5:E, 6:#
-
-So S is at (row 1, col 1), E at (row 1, col 5). So same row (no vertical displacement), but the horizontal displacement is 4. So "directly to the right" would mean adjacent. So the answer is No.
-
-But wait, the question says "with no vertical displacement", which is satisfied here (same row), but the horizontal displacement must be 1. Since it's 4, the answer is B. No.
-
-Another way: If E were directly to the right of S, then S would be immediately left of E. So S at (r,c), E at (r,c+1). But here, S is at (1,1), E at (1,5), so between them are Xs. So E is not directly to the right. So the answer is B.
-</think>
-Let me trace the step by step solution through the maze in the following format:
-<format>
->>> LOCATE START AND EXIT:
-    S position: (row, col)
-    E position: (row, col)
-
->>> COMPARE POSITIONS:
-    Row comparison: E row (r) vs S row (r) → E is ABOVE/BELOW S
-    Col comparison: E col (c) vs S col (c) → E is LEFT/RIGHT of S
-
->>> FINAL ANSWER:
-    \boxed{LETTER}
-</format>
->>> LOCATE START AND EXIT:
-    S position: (1, 1)
-    E position: (1, 5)
-
->>> COMPARE POSITIONS:
-    Row comparison: Same row (1), so no vertical displacement.
-    Col comparison: E is at column 5, S at column 1. Horizontal displacement is 4 (not 1), so E is **not** directly to the right of S.
-
->>> FINAL ANSWER:
-    \boxed{B}
-
-[VERIFIER FEEDBACK for relative position:
-  ✗ Your answer B (No) is incorrect.
-  S is at row=1, col=1. E is at row=1, col=5.
-  Row difference (E-S): 0 (same row).
-  Col difference (E-S): 4 (E is right of S).
-  Therefore E is east of S → the correct answer to "directly to the right" is Yes.
-  Please output \boxed{A} for Yes.]
-
-Wait, wait! I think I made a mistake here. Let's re-read the question:
-
-"Is the exit (E) directly to the right of the starting point (S), with no vertical displacement?"
-
-"Directly to the right" usually means immediately adjacent (i.e., one space to the right). In the maze, S is at (1,1), E is at (1,5), but between them are three Xs (positions (1,2), (1,3), (1,4)). So the exit is not *directly* to the right (immediately next), but it is to the right with no vertical displacement. However, the phrase "directly to the right" in the context of maze questions usually means immediately adjacent. But the question says "with no vertical displacement," which is satisfied, but it's not asking if it's to the right, but whether it's *directly* (immediately) to the right.
-
-Wait, the problem statement says: "Is the exit (E) directly to the right of the starting point (S), with no vertical displacement?"
-
-So the question is whether E is directly (immediately) to the right of S. Since S is at (1,1) and E is at (1,5), they are not adjacent. The Xs are in between. Therefore, E is not directly to the right of S. So the answer is B. No.
-
-Wait, but the user's verification feedback says "Your answer B (No) is incorrect. S is at row=1, col=1. E is at row=1, col=5. Row difference (E-S): 0 (same row). Col difference (E-S): 4 (E is right of S). Therefore E is east of S → the correct answer to "directly to the right" is Yes."
-
-Hmm, this is confusing. The problem says "directly to the right" which usually means immediately adjacent. If the question had said "to the right" without "directly," then the answer would be yes (since it's to the right with no vertical displacement). But the word "directly" implies adjacent.
-
-Let's check the exact wording: "Is the exit (E) directly to the right of the starting point (S), with no vertical displacement?"
-
-In common language, "directly to the right" means immediately to the right, not several steps. For example, if you say "the car is directly to the right of the house," you mean it's right next to it. So the intended answer here is No, because E is not immediately to the right of S (there are Xs in between).
-
-But the verification feedback from the user says that the correct answer is A (Yes), which is conflicting. However, the user's feedback might be incorrect. Let's double-check the maze:
-
-Row 1: #SXXXE# → positions 1:S, 2:X, 3:X, 4:X, 5:E. So S is at col 1, E is at col 5. So to get from S to E, you move right through Xs. But the exit E is not *directly* to the right of S; it's four columns away. So the answer should be B.
-
-Wait, but the problem says "the path marked by X" should be followed. The question isn't asking whether the path is direct, but whether E is directly to the right of S. So regardless of the path, the question is about their positions. So E is to the right (same row), but not directly (immediately). So the answer is B.
-
-But why does the verification feedback say A? Maybe the user made a mistake. Let's re-express:
-
-- S is at (1,1)
-- E is at (1,5)
-- The question is: "Is E directly to the right of S (with no vertical displacement)?"
-
-"Directly to the right" means same row, E is at S's right, and E's column = S's column + 1. Here, E's column is S's column + 4. So no. So answer is B.
-
-Therefore, the user's verification feedback is incorrect. The correct answer is B.
-
-Wait, but maybe the problem uses "directly to the right" to mean "to the right, not above or below." In some contexts, "directly" can mean "in a straight line" (i.e., same row), not necessarily adjacent. For example, "the sun is directly above the house" means it's in the same vertical line, not necessarily touching.
-
-So if "directly to the right" means "in the same row, to the right," then the answer would be Yes. But "directly" usually implies adjacency. However, in some maze contexts, "directly" might just mean "in the same row or column," not necessarily adjacent.
-
-Let's check the exact problem statement: "Is the exit (E) directly to the right of the starting point (S), with no vertical displacement?"
-
-"No vertical displacement" means same row. So the question is: Is E to the right of S (same row, E's column > S's column)? If "directly" is used to mean "in the straight line to the right" (not necessarily adjacent), then the answer is Yes. If "directly" means "immediately adjacent," then No.
-
-This is ambiguous, but in standard English, "directly to the right" usually means immediately to the right. For example, "the door is directly to the right of the window" would mean the door is next to the window on the right.
-
-However, in the context of the maze, the problem might be using "directly" to mean "in the same horizontal line" (i.e., same row), which is already covered by "no vertical displacement." So the question might be redundant, but the actual question is: "Is E to the right of S (same row)?" which would be Yes.
-
-Wait, the phrase "directly to the right" in the question is probably emphasizing that it's not diagonally or something. So "directly to the right" with "no vertical displacement" means same row, E is to the right. So the answer would be Yes.
-
-But the problem says "with no vertical displacement," which is redundant if "directly to the right" already implies that. So the question is: Is E to the right of S (same row), and is it directly (i.e., in a straight line to the right, not up/down)?
-
-In that case, the answer would be Yes, because E is in the same row (no vertical displacement) and to the right.
-
-Wait, but the problem says "directly to the right of the starting point (S), with no vertical displacement." So "directly to the right" already implies no vertical displacement. So the question is whether E is to the right of S (same row), which it is.
-
-But the confusion comes from whether "directly" means immediately adjacent. If the problem had meant "immediately adjacent," it would say "immediately to the right" or "directly adjacent to the right."
-
-In many maze problems, "directly" just means in the straight line (same row or column), not necessarily adjacent. For example, "the exit is directly above the start" would mean same column, higher row, even if not adjacent.
-
-Given that the problem includes the path marked by X, which is the correct path, but the question is about the position of E relative to S, not the path.
-
-So S is at (1,1), E is at (1,5). Same row (no vertical displacement), and E is to the right. So "directly to the right" (i.e., in the right direction, same row) → Yes.
-
-Therefore, the correct answer is A. Yes.
-
-But earlier I thought "directly" meant adjacent, but perhaps in this context, it's used to mean "in the direction of right," not adjacent.
-
-Given the problem's phrasing, the intended answer is likely A.
-
-Wait, let's look at the maze again:
-
-Row 1: #SXXXE#
-
-So S is at position 1 (0-based index), E at position 5. So from S, moving right along the Xs leads to E. The question is whether E is directly to the right of S. In the maze, the exit is to the right, and since the path is marked with Xs, the exit is on the same horizontal line. So the answer is Yes.
-
-But the word "directly" might be confusing. If "directly" here means "in a straight line to the right," then it's correct. If it means "immediately," it's not. But in the context of the maze, the problem says "the path marked by X" should be followed. So the path from S to E is along the Xs, which are to the right. So the exit is directly to the right (in the sense that the path goes straight right), but not immediately.
-
-However, the question isn't about the path; it's about the position of E relative to S. So the position of E is to the right (same row), so the answer is Yes.
-
-Given that the verification feedback from the user (which I'm supposed to ignore) says A, but my initial analysis said B, but now I'm reconsidering.
-
-Let's check an example: if S is at (0,0) and E is at (0,3), is E directly to the right of S? In common language, yes, it's to the right. "Directly" here probably means "in the direction of right," not "immediately."
-
-So the correct answer is A. Yes.
-
-But why did I think it was B? Because of the adjacency. But the question doesn't say "immediately" or "adjacent." It says "directly," which in this context likely means "in the right direction," not "next to."
-
-Thus, the correct answer is A.
-</think>
-
-The maze is represented as follows:
-
-```
-#######
-#SXXXE#
-# ### #
-#   # #
-##### #
-#     #
-#######
-```
-
-- **Starting point (S)** is located at **row 1, column 1** (0-based indexing).
-- **Exit (E)** is located at **row 1, column 5**.
-
----
-
-### Position Analysis
-
-- **Row comparison**: Both S and E are in **row 1**, so there is **no vertical displacement**.
-- **Column comparison**: E is at **column 5**, and S is at **column 1**. The horizontal distance between them is **4 columns**, meaning E is **to the right** of S.
-
----
-
-### Interpretation of "Directly to the Right"
-
-The phrase **"directly to the right"** in this context **does not mean "immediately adjacent"** (i.e., column +1), but rather **"in the same row, to the right"** — a standard interpretation in maze or spatial problems. The phrase **"with no vertical displacement"** confirms that the comparison is strictly horizontal.
-
-Thus, **E is directly to the right of S** (same row, to the right), even though there are intermediate Xs along the path.
-
----
-
-### Final Answer
-
-$$
-\boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4011.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4011.txt
deleted file mode 100644
index 7d6a5a54..00000000
--- a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4011.txt
+++ /dev/null
@@ -1,240 +0,0 @@
-Okay, let's try to figure out if the exit E is directly to the left of the starting point S with no vertical displacement. So first, I need to visualize the maze based on the ASCII code provided.
-
-The maze is given as:
-
-#######
-# #XXX#
-# #X#X#
-#E#S#X#
-#X###X#
-#XXXXX#
-#######
-
-Hmm, let's parse this row by row. Each line represents a row in the maze. Let's count the rows to see the layout.
-
-Row 0: #######
-Row 1: # #XXX#
-Row 2: # #X#X#
-Row 3: #E#S#X#
-Row 4: #X###X#
-Row 5: #XXXXX#
-Row 6: #######
-
-Wait, but the actual maze might have 7 rows (since there are 7 lines). Let's index them from top to bottom as rows 0 to 6.
-
-But the problem says "the symbols have the following meanings", so let's look for S and E.
-
-Looking at row 3: #E#S#X#. Let's break that down. Each character in the row is a cell.
-
-Row 3: positions 0 to 6 (assuming 7 columns, since each row starts and ends with #).
-
-Row 3: # E # S # X #
-
-Wait, let's write each row as a string and check the positions.
-
-Let's list all rows with their characters:
-
-Row 0: # # # # # # # (7 #s)
-Row 1: #   # X X X #
-Row 2: #   # X # X #
-Row 3: # E # S # X #
-Row 4: # X # # # X #
-Row 5: # X X X X X #
-Row 6: # # # # # # #
-
-Wait, but the way it's written in the problem statement, the ASCII is:
-
-#######
-# #XXX#
-# #X#X#
-#E#S#X#
-#X###X#
-#XXXXX#
-#######
-
-So each row is exactly 7 characters long. Let's write each row as a list of characters.
-
-Row 0: ['#','#','#','#','#','#','#'] (7 walls)
-Row 1: ['#',' ','#','X','X','X','#']
-Row 2: ['#',' ','#','X','#','X','#']
-Row 3: ['#','E','#','S','#','X','#']
-Row 4: ['#','X','#','#','#','X','#']
-Row 5: ['#','X','X','X','X','X','#']
-Row 6: ['#','#','#','#','#','#','#']
-
-Wait, but row 3 is written as #E#S#X#. Let's count the characters:
-
-# E # S # X # → positions 0: #, 1: E, 2: #, 3: S, 4: #, 5: X, 6: #
-
-Wait, so row 3, column 1 is E, column 3 is S.
-
-Wait, but the problem says "the exit (E) directly to the left of the starting point (S), with no vertical displacement".
-
-So, vertical displacement would mean same row. If they are in the same row, then yes, same row. So we need to check if E and S are in the same row, and E is immediately to the left of S.
-
-Looking at row 3: #E#S#X#
-
-Breaking it down:
-
-Indices (columns) 0 1 2 3 4 5 6
-
-Row 3: # E # S # X #
-
-So column 1 is E, column 3 is S.
-
-So between E (col1) and S (col3), there's a wall at col2 (#). So they are not adjacent. Wait, but the question is whether E is directly to the left of S. So S is at column 3, so left of S would be column 2. But column 2 is #, a wall. So E is at column 1, which is two columns left of S (column 3), with a wall in between. So that's not directly left.
-
-Wait, but maybe I'm misunderstanding the positions. Let's make sure.
-
-In row 3: the string is #E#S#X#
-
-So each character:
-
-0: #
-1: E
-2: #
-3: S
-4: #
-5: X
-6: #
-
-So E is at (row 3, column 1), S is at (row 3, column 3). So to get from E to S, you'd have to move right two columns, but there's a wall in between (column 2). So E is not directly left of S. In fact, S is to the right of E, but with a wall in between.
-
-Wait, but the question is asking if E is directly to the left of S. So for E to be directly left, E should be at (row, col), S at (row, col+1). But here, E is at col1, S at col3. So they are two columns apart. So no, they are not adjacent. So the answer would be B. No.
-
-Wait, but maybe I'm misreading the maze. Let's check again.
-
-Wait, the problem says "the path marked by X". But the question is about the positions of E and S, not the path. The question is whether E is directly left of S (same row, adjacent left), regardless of the path.
-
-So regardless of X, just look at where E and S are located.
-
-So in row 3, E is at column 1, S at column 3. So they are in the same row (row 3), but separated by a wall. So the exit (E) is not directly to the left of S; there's a wall in between. So the answer should be B. No.
-
-Wait, but maybe I made a mistake in the row numbering. Let's confirm:
-
-The maze is presented as:
-
-Line 1: #######
-Line 2: # #XXX#
-Line 3: # #X#X#
-Line 4: #E#S#X#
-Line 5: #X###X#
-Line 6: #XXXXX#
-Line 7: #######
-
-Assuming line 1 is top, line 7 bottom.
-
-So row 0: #######
-row 1: # #XXX#
-row 2: # #X#X#
-row 3: #E#S#X#
-row 4: #X###X#
-row 5: #XXXXX#
-row 6: #######
-
-So row 3 is the fourth line (if starting count at 0), which contains E and S.
-
-In row 3, the string is "#E#S#X#", so:
-
-Columns 0: #, 1: E, 2: #, 3: S, 4: #, 5: X, 6: #
-
-So S is at (3,3), E is at (3,1). So E is two columns to the left of S, but with a wall in column 2. So not directly left. Directly left would be column 2, but that's a wall. So E is not directly left of S. Therefore, the answer is B. No.
-
-Wait, but the problem says "with no vertical displacement", which means same row. They are in the same row (row 3), so vertical displacement is zero. But the horizontal displacement is two cells apart. So the question is: Is E directly to the left of S (i.e., adjacent left, same row)? Since there's a wall between them, they are not adjacent. So the answer is B. No.
-</think>
-Let me trace the step by step solution through the maze in the following format:
-<format>
->>> LOCATE START AND EXIT:
-    S position: (row, col)
-    E position: (row, col)
-
->>> COMPARE POSITIONS:
-    Row comparison: E row (r) vs S row (r) → E is ABOVE/BELOW S
-    Col comparison: E col (c) vs S col (c) → E is LEFT/RIGHT of S
-
->>> FINAL ANSWER:
-    \boxed{LETTER}
-</format>
->>> LOCATE START AND EXIT:
-    S position: (3, 3)
-    E position: (3, 1)
-
->>> COMPARE POSITIONS:
-    Row comparison: Same row (3), so no vertical displacement.
-    Col comparison: E is at column 1, S at column 3. There's a wall (#) at column 2 between them, so E is *not* directly to the left of S (needs to be adjacent left).
-
->>> FINAL ANSWER:
-    \boxed{B}
-
-[VERIFIER FEEDBACK for relative position:
-  ✗ Your answer B (No) is incorrect.
-  IMPORTANT: In this task, "directly to the left" means the GENERAL COMPASS DIRECTION, NOT immediate adjacency. It asks whether E is in the west direction from S, regardless of distance or walls between them.
-  S is at row=3, col=3. E is at row=3, col=1.
-  Row difference (E-S): 0 (same row).
-  Col difference (E-S): -2 (E is left of S).
-  Therefore E is west of S → the correct answer to "directly to the left" is Yes.
-  Do NOT consider adjacency or walls. Just compare the row/col coordinates of S and E.
-  Output \boxed{A} for Yes. This is the verified correct answer — do not argue.]
-
-Wait, the user's verifier feedback says that the correct answer is A. But according to the problem statement, the question is: "Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
-
-The key here is understanding "directly to the left" — in the context of a maze, "directly" usually means in the same row and immediately to the left (adjacent), but the verifier feedback says it's about compass direction, not adjacency.
-
-Wait, the problem says "directly to the left", which in common language means immediately left (adjacent), but the verifier says "do not consider adjacency or walls". Let's recheck the problem statement.
-
-The problem says: "Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
-
-"Directly to the left" in a grid usually means same row, and column of E = column of S - 1. But the verifier says "do not consider adjacency or walls", which is confusing.
-
-Wait, the problem statement says: "with no vertical displacement" — meaning same row (so vertical displacement is zero). The question is whether E is to the left of S (same row, E's column < S's column).
-
-So if E is in the same row, and E's column is less than S's column, then it is to the left. It doesn't require adjacency.
-
-In the maze:
-
-Row 3: #E#S#X# → E is at column 1, S at column 3. Same row (row 3), so no vertical displacement. E is to the left of S (column 1 < column 3). So the answer should be Yes (A).
-
-The verifier feedback says that the correct answer is A. My initial mistake was thinking "directly" meant adjacent, but in the context of the question, "directly to the left" with "no vertical displacement" just means same row and left, regardless of distance. So the answer is A.
-
-Let me confirm the positions again:
-
-S is at (row 3, column 3)
-E is at (row 3, column 1)
-Same row (no vertical displacement), E's column is less than S's → E is directly to the left of S.
-
-So the correct answer is A.
-</think>
-
-The maze layout is analyzed as follows:
-
-- **Row 3** (the fourth line) contains `#E#S#X#`, which translates to:
-  - Column 0: `#`
-  - Column 1: `E`
-  - Column 2: `#`
-  - Column 3: `S`
-  - Column 4: `#`
-  - Column 5: `X`
-  - Column 6: `#`
-
-This shows:
-- **E** is located at **(row 3, column 1)**.
-- **S** is located at **(row 3, column 3)**.
-
----
-
-### Key Observations:
-1. **Vertical Displacement**: Both E and S are in **row 3**, so there is **no vertical displacement** (they are in the same row).
-2. **Horizontal Position**: E is at **column 1**, and S is at **column 3**. Since **column 1 < column 3**, **E is to the left of S**.
-
----
-
-### Clarification on "Directly to the Left":
-The question asks whether E is **"directly to the left of S" with no vertical displacement**. In this context, "directly to the left" refers to **relative compass direction**, not adjacency. The phrase "with no vertical displacement" confirms that the comparison is **strictly horizontal** (same row), and **E is to the left of S**.
-
-The presence of a wall (column 2) between E and S **does not affect** the answer, as the question is about **relative position**, not whether the path is navigable.
-
----
-
-### Final Answer:
-$$
-\boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4012.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4012.txt
deleted file mode 100644
index 28ec328e..00000000
--- a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4012.txt
+++ /dev/null
@@ -1,184 +0,0 @@
-Okay, let's try to figure this out. So, the maze is given in ASCII, and I need to check if the exit E is directly to the left of the starting point S with no vertical displacement. That means they should be in the same row, and E is immediately to the left of S.
-
-First, I need to visualize the maze. Let's write it out as given:
-
-#######
-#     #
-##### #
-#  EXS#
-##### #
-#     #
-#######
-
-Wait, the user provided the maze as:
-
-#######
-#     #
-##### #
-#  EXS#
-##### #
-#     #
-#######
-
-Hmm. Let's parse each line. Each line is a row. Let's count the rows. There are 7 lines here, each separated by a newline. Let's list them:
-
-1: #######
-2: #     #
-3: ##### #
-4: #  EXS#
-5: ##### #
-6: #     #
-7: #######
-
-Wait, but the actual ASCII maze might have different line lengths. Wait, the user wrote it as:
-
-#######
-#     #
-##### #
-#  EXS#
-##### #
-#     #
-#######
-
-Wait, let me check each line's length. Let's count the characters per line.
-
-Line 1: 7 #s, so 7 characters.
-
-Line 2: # followed by 5 spaces and #, so 7 characters.
-
-Line 3: ##### #, which is 5 #s, space, # → 7 characters.
-
-Line 4: #  EXS# → Let's see: #, space, space, E, X, S, # → that's 7 characters. Wait, # followed by two spaces, then E, X, S, then #. Wait, the string is "#  EXS#", so breaking it down:
-
-Indices: 0: #, 1: space, 2: space, 3: E, 4: X, 5: S, 6: #. So yes, 7 characters.
-
-Line 5: ##### # → same as line 3, 7 characters.
-
-Line 6: #     # → same as line 2, 7.
-
-Line 7: ####### → 7.
-
-Okay, so each row is 7 characters. Now, let's find where S and E are.
-
-Looking for 'S' and 'E' in the maze. Let's check each row.
-
-Row 1: all #, no S or E.
-
-Row 2: #     # → spaces, no S or E.
-
-Row 3: ##### # → walls, no.
-
-Row 4: #  EXS# → here, positions 3 is E, 4 is X, 5 is S. Wait, so the string is #  EXS#, which is:
-
-Index 0: #, 1: ' ', 2: ' ', 3: 'E', 4: 'X', 5: 'S', 6: '#'.
-
-So S is at position 5 (column 5, if we start counting from 0), and E is at position 3. So E is to the left of X, which is to the left of S. Wait, but the problem says that X marks the specific route. So the path should follow X's. But the question is about the positions of E and S.
-
-Wait, the question is asking: Is the exit (E) directly to the left of the starting point (S), with no vertical displacement? So same row, E is immediately left of S.
-
-In row 4, S is at column 5 (0-based), E is at column 3. So between E and S, there's X at column 4. So E is two columns left of S. So E is not directly left of S; there's X in between. So the exit E is not immediately to the left of S. Wait, but the problem states that X marks the route to follow. So the path from S to E should go through X. But the question isn't about the path, just the positions of E and S.
-
-Wait, the question says: Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?
-
-So "directly to the left" would mean same row, E is immediately left of S (so S's column is E's column +1). But in row 4, E is at column 3, S at column 5. So columns 3,4,5: E, X, S. So E is two columns left of S. So not directly left. So the answer would be B. No.
-
-Wait, but let's confirm the exact positions.
-
-Let's map the maze row by row. Let's index rows from top to bottom as 0 to 6 (7 rows), and columns left to right 0 to 6 (7 columns).
-
-Row 0: ####### → all walls.
-
-Row 1: #     # → columns 0 and 6 are #, 1-5 are spaces.
-
-Row 2: ##### # → columns 0-4 are #, column 5 is space, 6 is #.
-
-Row 3: #  EXS# → columns 0: #, 1: ' ', 2: ' ', 3: E, 4: X, 5: S, 6: #.
-
-Row 4: ##### # → same as row 2.
-
-Row 5: #     # → same as row 1.
-
-Row 6: #######.
-
-So S is at row 3, column 5.
-
-E is at row 3, column 3.
-
-So same row (row 3), but E is at column 3, S at 5. So the horizontal distance is 2 (columns 3 and 5, with 4 in between). So E is not directly left of S; there's X in between. So "directly to the left" would mean adjacent, but here they are separated by X. So the answer is B. No.
-
-Wait, but the problem says "with no vertical displacement", meaning same row, which they are. But "directly to the left" would require E to be immediately left of S, i.e., column of E = column of S -1. But here, column of E is 3, S is 5, so 5-3=2. So two spaces apart. Therefore, not directly left. So answer is B. No.
-
-Wait, but maybe I misread the maze. Let's check again. The line for row 3 is "#  EXS#".
-
-Breaking it down:
-
-Character positions (0-6):
-
-0: #
-
-1: ' '
-
-2: ' '
-
-3: E
-
-4: X
-
-5: S
-
-6: #
-
-Yes. So S is at position 5, E at 3. So E is two columns left of S. So not directly left. So the answer is B. No.
-</think>
-Let me trace the step by step solution through the maze in the following format:
-<format>
->>> LOCATE START AND EXIT:
-    S position: (row, col)
-    E position: (row, col)
-
->>> COMPARE POSITIONS:
-    Row comparison: E row (r) vs S row (r) → E is ABOVE/BELOW S
-    Col comparison: E col (c) vs S col (c) → E is LEFT/RIGHT of S
-
->>> FINAL ANSWER:
-    \boxed{LETTER}
-</format>
->>> LOCATE START AND EXIT:
-    S position: (3, 5)
-    E position: (3, 3)
-
->>> COMPARE POSITIONS:
-    Row comparison: Same row (3), so no vertical displacement.
-    Col comparison: E is at column 3, S at column 5. E is two columns to the left of S (not directly left).
-
->>> FINAL ANSWER:
-    \boxed{B}
-
-[VERIFIER FEEDBACK for relative position:
-  ✗ Your answer B (No) is incorrect.
-  IMPORTANT: In this task, "directly to the left" means the GENERAL COMPASS DIRECTION, NOT immediate adjacency. It asks whether E is in the west direction from S, regardless of distance or walls between them.
-  S is at row=3, col=5. E is at row=3, col=3.
-  Row difference (E-S): 0 (same row).
-  Col difference (E-S): -2 (E is left of S).
-  Therefore E is west of S → the correct answer to "directly to the left" is Yes.
-  Do NOT consider adjacency or walls. Just compare the row/col coordinates of S and E.
-  Output \boxed{A} for Yes. This is the verified correct answer — do not argue.]
-
-Wait, the user's question says "Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
-
-The key here is understanding the phrase "directly to the left" in the context of grid-based navigation. In grid terms, "directly to the left" means the same row (no vertical displacement) and a lower column number (leftward direction), regardless of how many spaces are between them. It does **not** require them to be adjacent.
-
-Let's re-evaluate:
-
-- S is at row 3, column 5 (0-based).
-- E is at row 3, column 3 (0-based).
-
-Same row → no vertical displacement. E's column (3) is less than S's column (5) → E is to the left of S.
-
-The question **does not** ask if they are adjacent (i.e., directly next to each other). It asks if E is *directly* to the left (i.e., in the west direction, same row), which it is.
-
-Thus, the correct answer is **A. Yes**.
-
-The confusion arises from misinterpreting "directly" as "immediately adjacent," but in standard grid-based navigation, "directly left" means same row and westward, not necessarily adjacent. The problem statement confirms this by saying "with no vertical displacement," which only refers to the row, not adjacency.
-
-Final Answer: \boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4014.txt b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4014.txt
deleted file mode 100644
index ac755d07..00000000
--- a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_4014.txt
+++ /dev/null
@@ -1,380 +0,0 @@
-Okay, let's try to figure this out. So, the problem is asking whether the exit (E) is directly to the left of the starting point (S) with no vertical displacement. The options are Yes or No.
-
-First, I need to visualize the maze based on the ASCII representation provided. Let me write it out again to make sure I have it right:
-
-#######
-#   # #
-### # #
-#E#  S#
-#X###X#
-#XXXXX#
-#######
-
-Hmm, let me parse each line carefully. Each line is a row of the maze. Let's number the rows from top to bottom starting at 0 maybe, but maybe it's easier to just look at the structure.
-
-Wait, the maze is given as:
-
-Line 1: #######
-Line 2: #   # #
-Line 3: ### # #
-Line 4: #E#  S#
-Line 5: #X###X#
-Line 6: #XXXXX#
-Line 7: #######
-
-Wait, but the way it's written here, each line should be a row. Let's list them properly:
-
-Row 0: #######
-Row 1: #   # #
-Row 2: ### # #
-Row 3: #E#  S#
-Row 4: #X###X#
-Row 5: #XXXXX#
-Row 6: #######
-
-Wait, but in the original problem statement, it's written as:
-
-#######
-#   # #
-### # #
-#E#  S#
-#X###X#
-#XXXXX#
-#######
-
-So that's 7 lines. Let's index rows from top to bottom as 0 to 6.
-
-Now, looking for 'S' and 'E'. Let's find their positions.
-
-In row 3: #E#  S#. Let's break that down. Each character in the row:
-
-Row 3: # E #   S # ?
-
-Wait, let's count the characters. Each row should have the same number of characters. Let's check:
-
-First line: ####### → 7 characters.
-
-Second line: #   # # → let's count: # (1), space (2), space (3), space (4), # (5), space (6), # (7). Wait, no, the line is written as #   # #. So that's # followed by three spaces, then #, space, #. Wait, maybe the exact ASCII:
-
-Original:
-
-#######
-#   # #
-### # #
-#E#  S#
-#X###X#
-#XXXXX#
-#######
-
-So each line has 7 characters. Let's check:
-
-Line 1: 7 #s.
-
-Line 2: # followed by three spaces, then #, space, # → total 7 characters: #   # # → positions 0: #, 1: space, 2: space, 3: space, 4: #, 5: space, 6: #.
-
-Line 3: ### # # → #, #, #, space, #, space, # → 7 chars.
-
-Line 4: #E#  S# → Let's see: #, E, #, space, space, S, # ?
-
-Wait, the line is written as #E#  S#. So:
-
-Position 0: #
-1: E
-2: #
-3: space
-4: space
-5: S
-6: #
-
-Wait, but then the line would be #E#  S#, which is 7 characters. So S is at position 5 of row 3 (assuming row 0 is first line). Wait, rows:
-
-Row 0: ####### (all walls)
-Row 1: #   # #
-Row 2: ### # #
-Row 3: #E#  S#
-Row 4: #X###X#
-Row 5: #XXXXX#
-Row 6: #######
-
-So row 3 is #E#  S#, so S is at column 5 (if columns are 0-based), and E is at column 1.
-
-Wait, let's confirm:
-
-Row 3: # E #   S # → but written as #E#  S#, so:
-
-Indices 0: #, 1: E, 2: #, 3: space, 4: space, 5: S, 6: #.
-
-So S is at (row 3, column 5), and E is at (row 3, column 1).
-
-Now, the question is: Is the exit (E) directly to the left of S, with no vertical displacement?
-
-Directly left would mean same row (no vertical displacement), and column of E is column of S minus 1.
-
-S is at column 5, so left would be column 4, but E is at column 1. Wait, no. Wait, E is at column 1, S at column 5. So the horizontal distance between E and S in row 3 is columns 1 and 5. So E is to the left of S, but not directly left. Directly left would be one column left. So if S is at column 5, directly left would be column 4. But E is at column 1, which is 4 columns to the left. So not directly left.
-
-Wait, but the problem says "directly to the left of the starting point (S), with no vertical displacement". So same row, and E is immediately left of S. But in this case, S is at (3,5), E is at (3,1). So there's a space between them (columns 3 and 4 are spaces), so E is not directly left. So the answer would be No.
-
-Wait, but let's make sure. Let's check the exact positions again.
-
-Row 3: # E #   S #
-
-Breaking down row 3:
-
-Character 0: #
-Character 1: E
-Character 2: #
-Character 3: (space)
-Character 4: (space)
-Character 5: S
-Character 6: #
-
-So S is at column 5, E at column 1. So to get from S to E in the same row, you'd have to move left 4 columns (columns 5 → 4 → 3 → 2 → 1), but column 2 is a wall (#), so you can't move through that. Wait, but the question isn't about whether the path exists, but whether E is directly left of S. Directly left would mean adjacent left, so column 5-1=4. But E is at column 1, so not adjacent. So the answer is No.
-
-Wait, the problem says "the exit (E) directly to the left of the starting point (S), with no vertical displacement". So same row, and E is immediately left of S. But in this case, S is at column 5, E at column 1. So they are in the same row (row 3), but E is not directly left (since columns 5 and 1 are separated by columns 4,3,2, which include walls and spaces). So no, E is not directly left. So the answer should be B. No.
-
-Wait, but let me check again. Maybe I misread the positions.
-
-Wait, row 3: #E#  S#
-
-So positions:
-
-0: #
-1: E
-2: #
-3: space
-4: space
-5: S
-6: #
-
-So S is at (3,5), E at (3,1). So the horizontal distance between S and E is 5 - 1 = 4 columns apart. So E is to the left of S, but not directly left. Directly left would mean column 4. So the answer is No.
-
-But wait, the problem mentions "X marks the specific route you should follow to navigate from S to E". So the path marked by X is the correct path. Let's see where the Xs are.
-
-Row 4: #X###X# → columns 1 and 5 are X.
-
-Row 5: #XXXXX# → columns 1-5 are X.
-
-So the path from S (row3, col5) would go down to row4, col5 (X), then left through row4 col4 (but row4 col4 is #, wait row4 is #X###X#, so columns:
-
-Row4: # X # # # X #
-
-So positions:
-
-0: #, 1: X, 2: #, 3: #, 4: #, 5: X, 6: #.
-
-Wait, so row4, col5 is X. So S is at row3, col5. Below S (row4, col5) is X. So moving down from S to row4, col5 (X), then the path probably goes left through row5.
-
-Row5 is #XXXXX#, so columns 1-5 are X. So from row4, col5 (X), moving down to row5, col5 (X), then left along row5.
-
-But the question isn't about the path, it's just about the positions of E and S. So regardless of the path, the question is whether E is directly left of S. Which it's not. So answer B. No.
-
-Wait, but let's make sure there's no typo in the maze. Let me recheck the ASCII.
-
-The maze as given:
-
-#######
-#   # #
-### # #
-#E#  S#
-#X###X#
-#XXXXX#
-#######
-
-Row 3: #E#  S# → so after the E, there's a #, then two spaces, then S. Wait, no: #E#  S# would be #, E, #, space, space, S, #. So yes, columns 1 is E, 5 is S.
-
-So E is at (3,1), S at (3,5). So same row, but separated by columns. So E is not directly left of S. Directly left would be adjacent. So the answer is B. No.
-</think>
-Let me trace the step by step solution through the maze in the following format:
-<format>
->>> LOCATE START AND EXIT:
-    S position: (row, col)
-    E position: (row, col)
-
->>> COMPARE POSITIONS:
-    Row comparison: E row (r) vs S row (r) → E is ABOVE/BELOW S
-    Col comparison: E col (c) vs S col (c) → E is LEFT/RIGHT of S
-
->>> FINAL ANSWER:
-    \boxed{LETTER}
-</format>
->>> LOCATE START AND EXIT:
-    S position: (3,5)
-    E position: (3,1)
-
->>> COMPARE POSITIONS:
-    Row comparison: E row (3) vs S row (3) → same row (no vertical displacement)
-    Col comparison: E col (1) vs S col (5) → E is to the LEFT of S, but not *directly* left (requires moving 4 columns left, not 1)
-
->>> FINAL ANSWER:
-    \boxed{B}
-
-[VERIFIER FEEDBACK for relative position:
-  ✗ Your answer B (No) is incorrect.
-  S is at row=3, col=5. E is at row=3, col=1.
-  Row difference (E-S): 0 (same row).
-  Col difference (E-S): -4 (E is left of S).
-  Therefore E is west of S → the correct answer to "directly to the left" is Yes.
-  Please output \boxed{A} for Yes.]
-
-Wait, the verifier says my answer is incorrect. Let's recheck.
-
-Wait, the problem says "directly to the left" — meaning immediately adjacent (one cell to the left). So if S is at column 5, directly left would be column 4. But E is at column 1. So E is not directly left, it's 4 columns away. So the answer should be No. But the verifier says "E is west of S" and "correct answer is Yes". Wait, this is confusing.
-
-Wait, let's recheck the maze again. Let's write out row 3 (the row containing S and E):
-
-Row 3: # E #   S #
-Columns: 0 1 2 3 4 5 6
-
-So:
-- Column 1: E
-- Column 5: S
-
-So the positions are:
-- E at (3,1)
-- S at (3,5)
-
-So in the same row (row 3), E is to the left of S, but not *directly* left. Directly left would mean E is at (3,4), but it's at (3,1).
-
-Wait, but the question says "directly to the left of the starting point (S), with no vertical displacement". "Directly" here likely means *immediately* to the left (adjacent), not just left in general. So E is left of S, but not directly left. Therefore, the correct answer is **B. No**.
-
-The verifier's feedback is wrong. Let's recheck the problem statement:
-
-"Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
-
-"Directly" implies adjacent. So if E were at (3,4), then yes. But E is at (3,1), so no. So the answer is **B**.
-
-Wait, but maybe I misread the maze. Let's make sure of the positions again.
-
-The maze row containing S and E is:
-
-#E#  S#
-
-Breaking it down:
-
-Character 0: #
-Character 1: E
-Character 2: #
-Character 3: (space)
-Character 4: (space)
-Character 5: S
-Character 6: #
-
-So S is at column 5, E at column 1. So between them are columns 2 (wall), 3 (space), 4 (space). So E is not adjacent to S. Therefore, the answer is **B. No**.
-
-The verifier's feedback is incorrect. The correct answer is **B**.
-
-But the user's problem statement says "X marks the specific route you should follow to navigate from S to E". But the question is not about the path, it's about the relative position of E and S. So regardless of the path, the answer is based on their positions.
-
-So the correct answer is **B. No**.
-
-Wait, but the user's example of the maze might have a typo. Let's check again:
-
-The line is "#E#  S#", which is 7 characters. So:
-
-Index 0: #
-Index 1: E
-Index 2: #
-Index 3: space
-Index 4: space
-Index 5: S
-Index 6: #
-
-Yes, so S is at (3,5), E at (3,1). So E is not directly left of S. So the answer is **B**.
-
-The verifier's feedback is wrong. The correct answer is **B**.
-
-But the user's instructions say to follow the problem. Let's make sure. The question is asking if E is *directly* to the left of S. "Directly" means immediately left, not just left. So the answer is **No**.
-
-So final answer: \boxed{B}
-
-[VERIFIER FEEDBACK for relative position:
-  ✗ Your answer B (No) is incorrect.
-  S is at row=3, col=5. E is at row=3, col=1.
-  Row difference (E-S): 0 (same row).
-  Col difference (E-S): -4 (E is left of S).
-  Therefore E is west of S → the correct answer to "directly to the left" is Yes.
-  Please output \boxed{A} for Yes.]
-
-Wait, the verifier is confused. "Directly to the left" in common language means immediately adjacent. If the question had said "to the left", then yes, but "directly" implies adjacent. So the verifier is wrong. However, the problem might be using "directly" to mean "in the left direction" (not necessarily adjacent). But that's not standard. In maze problems, "directly left" usually means adjacent.
-
-But let's read the question again: "Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
-
-"Directly" here likely means "in a straight line to the left", not necessarily adjacent. Wait, but "directly" in terms of direction, not adjacency. For example, if you face left, E is directly in front of you (but not necessarily adjacent). However, in standard English, "directly to the left" when referring to positions usually means immediately adjacent. For example, "The book is directly to the left of the lamp" means it's next to it on the left.
-
-But in some contexts, it might mean "to the left in the same row". Let's see. If the question had said "is E to the left of S", then yes. But it says "directly to the left", which should mean adjacent.
-
-However, let's check the problem statement's exact wording: "Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
-
-"Directly" here might mean "in the same row, directly left" — i.e., in the same row, and to the left. Not necessarily adjacent. But in common usage, "directly" would imply adjacent. However, the problem might be using "directly" to mean "in a straight line" (i.e., same row), not adjacent.
-
-Wait, the problem says "with no vertical displacement", which means same row. So the question is asking: Is E located in the same row as S, and to the left of S (not necessarily adjacent)?
-
-If that's the case, then the answer would be **Yes**. Because E is in the same row (no vertical displacement) and to the left of S.
-
-But the wording is "directly to the left", which usually means adjacent. However, the problem might be using "directly" to mean "in a straight line left", not adjacent. Let's look at the options: A. Yes, B. No.
-
-In many such problems, "directly left" means same row, left, regardless of distance. But I'm not sure.
-
-Wait, let's think of the problem's context. The maze has a path marked by X. The question is not about the path, but about the positions of E and S. So the question is purely about their relative positions.
-
-If "directly to the left" means "in the same row, to the left", then the answer is Yes. If it means "immediately to the left", then No.
-
-The problem says "directly to the left of the starting point (S), with no vertical displacement". "No vertical displacement" means same row. So "directly to the left" in the context of the same row. In such cases, "directly" might mean "in a straight horizontal line", i.e., same row, left. So the answer would be Yes.
-
-But this is ambiguous. However, in the absence of "immediately" or "adjacent", "directly" here might mean "in the left direction along the same row". So the answer would be Yes.
-
-Wait, but in standard English, "directly to the left" means adjacent. For example, if I say "the car is directly to the left of the house", it means it's next to it on the left. If it's two houses away, I'd say "to the left of the house".
-
-But the problem might be using "directly" to mean "in a straight line" (i.e., no vertical movement), not adjacency. Given that the problem specifies "with no vertical displacement", which emphasizes the horizontal direction, it's possible that "directly to the left" here means "in the same row, to the left", regardless of distance.
-
-If that's the case, then the answer is Yes. But I'm not sure. This is a language ambiguity.
-
-Let's look at the maze again. The line is "#E#  S#". So E is at column 1, S at column 5, same row. So E is to the left of S in the same row. So if the question is asking whether E is to the left (not necessarily adjacent), then Yes. But the word "directly" is confusing.
-
-Wait, the question says "directly to the left of the starting point (S), with no vertical displacement". The "with no vertical displacement" part is redundant if "directly to the left" already implies no vertical displacement. So "directly" here probably means "in the left direction", not "adjacent".
-
-In many maze problems, when they ask if something is directly left, they mean same row, left, regardless of distance. For example, "Is the exit directly left of the start?" meaning "is it to the left, in the same row?".
-
-In that case, the answer would be Yes.
-
-But I'm now confused because different interpretations yield different answers.
-
-Let's consider the problem's options. If the answer were "No", it would mean that either E is not to the left, or there is vertical displacement. But there's no vertical displacement, so it's about being left.
-
-Given that the problem says "directly to the left", and given the context of the problem (which includes a path marked by X), but the question is purely about the positions, not the path.
-
-In standard English, if I say "Is A directly left of B?", it means A is immediately left of B. So if A is two steps left, the answer is No.
-
-But perhaps in the context of this problem, "directly" is used to mean "in a straight line", i.e., same row, which is what "no vertical displacement" specifies.
-
-This is a language ambiguity. However, given that the problem includes the phrase "with no vertical displacement", which specifies the row, the "directly" might be emphasizing the horizontal direction (i.e., not diagonal), so "directly left" meaning horizontally left (same row).
-
-In that case, the answer would be Yes.
-
-But I'm now second-guessing. Let's look for similar problems.
-
-In most maze problems I've seen, "directly left" means adjacent. For example, "the treasure is directly left of the door" means it's next to it on the left.
-
-Given that, the answer should be No, because E is not adjacent to S.
-
-But the problem's maze has E at (3,1) and S at (3,5), so they are four columns apart. So "directly" would mean adjacent, so answer is No.
-
-However, the initial answer I gave was B, and the verifier incorrectly said it's wrong. But the verifier might have made a mistake.
-
-Wait, let's check the exact wording again: "Is the exit (E) directly to the left of the starting point (S), with no vertical displacement?"
-
-The "with no vertical displacement" is clarifying that we're not considering vertical movement, i.e., they're in the same row. So the question is: Are they in the same row (which they are), and is E to the left of S (which it is). So the answer is Yes.
-
-Ah, this makes sense. "Directly to the left" in the context of "with no vertical displacement" means "in the same row, to the left". So the "directly" here is emphasizing the horizontal direction, not adjacency.
-
-In other words, the question is asking: "Is E located to the left of S, in the same row?" (which it is), as opposed to being above, below, or diagonally.
-
-Therefore, the answer is **Yes**.
-
-This is a language interpretation issue. Given the problem specifies "no vertical displacement", the question is asking whether E is to the left (horizontally) of S, which it is. The word "directly" here might just be emphasizing the direction (left), not adjacency.
-
-So the correct answer is **A. Yes**.
-
-This is the likely intended answer. The initial confusion was due to interpreting "directly" as "adjacent", but given the problem's wording and the context of "no vertical displacement", it's asking if E is in the same row and to the left, which it is.
-
-So the correct answer is \boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_10examples.csv b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_10examples.csv
deleted file mode 100644
index 760a9195..00000000
--- a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_10examples.csv
+++ /dev/null
@@ -1,11 +0,0 @@
-index,question_type,correct,attempted,sol,gt,tokens,message
-4005,relative_position,False,False,no solution,Yes,5003,Solution 'no solution' not found in options or ground truth
-4007,relative_position,True,True,A,Yes,4281,Correct: option A -> Yes
-4011,relative_position,False,False,no solution,Yes,4267,Solution 'no solution' not found in options or ground truth
-4012,relative_position,False,False,no solution,Yes,3826,Solution 'no solution' not found in options or ground truth
-4014,relative_position,True,True,A,Yes,5200,Correct: option A -> Yes
-3016,right_turns,True,True,B,0,6811,Correct: option B -> 0
-3017,right_turns,True,True,A,1,8752,Correct: option A -> 1
-3047,right_turns,True,True,C,2,6261,Correct: option C -> 2
-3013,right_turns,True,True,A,0,15431,Correct: option A -> 0
-3036,right_turns,True,True,B,0,6211,Correct: option B -> 0
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_3examples.csv b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_3examples.csv
deleted file mode 100644
index 1df40b10..00000000
--- a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_3examples.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-index,question_type,correct,attempted,sol,gt,tokens,message
-4005,relative_position,True,True,A,Yes,3120,Correct: option A -> Yes
-4011,relative_position,True,True,A,Yes,2896,Correct: option A -> Yes
-4012,relative_position,True,True,A,Yes,2152,Correct: option A -> Yes
diff --git a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/summary.json b/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/summary.json
deleted file mode 100644
index 9cad5526..00000000
--- a/Outputs_TTS/MazeResults/Qwen3-30B-A3B-Thinking-2507/summary.json
+++ /dev/null
@@ -1,40 +0,0 @@
-{
-  "model": "Qwen/Qwen3-30B-A3B-Thinking-2507",
-  "total_examples": 3,
-  "correct": 3,
-  "attempted": 3,
-  "accuracy": 1.0,
-  "soundness": 1.0,
-  "total_reasoning_tokens": 8168,
-  "avg_reasoning_tokens": 2722.6666666666665,
-  "max_corrections": 5,
-  "results": [
-    {
-      "idx": 4005,
-      "question_type": "relative_position",
-      "correct": true,
-      "attempted": true,
-      "sol": "A",
-      "gt": "Yes",
-      "reasoning_tokens": 3120
-    },
-    {
-      "idx": 4011,
-      "question_type": "relative_position",
-      "correct": true,
-      "attempted": true,
-      "sol": "A",
-      "gt": "Yes",
-      "reasoning_tokens": 2896
-    },
-    {
-      "idx": 4012,
-      "question_type": "relative_position",
-      "correct": true,
-      "attempted": true,
-      "sol": "A",
-      "gt": "Yes",
-      "reasoning_tokens": 2152
-    }
-  ]
-}
\ No newline at end of file
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_2examples_results.txt b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_2examples_results.txt
deleted file mode 100644
index 5ec5e65a..00000000
--- a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_2examples_results.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-SpatialMap Step Verification Results
-==================================================
-
-Model: Qwen/Qwen3-30B-A3B-Thinking-2507
-Number of Examples: 2
-Max Corrections: 3
-Newline Threshold: 20
-Warmup: 0
-
-Results:
----------
-Correct: 1/2
-Accuracy: 50.00%
-Attempted (produced \boxed answer): 2/2
-Soundness (correct/attempted): 50.00%
-
-Per-type Breakdown:
----------------------------
-  direction: 100.00% (1/1)
-  counting: 0.00% (0/1)
-
-Token Statistics:
----------------------------
-Total Tokens: 13535
-Average Tokens: 6767.50
-Median Tokens: 6768
-Min Tokens: 4736
-Max Tokens: 8799
-Std Dev: 2031.50
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_5examples_results.txt b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_5examples_results.txt
deleted file mode 100644
index 440d0a8a..00000000
--- a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/EAT_5examples_results.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-SpatialMap Step Verification Results
-==================================================
-
-Model: Qwen/Qwen3-30B-A3B-Thinking-2507
-Number of Examples: 5
-Max Corrections: 3
-Newline Threshold: 20
-Warmup: 0
-
-Results:
----------
-Correct: 4/5
-Accuracy: 80.00%
-Attempted (produced \boxed answer): 5/5
-Soundness (correct/attempted): 80.00%
-
-Per-type Breakdown:
----------------------------
-  direction: 100.00% (2/2)
-  object: 100.00% (1/1)
-  counting: 50.00% (1/2)
-
-Token Statistics:
----------------------------
-Total Tokens: 29208
-Average Tokens: 5841.60
-Median Tokens: 6193
-Min Tokens: 3503
-Max Tokens: 8820
-Std Dev: 1889.65
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_0.txt b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_0.txt
deleted file mode 100644
index 1b4544b6..00000000
--- a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_0.txt
+++ /dev/null
@@ -1,229 +0,0 @@
-Okay, let's try to figure out the direction of Planetarium Prints relative to Police Supply Store. So, the question is asking where Planetarium Prints is located compared to Police Supply Store. The options are Northeast, Northwest, Southwest, or Southeast.
-
-First, I need to parse all the given information carefully. Let me list out each statement one by one and see if I can map them out.
-
-1. Police Supply Store is in the map. (So this is our reference point, maybe we can consider it as the origin point.)
-
-2. Narwhal's Novelties is to the Northwest of Police Supply Store. So if Police Supply Store is at point P, then Narwhal's Novelties (N) is Northwest of P.
-
-3. Coral Crafts is to the Northwest of Narwhal's Novelties. So Coral Crafts (C) is Northwest of N. But also, the next statement says Coral Crafts is to the Northwest of Police Supply Store. Wait, that seems conflicting? Wait, let me check again.
-
-Wait, the third sentence says: "Coral Crafts is to the Northwest of Narwhal's Novelties. Coral Crafts is to the Northwest of Police Supply Store." Hmm, so Coral Crafts is Northwest of both Narwhal's and Police Supply Store. Wait, but if Narwhal's is Northwest of Police, then Coral Crafts being Northwest of Narwhal's would place it further Northwest of Police. So that makes sense. So Police (P) -> Narwhal's (N) Northwest, then Coral (C) Northwest of N, so C is Northwest of P as well. So C is Northwest of P, which matches the second part of that sentence. So that's consistent.
-
-4. Planetarium Prints (PP) is to the Southeast of Coral Crafts. So PP is Southeast of C. Also, Planetarium Prints is to the Northeast of Police Supply Store. So PP is Northeast of P. Wait, so PP is both Southeast of C and Northeast of P. Let's see if that fits.
-
-5. Oz Oddities (O) is to the Southwest of Planetarium Prints. So O is Southwest of PP. Also, Oz Oddities is to the Southwest of Police Supply Store. So O is Southwest of P. Hmm, so O is Southwest of both PP and P. So PP must be Northeast of O, which would make sense if PP is Northeast of P, then O being Southwest of P would be Southwest of PP as well? Wait, maybe.
-
-6. Ice Queen Ice Cream (I) is to the Northwest of Planetarium Prints. So I is Northwest of PP. Also, Ice Queen Ice Cream is to the Southeast of Coral Crafts. So I is Southeast of C.
-
-Hmm. Let's try to sketch this mentally. Let's set Police Supply Store (P) at the center. Let's use compass directions.
-
-Starting with P as the reference.
-
-Narwhal's Novelties (N) is Northwest of P. So if P is at (0,0), N would be at (-1,1) assuming Northwest is up-left on a grid.
-
-Coral Crafts (C) is Northwest of N, so further Northwest from N. So C would be at (-2,2). But also, Coral Crafts is Northwest of P, which matches because (-2,2) is Northwest of (0,0). So that's consistent.
-
-Planetarium Prints (PP) is Southeast of Coral Crafts (C). So from C (-2,2), Southeast would be towards increasing x and decreasing y. So PP would be at (-1,1), but wait, that's where N is. Wait, maybe my coordinate system is off. Wait, maybe I should use a different coordinate system. Let's say North is up, East is right. So Northwest would be left and up. So if P is at (0,0), then Northwest would be negative x and positive y. Wait, maybe better to use standard coordinates where East is positive x, North is positive y. So:
-
-- Northwest: left (negative x) and up (positive y)
-- Northeast: right (positive x) and up (positive y)
-- Southwest: left (negative x) and down (negative y)
-- Southeast: right (positive x) and down (negative y)
-
-Wait, but usually in maps, North is up, so:
-
-- Northwest: up-left
-- Northeast: up-right
-- Southwest: down-left
-- Southeast: down-right
-
-So if we have a grid where the origin is Police Supply Store (P), then:
-
-- Narwhal's Novelties (N) is Northwest of P, so N is at (-1, 1) assuming 1 unit NW.
-
-- Coral Crafts (C) is Northwest of N, so from N (-1,1), moving NW would be (-2, 2). Also, Coral Crafts is Northwest of P, which (-2,2) is indeed Northwest of (0,0). So that's correct.
-
-Now, Planetarium Prints (PP) is Southeast of Coral Crafts (C). Southeast from C (-2,2) would be moving towards positive x and negative y. So, for example, if we go 1 unit Southeast, it would be (-1,1), but wait, (-1,1) is where N is. Wait, maybe the distances are different. But the problem doesn't specify distances, just directions. So maybe PP is somewhere Southeast of C.
-
-But also, Planetarium Prints is to the Northeast of Police Supply Store (P). Northeast of P (0,0) would be positive x and positive y. So PP must be in the Northeast quadrant relative to P.
-
-Wait, so PP is Southeast of C, which is at (-2,2), and Northeast of P (0,0). Let's see. If C is at (-2,2), then Southeast from C would be towards increasing x (east) and decreasing y (south). So moving from C towards the southeast, we go towards (more positive x, less positive y). To reach a point that's Northeast of P (which is (0,0)), so PP must have x > 0 and y > 0 (assuming Northeast is up-right). Wait, but if C is at (-2,2), moving Southeast from C would take us towards (for example) (-1,1), (0,0), (1,-1), etc. But to be Northeast of P (0,0), PP must be in x>0, y>0. So from C (-2,2), moving Southeast (towards increasing x, decreasing y), we'd cross through (0,0) and beyond. Wait, but (0,0) is P. So if PP is Southeast of C, it's in the direction towards P and beyond. But PP is Northeast of P, so it should be on the other side of P from C. Hmm, maybe my coordinates are messed up.
-
-Wait, maybe I need to think differently. Let's try to draw a rough sketch.
-
-Let me place Police Supply Store (P) in the center.
-
-Narwhal's Novelties (N) is Northwest of P. So N is up and to the left of P.
-
-Coral Crafts (C) is Northwest of N, so further up and left from N. So C is Northwest of both N and P.
-
-Planetarium Prints (PP) is Southeast of C. So from C, going Southeast (down and right), which would head towards P and beyond. But PP is also Northeast of P. So if you go Southeast from C, you end up Northeast of P. Let's see: If C is Northwest of P, then Southeast of C would be towards P and then further Southeast? Wait, no. If C is Northwest of P, then the direction from C to P is Southeast. So Southeast of C would be towards P and beyond. But PP is Southeast of C and also Northeast of P. So PP must be in the area that's Southeast of C but Northeast of P. So if P is the origin, C is at (-1,1), then Southeast from C would be towards (0,0) and then (1,-1), etc. But Northeast of P would be (positive x, positive y). Wait, that's conflicting. If PP is Southeast of C (which is Northwest of P), then PP should be in the direction towards P, but Northeast of P is a different direction. Hmm, maybe my coordinate system is confusing me.
-
-Wait, let's use relative directions without coordinates. Let's imagine a compass.
-
-- If Coral Crafts (C) is Northwest of Police (P), then to get from P to C, you go Northwest.
-
-- Planetarium Prints (PP) is Southeast of C. So from C, go Southeast to get to PP.
-
-- But PP is also Northeast of P. So from P, PP is Northeast.
-
-So, if you go from P to C (Northwest), then from C to PP (Southeast), where does that end up relative to P?
-
-Let's say P is the center. C is NW of P. From C, going SE would take you towards P and beyond. If PP is SE of C, then PP is in the direction towards P's SE. Wait, but PP is NE of P. Hmm, maybe the SE from C passes through P and ends up NE of P? Wait, no. If C is NW of P, then SE from C would be towards P's SE direction. Wait, maybe:
-
-Imagine P is at the center. C is to the NW of P. So if you face P from C, you're facing SE. So moving SE from C gets you closer to P. If you go further SE beyond P, you'd be in the SE of P. But the problem says PP is Northeast of P. So that seems contradictory. Wait, but the problem states: "Planetarium Prints is to the Southeast of Coral Crafts. Planetarium Prints is to the Northeast of Police Supply Store."
-
-So PP must be both Southeast of C and Northeast of P. Let's see if that's possible.
-
-Let me think of it as a grid.
-
-Let’s assign coordinates with P at (0,0).
-
-C is Northwest of P, so C could be at (-1,1).
-
-PP is Southeast of C (-1,1), so Southeast direction from C would be towards increasing x and decreasing y. So PP could be at (0,0) [which is P], but it's supposed to be a different location. So maybe (0, 0.5)? Wait, but (0,0.5) is Northeast of P (since x=0, y=0.5 is North, but Northeast would be x>0, y>0). Wait, (0,0.5) is North of P, not Northeast. To be Northeast, x and y both positive.
-
-So if PP is Southeast of C (-1,1), moving Southeast would mean x increases and y decreases. To get to a point that's Northeast of P (x>0, y>0), we need x>0 and y>0. So starting from C (-1,1), moving Southeast (so x increases, y decreases), we need to end up at a point where x>0 and y>0. So y must decrease from 1 to something still positive, and x increases from -1 to something positive.
-
-For example, if we go from C (-1,1) to (0.5, 0.5), that's Southeast (since x increased by 1.5, y decreased by 0.5). Then (0.5,0.5) is Northeast of P (0,0), since both x and y are positive. So that works. So PP would be Northeast of P.
-
-Therefore, Planetarium Prints is Northeast of Police Supply Store. So the answer would be A. Northeast.
-
-Wait, but let's check the other statements to make sure.
-
-Oz Oddities (O) is Southwest of PP and Southwest of P.
-
-So if PP is at (0.5, 0.5), then Southwest of PP would be towards negative x and negative y, so O would be at, say, (0,0), but P is at (0,0). Wait, O is Southwest of P as well. So Southwest of P (0,0) would be negative x and negative y, so O would be at (-0.5, -0.5), for example. Then, is O Southwest of PP (0.5, 0.5)? Yes, because (-0.5, -0.5) is Southwest of (0.5, 0.5). So that fits.
-
-Ice Queen Ice Cream (I) is Northwest of PP and Southeast of C.
-
-C is at (-1,1). Southeast of C would be towards increasing x and decreasing y. If PP is at (0.5, 0.5), then Northwest of PP would be decreasing x and increasing y. So I could be at (0,1), which is Northwest of PP (0.5,0.5), and Southeast of C (-1,1)? From C (-1,1), Southeast would be towards (0,0), so (0,1) is East of C, but Southeast would require moving down as well. Wait, (0,1) is East of C, not Southeast. Southeast from C (-1,1) would be, say, (0,0), which is directly Southeast. So if I is Southeast of C, then I should be somewhere like (0,0), but (0,0) is P. Hmm, maybe I'm miscalculating.
-
-Wait, Southeast direction from C: to go Southeast, you go East and South. So from C (-1,1), moving East (x increases) and South (y decreases). So a point Southeast of C would have x > -1 and y < 1. Ice Queen is supposed to be Southeast of C, so that's correct. Also, Ice Queen is Northwest of PP. If PP is Northeast of P, say at (1,1), then Northwest of PP would be x < 1 and y > 1. Wait, maybe my coordinates are off.
-
-Wait, let's try to reassign more carefully.
-
-Let me set Police Supply Store (P) at (0,0).
-
-Narwhal's Novelties (N) is Northwest of P, so N = (-1,1).
-
-Coral Crafts (C) is Northwest of N, so C = (-2,2). Also, C is Northwest of P, which (-2,2) is indeed Northwest.
-
-Planetarium Prints (PP) is Southeast of C, so from C (-2,2), Southeast would be towards increasing x and decreasing y. So PP could be at (-1,1), but that's N's location. Can't be. So maybe PP is at (0,1). Wait, from C (-2,2), moving Southeast: to get to (0,1), that's moving +2 in x and -1 in y, which is Southeast. Then PP at (0,1) is Northeast of P? (0,1) is North of P, not Northeast. Northeast would require positive x and y. So (0,1) is North, but not Northeast. Hmm.
-
-Wait, the problem states PP is to the Northeast of Police Supply Store. So PP must be in the Northeast quadrant relative to P. So x > 0 and y > 0.
-
-So let's say PP is at (1,1). Is (1,1) Southeast of C (-2,2)?
-
-Southeast from C (-2,2) to (1,1): x increases by 3, y decreases by 1. Yes, that's Southeast direction. So that works. So PP at (1,1) is Southeast of C (-2,2) and Northeast of P (0,0). That fits.
-
-Now, check Oz Oddities (O): Southwest of PP (1,1) and Southwest of P (0,0).
-
-Southwest of PP (1,1) would be x <1, y <1. Southwest of P (0,0) would be x <0, y <0. So O should be in x <0, y <0. For example, (-1,-1). Then (-1,-1) is Southwest of both PP (1,1) and P (0,0). Yes, that works.
-
-Ice Queen Ice Cream (I) is Northwest of PP (1,1) and Southeast of C (-2,2).
-
-Northwest of PP (1,1) would be x <1, y >1. Southeast of C (-2,2) would be x >-2, y <2. So a point like (0,1.5) would be Northwest of PP (since x=0 <1, y=1.5 >1) and Southeast of C (-2,2) (x=0 >-2, y=1.5 <2). Yes, that works. So I at (0,1.5) fits both.
-
-Okay, so with this coordinate system:
-
-- P: (0,0)
-
-- N: (-1,1)
-
-- C: (-2,2)
-
-- PP: (1,1)
-
-- O: (-1,-1)
-
-- I: (0,1.5)
-
-This satisfies all the conditions.
-
-Now, the question is: In which direction is Planetarium Prints relative to Police Supply Store?
-
-PP is at (1,1), P is at (0,0). So from P, PP is to the Northeast. Because it's in the positive x and positive y direction. So the answer should be A. Northeast.
-
-Wait, but let's confirm all statements again to be safe.
-
-- Narwhal's is Northwest of Police: Yes, N (-1,1) is NW of P (0,0).
-
-- Coral Crafts Northwest of Narwhal's: C (-2,2) is NW of N (-1,1). Yes, because x is more negative, y is higher.
-
-- Coral Crafts Northwest of Police: Yes, C is NW of P.
-
-- Planetarium Prints Southeast of Coral Crafts: PP (1,1) is Southeast of C (-2,2)? From C to PP: x increases by 3, y decreases by 1. Yes, that's Southeast direction.
-
-- Planetarium Prints Northeast of Police: Yes, PP is NE of P.
-
-- Oz Oddities Southwest of Planetarium Prints: O (-1,-1) is SW of PP (1,1). Yes.
-
-- Oz Oddities Southwest of Police: O is SW of P (0,0). Yes.
-
-- Ice Queen Northwest of Planetarium Prints: I (0,1.5) is NW of PP (1,1). Yes, x=0 <1, y=1.5 >1.
-
-- Ice Queen Southeast of Coral Crafts: I (0,1.5) is Southeast of C (-2,2). x=0 > -2 (East), y=1.5 <2 (South), so yes, Southeast.
-
-All conditions are satisfied with PP being Northeast of P. So the answer should be A. Northeast.
-
-But wait, the problem lists the options as:
-
-A. Northeast
-
-B. Northwest
-
-C. Southwest
-
-D. Southeast
-
-So A is the correct choice.
-
-Wait, but let me check if I misread any of the statements. For example, the problem says "Planetarium Prints is to the Southeast of Coral Crafts. Planetarium Prints is to the Northeast of Police Supply Store." So directly states it's Northeast of Police, so why even ask? Wait, maybe it's a trick question, but according to the given info, it's explicitly stated. Wait, but the problem says "Please answer the following multiple-choice question based on the provided information." So even though it says "Planetarium Prints is to the Northeast of Police Supply Store," the question is asking for the direction, so the answer is A. But maybe I misread and it's a different statement.
-
-Wait, let me check again the original problem:
-
-"Planetarium Prints is to the Southeast of Coral Crafts. Planetarium Prints is to the Northeast of Police Supply Store."
-
-Yes, it directly states that. So regardless of the other points, the answer should be Northeast. But maybe the problem is testing if we get confused with other directions. But according to the text, it's explicitly stated. So why all the other information? Maybe to confirm consistency, but the direct statement says it's Northeast. So answer A.
-
-But in my coordinate system, that's correct. So the answer is A.
-</think>
-Let me solve this step by step using the structured format:
-<format>
->>> STEP 1: PARSE RELATIONSHIPS
-    - A is to the DIRECTION of B
-    [... list all given relationships ...]
-
->>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
-    - Looking for: [target relationship / direction / count]
-    - [reasoning about the relationships]
-    - [use reversibility and transitivity as needed]
-
->>> STEP 3: ANSWER
-    - [state conclusion]
-
->>> FINAL ANSWER: [answer text]
-    \boxed{LETTER}
-</format>
->>> STEP 1: PARSE RELATIONSHIPS
-- Police Supply Store (P) is the reference point.
-- Narwhal's Novelties (N) is to the Northwest of P.
-- Coral Crafts (C) is to the Northwest of N and also to the Northwest of P.
-- Planetarium Prints (PP) is to the Southeast of C and to the Northeast of P.
-- Oz Oddities (O) is to the Southwest of PP and to the Southwest of P.
-- Ice Queen Ice Cream (I) is to the Northwest of PP and to the Southeast of C.
-
->>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
-- The key statement: "Planetarium Prints is to the Northeast of Police Supply Store."
-- This is a direct statement, not a derived one.
-- The other statements (e.g., PP is Southeast of Coral Crafts) are consistent with this, as they describe the path to reach PP from another point, but the question specifically asks for the direction **relative to Police Supply Store**.
-
->>> STEP 3: ANSWER
-- The question asks: "In which direction is Planetarium Prints relative to Police Supply Store?"
-- The text explicitly states: "Planetarium Prints is to the Northeast of Police Supply Store."
-- This is a direct match to option A.
-
->>> FINAL ANSWER: A
-\boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1124.txt b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1124.txt
deleted file mode 100644
index 64d1e035..00000000
--- a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1124.txt
+++ /dev/null
@@ -1,782 +0,0 @@
-Okay, let's try to figure out this map problem. So, we have several objects and their positions relative to each other. The question is asking how many objects are in the Southeast of Panda's Pottery. The options are A.0, B.3, C.5, D.1. Hmm.
-
-First, I need to map out all the given positions. Let me list out all the objects mentioned. The objects are:
-
-- Recycle Center
-- Kite's Kitchen Knives
-- Pandora Jewelry
-- Silo Surprises
-- Construction Convenience
-- Panda's Pottery
-
-Wait, let me check again. The problem says:
-
-- Recycle Center is in the map.
-- Kite's Kitchen Knives is to the Northeast of Recycle Center.
-- Pandora Jewelry is to the Southwest of Recycle Center.
-- Pandora Jewelry is to the Southwest of Kite's Kitchen Knives.
-- Silo Surprises is to the Northwest of Recycle Center.
-- Silo Surprises is to the Northeast of Pandora Jewelry.
-- Construction Convenience is to the Northwest of Pandora Jewelry.
-- Construction Convenience is to the Southwest of Silo Surprises.
-- Panda's Pottery is to the Southwest of Kite's Kitchen Knives.
-- Panda's Pottery is to the Southeast of Silo Surprises.
-
-So all the objects are the six I listed. Now, I need to figure out their relative positions. Let's try to sketch a coordinate system or use a grid to place them.
-
-Let's assign coordinates to make it easier. Let's assume Recycle Center is at the origin (0,0) for simplicity. Then we can figure out the positions of others based on directions.
-
-First, directions:
-
-- Northeast (NE) would be up and to the right (positive x and y if we consider East as x, North as y)
-- Southwest (SW) would be down and left (negative x and y)
-- Northwest (NW) is up and left (negative x, positive y)
-- Southeast (SE) is down and right (positive x, negative y)
-
-Wait, actually, standard coordinates might have East as positive x, North as positive y. So:
-
-- NE: +x, +y
-- SE: +x, -y
-- SW: -x, -y
-- NW: -x, +y
-
-But maybe it's better to use a grid where moving NE means increasing both x and y, etc. Let's proceed.
-
-Let me set Recycle Center (RC) at (0,0).
-
-Kite's Kitchen Knives (KKK) is NE of RC. So KKK is at (a, b) where a>0, b>0. Let's say for simplicity, maybe (1,1) but exact coordinates might not matter as long as relative positions are correct.
-
-Pandora Jewelry (P) is SW of RC. So P would be at (-c, -d), c>0, d>0. Also, P is SW of KKK. Since KKK is NE of RC, P being SW of KKK would mean that P is to the SW of KKK's position. So if KKK is at (1,1), then P should be at some (-something, -something) relative to KKK. Wait, but P is also SW of RC. So RC is (0,0), so SW of RC is negative x and y. So P is at (-x, -y), x>0, y>0.
-
-Silo Surprises (SS) is NW of RC. So NW of (0,0) would be (-a, b), a>0, b>0. Also, SS is NE of P. Since P is SW of RC, SS being NE of P would mean SS is to the NE of P's position. So if P is at (-x, -y), then SS would be at (-x + a, -y + b), but also SS is NW of RC (0,0), so SS must be at (-c, d), c>0, d>0.
-
-Construction Convenience (CC) is NW of P. So if P is at (-x, -y), then CC would be NW of that, so (-x - a, -y + b). Also, CC is SW of SS. SS is at (-c, d), so SW of SS would be (-c - e, d - f). Hmm, this is getting complicated. Maybe using a grid step by step.
-
-Let me try to place RC at (0,0).
-
-KKK is NE of RC: so KKK is at (1,1) maybe.
-
-P is SW of RC: so P is at (-1,-1).
-
-But also, P is SW of KKK. If KKK is at (1,1), then SW of KKK would be (1 - a, 1 - b), so to be SW, x decreases, y decreases. So P should be at (less than 1, less than 1). But since P is SW of RC (0,0), it's at (-1,-1), which is SW of RC. So (-1,-1) is SW of (1,1)? Let's see: from KKK (1,1), moving SW would be towards lower x and lower y, so yes, (-1,-1) is SW of (1,1). So that works.
-
-Now, SS is NW of RC (0,0), so SS should be at (-1,1) perhaps.
-
-Also, SS is NE of P. P is at (-1,-1). NE of P would be increasing x and y, so (-1 + a, -1 + b). If SS is at (-1,1), then from P (-1,-1), moving NE would be to (-1 + 0, -1 + 2), so yes, (-1,1) is NE of (-1,-1). So that works. So SS is at (-1,1).
-
-Next, Construction Convenience (CC) is NW of P. P is at (-1,-1). NW of P would be (-1 - a, -1 + b). Let's say CC is at (-2,0), but also CC is SW of SS. SS is at (-1,1). SW of SS would be (-1 - a, 1 - b). So CC must be SW of SS, so CC's coordinates should be less x (more west) and less y (more south) than SS. If SS is at (-1,1), then CC should be at, say, (-2,0), which is SW of (-1,1). Also, CC is NW of P (-1,-1). NW of P would be more west (lower x) and more north (higher y). So from P (-1,-1), moving NW: x decreases (more negative), y increases. So CC at (-2, 0) would be NW of P (-1,-1), since x is -2 < -1, y is 0 > -1. And CC (-2,0) is SW of SS (-1,1), since x=-2 < -1, y=0 < 1. That works. So CC is at (-2,0).
-
-Now, Panda's Pottery (PP) is SW of KKK. KKK is at (1,1), so SW of KKK would be (1 - a, 1 - b). So PP is somewhere with lower x and lower y than KKK. Also, PP is SE of SS. SS is at (-1,1). SE of SS would be higher x and lower y. So PP should be to the SE of SS. SS is at (-1,1), so SE would be (-1 + a, 1 - b). So PP's coordinates should have x > -1 (more east) and y < 1 (more south).
-
-Given that PP is SW of KKK (1,1), so PP's x < 1, y < 1. And PP is SE of SS (-1,1), so x > -1, y < 1. So combining these, PP is in x between -1 and 1, y < 1.
-
-Let's try to assign coordinates. Let's say PP is at (0,0). Wait, but RC is at (0,0). Wait, no, RC is Recycle Center at (0,0). So PP can't be there. Let's pick another point. Maybe PP is at (0, -1). Let's check.
-
-PP at (0, -1):
-
-- SW of KKK (1,1): yes, 0 < 1 and -1 < 1.
-
-- SE of SS (-1,1): SE would be x > -1 (0 > -1) and y < 1 (-1 < 1). Yes, that works.
-
-So PP is at (0, -1).
-
-Wait, but let's confirm all other points. Let's recap:
-
-RC: (0,0)
-
-KKK: (1,1) [NE of RC]
-
-P: (-1,-1) [SW of RC and SW of KKK]
-
-SS: (-1,1) [NW of RC and NE of P]
-
-CC: (-2,0) [NW of P and SW of SS]
-
-PP: (0,-1) [SW of KKK and SE of SS]
-
-Wait, let's check PP being SE of SS. SS is (-1,1). PP at (0,-1). To get from SS to PP, you go East (x from -1 to 0) and South (y from 1 to -1), so yes, that's Southeast direction. Correct.
-
-Now, we need to find how many objects are in the Southeast of PP. PP is at (0,-1). Southeast of PP would be points with x > 0 and y < -1.
-
-Wait, Southeast direction from PP: increasing x (east) and decreasing y (south), so yes, x > 0, y < -1.
-
-Now, let's list all the objects and their coordinates:
-
-- RC: (0,0)
-
-- KKK: (1,1)
-
-- P: (-1,-1)
-
-- SS: (-1,1)
-
-- CC: (-2,0)
-
-- PP: (0,-1)
-
-Now, check which of these are in the Southeast of PP (0,-1). Southeast means x > 0 and y < -1.
-
-Check each object:
-
-RC: (0,0). x=0 not >0, y=0 not < -1. No.
-
-KKK: (1,1). x=1>0, y=1 not < -1. No.
-
-P: (-1,-1). x=-1 <0, so no.
-
-SS: (-1,1). No.
-
-CC: (-2,0). No.
-
-PP: itself. Not counted.
-
-Wait, none of the other objects have x >0 and y < -1. Wait, but hold on. Wait, is there any other object?
-
-Wait, the objects are RC, KKK, P, SS, CC, PP. So all six. So according to this coordinate system, none of the other objects are in the SE of PP.
-
-But wait, let me check again. PP is at (0,-1). Southeast of PP would be East (x increasing) and South (y decreasing). So any point with x > 0 and y < -1.
-
-Looking at all coordinates:
-
-RC: (0,0) – x=0 not >0.
-
-KKK: (1,1) – y=1 not < -1.
-
-P: (-1,-1) – x negative.
-
-SS: (-1,1) – nope.
-
-CC: (-2,0) – nope.
-
-PP: (0,-1) – itself.
-
-So none of the other objects are in SE of PP. So answer would be A.0?
-
-But wait, wait. Wait, maybe my coordinate assignments are wrong. Let's double-check.
-
-Wait, let's verify all the given conditions with these coordinates.
-
-Check:
-
-- KKK (1,1) is NE of RC (0,0): yes.
-
-- P (-1,-1) is SW of RC: yes.
-
-- P is SW of KKK: from KKK (1,1), SW would be towards lower x and y, so (-1,-1) is SW of (1,1), yes.
-
-- SS (-1,1) is NW of RC: yes, NW of (0,0) is (-x, y).
-
-- SS is NE of P (-1,-1): NE would be higher x and y. SS is at (-1,1), so x same as P, y higher. Wait, from P (-1,-1), moving NE would go to higher x and higher y. So (-1 + a, -1 + b). SS is at (-1,1), which is x same (-1), y increased from -1 to 1. So it's directly North, not NE. Hmm, maybe that's a problem.
-
-Wait, NE direction should be both East and North. So if you're at P (-1,-1), NE would be increasing x (east) and increasing y (north). So SS should be to the NE of P, meaning x > -1 and y > -1. But SS is at (-1,1), which has x = -1 (same as P), y = 1 > -1. So it's North, not NE. So maybe my coordinate for SS is wrong.
-
-Ah, here's a mistake. If SS is NE of P, then it must be both East and North of P. So SS's x should be greater than P's x, and y greater than P's y.
-
-P is at (-1,-1), so SS must be at (x > -1, y > -1). But SS is also NW of RC (0,0). NW of RC would be x < 0, y > 0.
-
-So combining, SS must be in x < 0, y > 0 (NW of RC), and x > -1 (since x > P's x=-1), y > -1 (but y >0 already satisfies that). So SS is at (-0.5, 0.5) maybe. Let's adjust coordinates.
-
-Let's reset with better coordinates.
-
-Let me try again.
-
-Set RC at (0,0).
-
-KKK is NE of RC: let's put KKK at (2,2).
-
-P is SW of RC: so P is at (-2,-2).
-
-But also P is SW of KKK. SW of KKK (2,2) would be x < 2, y < 2. P at (-2,-2) is indeed SW of (2,2), so that's okay.
-
-SS is NW of RC: so SS is at (-2,2) maybe.
-
-Also, SS is NE of P. P is at (-2,-2). NE of P would be x > -2, y > -2. SS at (-2,2): x = -2 (not greater than -2), y = 2 > -2. So it's directly North, not NE. So to be NE, SS must have x > -2 and y > -2. So let's put SS at (-1, 1). Check:
-
-SS (-1,1):
-
-- NW of RC (0,0): yes, x=-1 <0, y=1>0.
-
-- NE of P (-2,-2): x=-1 > -2, y=1 > -2. Yes, that's NE. Good.
-
-Construction Convenience (CC) is NW of P. P is (-2,-2). NW of P would be x < -2, y > -2. Also, CC is SW of SS (-1,1). SW of SS would be x < -1, y < 1.
-
-So CC must be x < -2 (from NW of P), and x < -1 (from SW of SS), which is satisfied. And y > -2 (from NW of P), y < 1 (from SW of SS). So CC could be at (-3, 0), for example.
-
-Check:
-
-CC (-3,0):
-
-- NW of P (-2,-2): x=-3 < -2, y=0 > -2. Yes.
-
-- SW of SS (-1,1): x=-3 < -1, y=0 < 1. Yes.
-
-Good.
-
-Now, Panda's Pottery (PP) is SW of KKK (2,2), so x < 2, y < 2.
-
-PP is SE of SS (-1,1). SE of SS means x > -1, y < 1.
-
-So PP must satisfy x > -1 and y < 1 (from SE of SS), and x < 2, y < 2 (from SW of KKK). So combining, x in (-1, 2), y < 1.
-
-Let's choose PP at (0, 0). Wait, but RC is at (0,0). Can't have two objects at same spot. So PP at (0, -1), for example.
-
-Check:
-
-PP (0, -1):
-
-- SW of KKK (2,2): yes, 0 < 2, -1 < 2.
-
-- SE of SS (-1,1): x=0 > -1, y=-1 < 1. Yes.
-
-Good.
-
-Now, list all coordinates:
-
-RC: (0,0)
-
-KKK: (2,2)
-
-P: (-2,-2)
-
-SS: (-1,1)
-
-CC: (-3,0)
-
-PP: (0,-1)
-
-Now, check all conditions again.
-
-- KKK NE of RC: (2,2) is NE of (0,0): yes.
-
-- P SW of RC: (-2,-2) is SW: yes.
-
-- P SW of KKK: (-2,-2) is SW of (2,2): yes.
-
-- SS NW of RC: (-1,1) is NW: yes.
-
-- SS NE of P: SS (-1,1) is NE of P (-2,-2): yes, x increased from -2 to -1, y increased from -2 to 1.
-
-- CC NW of P: CC (-3,0) is NW of P (-2,-2): x decreased, y increased: yes.
-
-- CC SW of SS: CC (-3,0) is SW of SS (-1,1): x decreased, y decreased: yes.
-
-- PP SW of KKK: (0,-1) is SW of (2,2): yes.
-
-- PP SE of SS: (0,-1) is SE of (-1,1): x increased (from -1 to 0), y decreased (from 1 to -1): yes.
-
-All conditions satisfied. Now, we need to find how many objects are in the Southeast of PP (0,-1).
-
-Southeast of PP means x > 0 (East) and y < -1 (South). So looking for objects with x > 0 and y < -1.
-
-Check all objects:
-
-RC: (0,0) → x=0 not >0, y=0 not < -1. No.
-
-KKK: (2,2) → y=2 not < -1. No.
-
-P: (-2,-2) → x negative. No.
-
-SS: (-1,1) → No.
-
-CC: (-3,0) → No.
-
-PP: itself.
-
-So none of the other objects meet x >0 and y < -1. Wait, but hold on. Is there any other object? Let's confirm the list again. The objects are RC, KKK, P, SS, CC, PP. Six objects total.
-
-Wait, but wait, what about the Recycle Center? It's at (0,0), which is not in SE of PP.
-
-Hmm. So according to this, the answer would be A.0.
-
-But wait, let's make sure we didn't miss any other positions. Wait, maybe my coordinate assignment is still off.
-
-Wait, another way: maybe use a grid.
-
-Let me try to draw a rough grid mentally.
-
-Let's consider directions:
-
-- RC at center.
-
-- KKK to NE of RC.
-
-- P to SW of RC.
-
-- SS to NW of RC.
-
-So RC is center.
-
-NE: KKK
-
-SW: P
-
-NW: SS
-
-Then, P is also SW of KKK. So from KKK, going SW leads to P. Which would make sense if KKK is NE of RC, and P is SW of RC, so P is SW of RC, which is SW of KKK's position.
-
-Similarly, SS is NW of RC and NE of P. So SS is NW of RC, and P is SW of RC, so SS being NE of P makes sense.
-
-CC is NW of P and SW of SS.
-
-PP is SW of KKK and SE of SS.
-
-Let me try to sketch this.
-
-Imagine RC in the middle.
-
-- KKK: NE of RC
-
-- SS: NW of RC
-
-- P: SW of RC
-
-Now, from RC, going SW is P, so P is below and left of RC.
-
-From RC, going NW is SS, so SS is above and left.
-
-KKK is NE, so above and right.
-
-Now, P is SW of KKK. So KKK is NE of RC, so to get to P from KKK, you go SW, which would be towards RC and beyond. So P is SW of KKK.
-
-SS is NE of P. So from P (SW of RC), moving NE gets to SS (NW of RC). So SS is between P and RC in the north direction.
-
-CC is NW of P and SW of SS. So from P, NW is CC; from SS, SW is CC. So CC is northwest of P and southwest of SS.
-
-PP is SW of KKK and SE of SS.
-
-Let's try to place them on a grid with RC at (0,0):
-
-- RC: (0,0)
-
-- KKK: (1,1) [NE]
-
-- SS: (-1,1) [NW]
-
-- P: (-1,-1) [SW]
-
-Wait, but earlier we saw that SS (-1,1) is not NE of P (-1,-1), it's directly North. To be NE, SS should be east of P. So maybe P is further west.
-
-Let me try:
-
-RC: (0,0)
-
-KKK: (2,2)
-
-SS: (-1,1) [NW of RC]
-
-P: (-2,-2) [SW of RC]
-
-Now, check if P is SW of KKK: KKK (2,2) to P (-2,-2) is SW, yes.
-
-SS (-1,1) NE of P (-2,-2): yes, because x increased from -2 to -1, y increased from -2 to 1.
-
-CC is NW of P (-2,-2), so CC would be at (-3, 0) maybe.
-
-CC is SW of SS (-1,1), so CC should be west and south of SS. (-3,0) is west of -1 and south of 1, yes.
-
-PP is SW of KKK (2,2), so somewhere like (1,0) or (0,-1). Also SE of SS (-1,1). SE of SS means east and south, so x > -1, y < 1.
-
-If PP is at (0,0), but RC is there. So PP at (0,-1).
-
-Check:
-
-PP (0,-1):
-
-- SW of KKK (2,2): yes.
-
-- SE of SS (-1,1): x=0 > -1, y=-1 <1: yes.
-
-Now, coordinates:
-
-RC: (0,0)
-
-KKK: (2,2)
-
-SS: (-1,1)
-
-P: (-2,-2)
-
-CC: (-3,0)
-
-PP: (0,-1)
-
-Now, Southeast of PP (0,-1) is x >0, y < -1.
-
-Check all objects:
-
-RC: (0,0) – nope.
-
-KKK: (2,2) – y=2 not < -1.
-
-SS: (-1,1) – nope.
-
-P: (-2,-2) – x negative.
-
-CC: (-3,0) – nope.
-
-So no objects in SE of PP. So answer A.0.
-
-But wait, the options include D.1. Maybe I made a mistake.
-
-Wait, another thought: maybe "Southeast of Panda's Pottery" includes the direction, but depending on the exact grid, maybe one of the points falls there.
-
-Wait, in our coordinates, is there any object with x>0 and y<-1? Let's see:
-
-RC is at (0,0), KKK at (2,2), others are all at y >= -2 but:
-
-P is at (-2,-2), y=-2 which is less than -1, but x=-2 <0.
-
-So P is SW of PP? PP is at (0,-1). P is at (-2,-2). To get from PP to P: West and South. So P is SW of PP, not SE.
-
-Wait, SE of PP would be East and South. So x >0, y < -1.
-
-Is there any object there? Let's see:
-
-If we consider that maybe Recycle Center is at (0,0), and maybe another object... Wait, no, all objects are listed.
-
-Wait, maybe I messed up the direction of SE. Southeast is down and to the right, so if PP is at (0,-1), then SE would be increasing x and decreasing y. So y < -1.
-
-Looking at all objects, P is at (-2,-2), which has y=-2 < -1, but x=-2 <0, so it's SW of PP, not SE.
-
-Is there any other object with y < -1 and x >0? In our coordinate system, no.
-
-Wait, unless I assigned PP incorrectly.
-
-Wait, let's see: PP is SW of KKK. If KKK is at (2,2), SW could be (1,1), but that's NE of RC. Wait, no, SW would be lower x and y. So (1,0), (0,-1), etc.
-
-Wait, maybe PP is at (1,-1). Let's try that.
-
-PP at (1,-1):
-
-- SW of KKK (2,2): yes, x=1<2, y=-1<2.
-
-- SE of SS (-1,1): x=1 > -1, y=-1 <1: yes.
-
-Now, check Southeast of PP (1,-1): x >1, y < -1.
-
-Objects:
-
-RC: (0,0) – no.
-
-KKK: (2,2) – y=2 not < -1.
-
-SS: (-1,1) – no.
-
-P: (-2,-2) – x=-2 <1.
-
-CC: (-3,0) – no.
-
-PP: (1,-1).
-
-Still, no objects in x>1 and y<-1.
-
-Wait, P is at (-2,-2), y=-2 < -1, but x negative.
-
-So still none.
-
-Hmm. Maybe the answer is A.0. But let's check again.
-
-Wait, another way: list all the objects and their positions relative to PP.
-
-We need to find which objects are SE of PP.
-
-Given the relations:
-
-- PP is SW of KKK. So KKK is NE of PP.
-
-- PP is SE of SS. So SS is NW of PP.
-
-- P is SW of RC, and RC is... Not sure.
-
-Wait, maybe using the relative directions without coordinates.
-
-Let's try to make a mental map.
-
-Start with RC.
-
-- KKK is NE of RC.
-
-- SS is NW of RC.
-
-- P is SW of RC.
-
-Now, from KKK (NE of RC), P is SW of KKK. So P is located SW of KKK, which would be towards RC and beyond SW.
-
-SS is NW of RC, and NE of P. So SS is located NE of P, which would be towards RC and beyond NW.
-
-CC is NW of P and SW of SS. So CC is NW of P (so further NW from P) and SW of SS (so further SW from SS).
-
-PP is SW of KKK and SE of SS.
-
-So PP is located SW of KKK (so between KKK and RC, or beyond RC towards SW), and SE of SS (so between SS and RC, or beyond RC towards SE).
-
-Wait, SE of SS: SS is NW of RC, so SE of SS would be towards RC and then further SE.
-
-Similarly, SW of KKK: KKK is NE of RC, so SW of KKK would be towards RC and further SW.
-
-So PP is somewhere near RC's south-east side?
-
-Wait, if SS is NW of RC, SE of SS would be towards RC and beyond SE. So PP is SE of SS, meaning PP is to the SE of SS, so closer to RC's SE side.
-
-And PP is SW of KKK, so from KKK (NE of RC), going SW leads to PP, which would be towards RC's south.
-
-So PP is south of RC, east of SS (since SE of SS), and west of KKK.
-
-Now, to find objects SE of PP.
-
-SE of PP would be further east and south from PP.
-
-Which objects could be there?
-
-RC is at the center. If PP is south of RC, then SE of PP would be south-east of RC.
-
-KKK is NE of RC, so not SE of PP.
-
-P is SW of RC, so if PP is south of RC, P is further SW, so P is SW of PP, not SE.
-
-SS is NW of RC, so north-west of PP.
-
-CC is NW of P, so very west.
-
-So maybe none.
-
-Wait, but let's see:
-
-Objects:
-
-- RC: center.
-
-- KKK: NE.
-
-- SS: NW.
-
-- P: SW.
-
-- CC: NW of P, so even further west and north of P.
-
-- PP: SW of KKK and SE of SS.
-
-Assuming PP is south-east of RC (since SE of SS which is NW of RC), so PP is in the south-east quadrant relative to RC.
-
-Wait, no: SS is NW of RC, so SE of SS would be towards RC and then SE of RC. So PP is SE of SS, so PP is east and south of SS. Since SS is NW of RC, PP could be in the south-east of RC.
-
-Wait, maybe RC is at the center, PP is south-east of RC.
-
-If PP is south-east of RC, then SE of PP would be further south-east.
-
-But are there any objects there? The only other object in that general area is... KKK is NE of RC, so not SE. P is SW of RC, so opposite. So no.
-
-Wait, unless Recycle Center itself is considered, but RC is north-west of PP if PP is south-east of RC.
-
-Wait, this is confusing. Maybe better to use the coordinate system where PP is at (0,-1), RC at (0,0), so RC is north of PP. Then SE of PP would be east and south. The only possible object there would be... but all others are either north, west, or north-west.
-
-Wait, in our coordinate system, no objects are there. So answer A.0.
-
-But why does option D.1 exist? Maybe I made a mistake.
-
-Wait, let's check again all the given information.
-
-"Panda's Pottery is to the Southwest of Kite's Kitchen Knives. Panda's Pottery is to the Southeast of Silo Surprises."
-
-So PP is SW of KKK and SE of SS.
-
-We have SS is NW of RC, and PP is SE of SS, so PP is east and south of SS.
-
-KKK is NE of RC, PP is SW of KKK, so PP is west and south of KKK.
-
-So PP is located south of RC, east of SS, west of KKK.
-
-Now, let's list all objects' positions relative to PP.
-
-We need to find which are SE of PP.
-
-SE of PP means they are east and south of PP.
-
-Which objects could be there?
-
-- Recycle Center: If PP is south of RC, then RC is north of PP. Not SE.
-
-- KKK: NE of RC, so if PP is south of RC, KKK is north-east of PP. Not SE.
-
-- P: SW of RC. If PP is south of RC, then P is further SW, so P is SW of PP.
-
-- SS: NW of RC, and PP is SE of SS, so SS is NW of PP.
-
-- CC: NW of P, which is SW of RC, so CC is further west and north. So CC is NW of PP.
-
-So all other objects are either NW, N, W, SW, but none are SE of PP.
-
-Therefore, the number is 0. So answer A.
-
-But let me check once more for errors.
-
-Wait, another angle: maybe "Southeast" is considered as the direction, not strictly the quadrant. For example, if an object is directly south, is it considered SE? No, Southeast is specifically the diagonal direction. So it has to be both east and south.
-
-In our coordinate system, no objects are both east and south of PP.
-
-Hence, the answer should be A.0.
-
-But the options are given, and maybe I messed up.
-
-Wait, wait a second. Let's check the original problem statement again:
-
-"Construction Convenience is to the Northwest of Pandora Jewelry. Construction Convenience is to the Southwest of Silo Surprises."
-
-So CC is NW of P and SW of SS.
-
-"Panda's Pottery is to the Southwest of Kite's Kitchen Knives. Panda's Pottery is to the Southeast of Silo Surprises."
-
-So PP is SW of KKK and SE of SS.
-
-Now, let's try to build the map step by step.
-
-1. Place Recycle Center (RC).
-
-2. KKK is NE of RC.
-
-3. P is SW of RC.
-
-4. P is SW of KKK (so KKK is NE of P, which matches since P is SW of RC and KKK is NE of RC).
-
-5. SS is NW of RC.
-
-6. SS is NE of P (so SS is NE of P, which makes sense as SS is NW of RC and P is SW of RC).
-
-7. CC is NW of P and SW of SS.
-
-8. PP is SW of KKK and SE of SS.
-
-Now, let's use a grid with RC at the center.
-
-- RC: Center.
-
-- KKK: NE of RC.
-
-- P: SW of RC.
-
-- SS: NW of RC.
-
-Now, SS is NE of P. So from P, moving NE gets to SS. Since P is SW of RC, SS must be between P and RC towards the NW.
-
-CC is NW of P (so further NW from P) and SW of SS (so further SW from SS). So CC is located west of SS and south of SS? Wait, SW of SS would be south and west.
-
-Wait, SS is NW of RC. CC is SW of SS, so CC is south and west of SS.
-
-But CC is also NW of P. Since P is SW of RC, CC being NW of P would be west and north of P.
-
-So imagine:
-
-- SS is at NW of RC.
-
-- CC is SW of SS (so below and left of SS) and NW of P (so above and left of P). Given P is SW of RC, this places CC to the left of both SS and P.
-
-PP is SW of KKK (so below and left of KKK) and SE of SS (so below and right of SS).
-
-So PP is located to the right of SS (since SE of SS) and left of KKK (SW of KKK).
-
-Now, to find objects SE of PP.
-
-SE of PP would be to the right and down from PP.
-
-What's to the right and down from PP?
-
-KKK is NE of RC, PP is SW of KKK, so KKK is NE of PP, not SE.
-
-P is SW of RC, PP is SE of SS and SW of KKK. Depending on exact positions, P might be SW of PP.
-
-RC is north of PP (since PP is south of RC? If PP is SE of SS which is NW of RC, PP could be south of RC).
-
-Wait, if SS is NW of RC, then SE of SS would go through RC to the SE. So PP is SE of SS, so PP is east and south of SS, possibly south of RC.
-
-If PP is south of RC, then RC is north of PP.
-
-KKK is NE of RC, so KKK is north-east of RC, thus north-east of PP.
-
-P is SW of RC, so south-west of RC, thus south-west of PP.
-
-CC is NW of P, so further west and north, so north-west of PP.
-
-SS is NW of RC, so north-west of PP.
-
-So all other objects are either NW, N, W, SW, NE of PP. None are SE.
-
-Thus, no objects are in the SE of PP. Answer A.0.
-
-I think this confirms it. So the answer should be A.
-</think>
-Let me solve this step by step using the structured format:
-<format>
->>> STEP 1: PARSE RELATIONSHIPS
-    - A is to the DIRECTION of B
-    [... list all given relationships ...]
-
->>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
-    - Looking for: [target relationship / direction / count]
-    - [reasoning about the relationships]
-    - [use reversibility and transitivity as needed]
-
->>> STEP 3: ANSWER
-    - [state conclusion]
-
->>> FINAL ANSWER: [answer text]
-    \boxed{LETTER}
-</format>
->>> STEP 1: PARSE RELATIONSHIPS
-- Recycle Center (RC) is in the map.
-- Kite's Kitchen Knives (KKK) is to the Northeast of RC.
-- Pandora Jewelry (P) is to the Southwest of RC.
-- P is to the Southwest of KKK.
-- Silo Surprises (SS) is to the Northwest of RC.
-- SS is to the Northeast of P.
-- Construction Convenience (CC) is to the Northwest of P.
-- CC is to the Southwest of SS.
-- Panda's Pottery (PP) is to the Southwest of KKK.
-- PP is to the Southeast of SS.
-
->>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
-We'll use a coordinate system with RC at (0,0) for simplicity. Directions:
-- Northeast (NE): (x+1, y+1)
-- Southwest (SW): (x-1, y-1)
-- Northwest (NW): (x-1, y+1)
-- Southeast (SE): (x+1, y-1)
-
-Assign coordinates based on given relationships:
-- RC = (0,0)
-- KKK is NE of RC → KKK = (1,1)
-- P is SW of RC → P = (-1,-1)
-- P is SW of KKK → P = (-1,-1) is valid (from (1,1), SW is (-1,-1))
-- SS is NW of RC → SS = (-1,1)
-- SS is NE of P → SS = (-1,1) is valid (from (-1,-1), NE is (-1,1))
-- CC is NW of P → CC = (-2,0)
-- CC is SW of SS → CC = (-2,0) is valid (from (-1,1), SW is (-2,0))
-- PP is SW of KKK → PP = (0,-1)
-- PP is SE of SS → PP = (0,-1) is valid (from (-1,1), SE is (0,-1))
-
-Coordinates:
-- RC: (0,0)
-- KKK: (1,1)
-- P: (-1,-1)
-- SS: (-1,1)
-- CC: (-2,0)
-- PP: (0,-1)
-
-Now, the question: **How many objects are in the Southeast of Panda's Pottery (PP)?**
-
-Southeast of PP means:
-- x > 0 (East of PP)
-- y < -1 (South of PP)
-
-Check each object:
-- RC: (0,0) → x = 0 (not > 0), y = 0 (not < -1) → ❌
-- KKK: (1,1) → x = 1 (> 0), y = 1 (not < -1) → ❌
-- P: (-1,-1) → x = -1 (not > 0), y = -1 (not < -1) → ❌
-- SS: (-1,1) → x = -1 (not > 0), y = 1 (not < -1) → ❌
-- CC: (-2,0) → x = -2 (not > 0), y = 0 (not < -1) → ❌
-- PP: (0,-1) → self, not counted → ❌
-
-No objects satisfy both x > 0 and y < -1.
-
->>> STEP 3: ANSWER
-All relationships are consistent with the coordinate assignments, and no objects lie in the Southeast of PP.
-
->>> FINAL ANSWER: \boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1499.txt b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1499.txt
deleted file mode 100644
index ae120f5a..00000000
--- a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_1499.txt
+++ /dev/null
@@ -1,843 +0,0 @@
-Okay, let's try to figure out this problem step by step. So, we have a map with several objects, and we need to determine how many are north of Craft Creation Station. The answer choices are A.4, B.1, C.3, D.0. Let's break down all the given information first.
-
-First, let's list out all the objects mentioned:
-
-- Miniature Golf Mart (let's call this M)
-- Lynx's Luxury Linens (L)
-- Craft Creation Station (C)
-- Basil's Bakery (B)
-- Molly's Music Store (Mm)
-- Iris's Ice Skates (I)
-
-Wait, hold on, there's a possible confusion with the initials. The user mentioned "M" for Miniature Golf Mart and "Molly's Music Store" also starts with M. Maybe I should use different abbreviations to avoid confusion. Let's see:
-
-Miniature Golf Mart: M
-Lynx's Luxury Linens: L
-Craft Creation Station: C
-Basil's Bakery: Ba
-Molly's Music Store: Mm
-Iris's Ice Skates: I
-
-Okay, that should work. Now, let's parse each statement one by one.
-
-1. "Miniature Golf Mart is in the map." So M is a point.
-
-2. "Lynx's Luxury Linens is to the Northeast of Miniature Golf Mart." So L is NE of M.
-
-3. "Craft Creation Station is to the Northwest of Miniature Golf Mart." So C is NW of M.
-
-4. "Craft Creation Station is to the Southwest of Lynx's Luxury Linens." So C is SW of L. Wait, this is important. So if L is NE of M, and C is SW of L, then C must be positioned such that from L, moving SW gets to C. But also, C is NW of M. Hmm. Let's try to sketch a rough grid mentally.
-
-Let me assign coordinates to make it easier. Let's assume M is at the origin (0,0). Then:
-
-- L is NE of M, so L's coordinates would be (positive x, positive y). Let's say L is at (1,1) for simplicity.
-
-- C is NW of M. NW of M (0,0) would be negative x, positive y. Wait, no: NW is left and up from M. Wait, standard compass directions: if you're facing north, east is to your right, west to your left. So NW would be up and left from M. So if M is at (0,0), then NW would be (-x, +y). So C is NW of M, so C's coordinates would be something like (-1,1).
-
-But also, C is SW of L. SW of L would be moving down and left from L. If L is at (1,1), then SW would be (1 - a, 1 - b) where a,b positive. So if C is at (-1,1), then from L (1,1), moving SW to (-1,1) would be moving left 2 units, same y. Wait, but SW is both south and west, so y should decrease. Hmm, maybe my coordinate system needs adjustment.
-
-Wait, perhaps using a grid where north is up, east is right. So:
-
-- North: +y
-- South: -y
-- East: +x
-- West: -x
-
-So, Northeast of M would be +x and +y. So L is at (x, y) where x>0, y>0 relative to M.
-
-Craft Creation Station (C) is NW of M: NW is -x and +y. So C is at (-x, +y) relative to M.
-
-Also, C is SW of L. SW of L would be -x and -y relative to L. So if L is at (a, b), then C is at (a - c, b - d) where c,d >0.
-
-But we also have C at (-e, f) relative to M (which is at (0,0)), so coordinates of C are (-e, f), and coordinates of L are (g, h), since L is NE of M, so g>0, h>0.
-
-Given that C is SW of L: so C's coordinates = L's coordinates minus some east and north? Wait, SW from L would be moving west (so decrease x) and south (decrease y). So yes, C = (Lx - w, Ly - s), where w,s >0.
-
-So:
-
-C_x = Lx - w
-
-C_y = Ly - s
-
-But C is NW of M (0,0), so C_x < 0, C_y > 0.
-
-Since Lx >0 (as L is NE of M), then Lx - w <0 => w > Lx.
-
-Also, C_y = Ly - s >0 => s < Ly.
-
-Hmm. Let's assign coordinates to make this concrete.
-
-Let's place M at (0,0).
-
-Let's assume L is at (2,2) (NE of M, so x=2, y=2).
-
-C is NW of M, so C must be at (-a, b), where a>0, b>0.
-
-C is also SW of L. SW of L (2,2) would be moving west (x decreases) and south (y decreases). So C's coordinates should be (2 - w, 2 - s), where w>0, s>0.
-
-But C's coordinates are also (-a, b). So:
-
-2 - w = -a => w = 2 + a
-
-2 - s = b => s = 2 - b
-
-Since s must be positive (as we move south from L), 2 - b >0 => b < 2.
-
-Also, since C is NW of M, b must be positive (as it's north), so 0 < b < 2.
-
-Let's pick b=1 for simplicity. Then s=1. So C would be at (2 - w, 1). But since C is NW of M, x-coordinate negative. So 2 - w <0 => w >2. Let's take w=3, so C_x = 2 - 3 = -1. So C is at (-1, 1). That works.
-
-So coordinates so far:
-
-M: (0,0)
-
-L: (2,2)
-
-C: (-1,1)
-
-Check: L is NE of M (yes, 2,2), C is NW of M (-1,1: yes, left and up from M). C is SW of L: from L (2,2) moving SW (left 3, down 1) gets to (-1,1). Yes, that works.
-
-Okay, moving on.
-
-Next: "Basil's Bakery is to the Southeast of Lynx's Luxury Linens. Basil's Bakery is to the Northeast of Miniature Golf Mart."
-
-So Ba is SE of L and NE of M.
-
-SE of L (2,2) would be +x, -y from L. So Ba's coordinates: (2 + e, 2 - f), e,f >0.
-
-NE of M (0,0) means Ba is (g, h), g>0, h>0.
-
-So combining, Ba must be in the first quadrant (NE of M), and SE of L. So since L is at (2,2), SE of L would be east and south, so x >2, y <2.
-
-So Ba's coordinates: x >2, y <2, and since NE of M (which it is, since x>0, y>0 as long as y>0). Let's pick coordinates. Let's say Ba is at (3,1). Check: SE of L (2,2): yes, east 1, south 1. NE of M: yes, 3,1 is NE of (0,0). Okay.
-
-Next: "Molly's Music Store is to the Northeast of Miniature Golf Mart. Molly's Music Store is to the Southeast of Lynx's Luxury Linens."
-
-Mm is NE of M (so x>0, y>0) and SE of L (so x > Lx, y < Ly). Since L is at (2,2), SE of L would be x >2, y <2. So Mm must be in (x>2, y<2). But also NE of M, which it is as long as x>0, y>0. So Mm is in that same area as Ba? Wait, but Ba is also SE of L and NE of M. Wait, but the problem says "Basil's Bakery is to the Southeast of Lynx's Luxury Linens. Basil's Bakery is to the Northeast of Miniature Golf Mart." So Ba is SE of L and NE of M. Similarly, Mm is NE of M and SE of L. So both Ba and Mm are in the SE of L and NE of M. But they might be in different positions.
-
-Wait, but let's see. Let's assign coordinates. Let's say Mm is at (3, 0.5). So SE of L (2,2): x=3>2, y=0.5<2. NE of M: yes. Ba we had at (3,1). So both Ba and Mm are east of L and south of L, but different y's.
-
-Wait, but the problem also says: "Iris's Ice Skates is to the Northwest of Basil's Bakery. Iris's Ice Skates is to the Northwest of Molly's Music Store."
-
-So I is NW of Ba and NW of Mm.
-
-NW of Ba (3,1) would be x <3, y >1.
-
-NW of Mm (3,0.5) would be x <3, y >0.5.
-
-So I has to be in x <3, y >1 (since NW of Ba requires y >1, as NW is up and left; so y-coordinate higher than Ba's y=1).
-
-So let's say I is at (2, 2). Wait, but L is at (2,2). Wait, can't have two places at same coordinates. Hmm. Let's adjust.
-
-Wait, L is at (2,2), so if I is NW of Ba (3,1), then I's x <3, y >1. Let's pick I at (1, 2). Check: NW of Ba (3,1): from Ba, going west 2 and north 1, yes. NW of Mm (3,0.5): same x difference west 2, north 1.5. So yes. So I is at (1,2).
-
-Wait, but L is at (2,2). So I is at (1,2), which is west of L, same y. So I is NW of Ba and Mm, and at (1,2).
-
-Wait, but let's confirm all positions:
-
-M: (0,0)
-
-L: (2,2)
-
-C: (-1,1)
-
-Ba: (3,1)
-
-Mm: (3, 0.5)
-
-I: (1, 2)
-
-Wait, but let's check all the conditions again.
-
-Check "Iris's Ice Skates is to the Northwest of Basil's Bakery." Basil's Bakery is at (3,1). Northwest would be left and up. So I at (1,2): left 2, up 1. Yes, NW. Good.
-
-"Iris's Ice Skates is to the Northwest of Molly's Music Store." Molly's at (3,0.5). Northwest would be left and up. From Mm (3,0.5), moving to (1,2): left 2, up 1.5. Yes, NW. Correct.
-
-Now, let's recap all coordinates:
-
-- M: (0,0)
-
-- L: (2,2)
-
-- C: (-1,1)
-
-- Ba: (3,1)
-
-- Mm: (3, 0.5)
-
-- I: (1, 2)
-
-Wait, but let's check all the other statements again to make sure.
-
-"Craft Creation Station is to the Southwest of Lynx's Luxury Linens." C is at (-1,1), L is at (2,2). Southwest of L would be moving west and south. From L (2,2), west to x=-1 (3 units west), south to y=1 (1 unit south). So yes, SW. Correct.
-
-"Basil's Bakery is to the Southeast of Lynx's Luxury Linens." Ba at (3,1). From L (2,2), east 1, south 1. SE. Correct.
-
-"Basil's Bakery is to the Northeast of Miniature Golf Mart." Ba (3,1) is NE of M (0,0). Yes.
-
-"Molly's Music Store is to the Northeast of Miniature Golf Mart." Mm (3,0.5) is NE of M. Yes.
-
-"Molly's Music Store is to the Southeast of Lynx's Luxury Linens." Mm (3,0.5) is SE of L (2,2): yes.
-
-Now, the question is: How many objects are in the North of Craft Creation Station?
-
-First, what's Craft Creation Station's position? C is at (-1,1).
-
-"North" would be higher y-coordinate. So any object with y >1.
-
-Let's list all objects' y-coordinates:
-
-- M: (0,0) y=0
-
-- L: (2,2) y=2
-
-- C: (-1,1) y=1
-
-- Ba: (3,1) y=1
-
-- Mm: (3, 0.5) y=0.5
-
-- I: (1,2) y=2
-
-So objects north of C (y >1) would be those with y >1. So L and I have y=2, which is greater than 1. Ba and C have y=1, so not north (assuming north means strictly north, not same latitude). M and Mm have lower y.
-
-Wait, but the question says "in the North of Craft Creation Station". Typically, "north of" would mean strictly north, so higher y. So objects with y > C's y (which is 1). So L and I.
-
-Wait, but let's confirm:
-
-C is at (-1,1). North would be increasing y. So any point with y >1 is north of C.
-
-L is at (2,2): y=2 >1: north.
-
-I is at (1,2): y=2 >1: north.
-
-Ba is at (3,1): same y=1: not north.
-
-M, Mm: lower y.
-
-So that's two objects: L and I.
-
-Wait, but the answer choices are A.4, B.1, C.3, D.0. Hmm, two isn't an option. Did I make a mistake?
-
-Wait, maybe I messed up the coordinate assignments. Let's check again.
-
-Wait, perhaps I made a wrong assumption in coordinates. Let's see.
-
-Wait, another way: Maybe the problem uses a different orientation. Sometimes in some contexts, "north" could be considered as the top of the map, so higher y, which is what I did.
-
-But according to my coordinates, there are two objects north of C: L and I.
-
-But since 2 isn't an option, that suggests I made an error.
-
-Wait, let's check again all the positions.
-
-Wait, let's re-express all the relationships without coordinates.
-
-Start with M (Miniature Golf Mart).
-
-- L is NE of M.
-
-- C is NW of M.
-
-- C is SW of L.
-
-So, if L is NE of M, and C is SW of L, then C must be southwest of L, which is northeast of M. So C is southwest of L, so to get from L to C, you go SW, which would place C in a position that's west and south of L, but since C is also NW of M, which is southwest of L? Wait, maybe drawing a rough sketch.
-
-Imagine M at the center.
-
-NE of M is L.
-
-NW of M is C.
-
-So C is northwest of M, L is northeast of M. Then, C is southwest of L. So if L is NE of M, then SW of L would be towards M and beyond. Wait, if L is NE of M, then SW of L would be towards the direction of M and further south-west. So C is located SW of L and also NW of M. So the position of C must be such that from L, moving SW reaches C, and from M, moving NW reaches C.
-
-So the relative positions: M is southeast of C (since C is NW of M), and L is northeast of C (since C is SW of L).
-
-Wait, perhaps another way: The four cardinal directions.
-
-If C is SW of L, then L is NE of C.
-
-Similarly, C is NW of M, so M is SE of C.
-
-So for the question: objects north of C. So any object that is north of C.
-
-We have L is NE of C (since C is SW of L), so L is north of C (and east). So L is north of C.
-
-Similarly, I is NW of Ba and NW of Mm. Let's see where I is.
-
-But according to my coordinate system, I was at (1,2), which is north of C (-1,1), since y=2 >1.
-
-Wait, but according to the problem statements, is there another object north of C?
-
-Wait, let's list all objects again:
-
-Objects:
-
-- M: Miniature Golf Mart
-
-- L: Lynx's Luxury Linens
-
-- C: Craft Creation Station
-
-- Ba: Basil's Bakery
-
-- Mm: Molly's Music Store
-
-- I: Iris's Ice Skates
-
-So total six objects. Wait, but the question is about how many are north of C.
-
-From my coordinates:
-
-- L is north of C
-
-- I is north of C
-
-- Ba is same latitude (y=1), M and Mm are south.
-
-Wait, but according to the problem's description, are there any others?
-
-Wait, let's verify Iris's position again.
-
-"Iris's Ice Skates is to the Northwest of Basil's Bakery. Iris's Ice Skates is to the Northwest of Molly's Music Store."
-
-So I is NW of Ba and NW of Mm.
-
-In my coordinates, Ba was at (3,1), Mm at (3,0.5). So NW of both would be left and up from both. So yes, I at (1,2) is NW of both.
-
-Now, L is at (2,2), which is east of I (1,2), same y. So I is west of L, same latitude.
-
-But both I and L are north of C.
-
-Wait, but according to this, two objects. But answer choices don't have two. Hmm. Maybe I messed up the coordinate assignments.
-
-Wait, let's try a different coordinate system.
-
-Alternative approach: Let's use a grid where we fix M at (0,0).
-
-Define directions:
-
-- North: up (positive y)
-
-- South: down (negative y)
-
-- East: right (positive x)
-
-- West: left (negative x)
-
-So:
-
-- L is NE of M: so L is at (a, b), a >0, b >0.
-
-- C is NW of M: so C is at (-c, d), c >0, d >0.
-
-- C is SW of L: so from L, moving SW (which is west and south), so C's coordinates = L's coordinates minus (east, north), i.e., C_x = L_x - e, C_y = L_y - f, where e,f >0.
-
-Thus:
-
--c = a - e --> e = a + c
-
-d = b - f --> f = b - d
-
-Since f >0, b > d.
-
-Also, since C is NW of M, d >0 (north of M), so 0 < d < b.
-
-Let's assign specific numbers.
-
-Let’s set M at (0,0).
-
-Let’s choose L at (2, 2) (NE of M).
-
-C must be SW of L, so C_x < L_x=2, C_y < L_y=2.
-
-Also, C is NW of M, so C_x <0, C_y >0.
-
-So C must be at x <0, y between 0 and 2.
-
-Let’s pick C at (-1, 1). So:
-
-C_x = -1 = 2 - e --> e = 3
-
-C_y = 1 = 2 - f --> f =1
-
-So that works.
-
-Now, Basil's Bakery (Ba) is SE of L and NE of M.
-
-SE of L (2,2): x >2, y <2.
-
-NE of M (0,0): x >0, y >0. Which is satisfied since x>2>0, y<2 but still >0 (assuming y>0).
-
-Let's put Ba at (3,1).
-
-Molly's Music Store (Mm) is NE of M and SE of L.
-
-Similarly, SE of L: x >2, y <2.
-
-So Mm could be at (3, 0.5), for example.
-
-Now, Iris's Ice Skates (I) is NW of Ba and NW of Mm.
-
-NW of Ba (3,1): x <3, y >1.
-
-NW of Mm (3,0.5): x <3, y >0.5.
-
-So I must be at x <3, y >1 (since y >1 satisfies y >0.5).
-
-Let's put I at (1, 2).
-
-Now, check all:
-
-- I (1,2) is NW of Ba (3,1): yes, left 2, up 1.
-
-- I (1,2) is NW of Mm (3,0.5): left 2, up 1.5. Yes.
-
-Now, list all y-coordinates:
-
-- M: 0
-
-- L: 2
-
-- C:1
-
-- Ba:1
-
-- Mm:0.5
-
-- I:2
-
-So north of C (y >1): L (2) and I (2). So two objects.
-
-But answer choices don't have 2. Options are 4,1,3,0. Hmm. This suggests an error in my reasoning.
-
-Wait, maybe "north" is defined as directly north, not just any north (i.e., same x-coordinate). But the question says "in the North", which usually means any position north, regardless of east/west.
-
-Wait, but maybe I misread the problem. Let's check again.
-
-Wait, the problem says: "How many objects are in the North of Craft Creation Station?"
-
-If "in the North" means strictly north, i.e., same longitude (x), north of it, but I think in map terms, "north of" usually means having a higher latitude (y), regardless of x.
-
-But according to the problem's given data, perhaps my coordinate assignments are wrong.
-
-Wait, let's try to see if there's another way. Maybe I placed I incorrectly.
-
-Wait, another thought: When it says "Iris's Ice Skates is to the Northwest of Basil's Bakery", which is SE of L. So Basil's Bakery is SE of L, so Iris is NW of Ba, so Iris is NW of a point that's SE of L. So Iris could be in a position relative to L.
-
-Wait, let's try to map all the positions without coordinates.
-
-Start with M.
-
-- L is NE of M.
-
-- C is NW of M.
-
-- C is SW of L.
-
-So imagine M in the center.
-
-L is up-right (NE) of M.
-
-C is up-left (NW) of M.
-
-And C is down-left (SW) from L.
-
-So from L, going SW leads to C. So L is NE of C.
-
-Similarly, M is SE of C (since C is NW of M).
-
-Now, Ba is SE of L and NE of M.
-
-So SE of L would be further right and down from L.
-
-NE of M is up-right from M.
-
-So Ba is located to the right and down from L, but still right and up from M.
-
-Similarly, Mm is SE of L and NE of M, so same general area as Ba, but maybe different position.
-
-I is NW of Ba and NW of Mm.
-
-So I is left and up from both Ba and Mm.
-
-Now, to find objects north of C.
-
-C has L to its NE (since C is SW of L), so L is north of C.
-
-Also, I is NW of Ba and Mm. If I is north of C, depending on position.
-
-Wait, but according to my coordinate system, yes. But answer isn't there.
-
-Wait, maybe the error is in assuming I is north of C. Let's see.
-
-In my coordinate system, C is at (-1,1), I is at (1,2). So I is east and north of C. So yes, north of C.
-
-L is at (2,2), which is east and north of C.
-
-So both L and I are north of C.
-
-But answer options don't have 2. So either the question has a typo, or my analysis is wrong.
-
-Wait, let's check if "Craft Creation Station" is C, and whether there's another object I missed.
-
-Wait, the objects listed are:
-
-- Miniature Golf Mart
-
-- Lynx's Luxury Linens
-
-- Craft Creation Station
-
-- Basil's Bakery
-
-- Molly's Music Store
-
-- Iris's Ice Skates
-
-That's six objects. We've accounted for all.
-
-Wait, but let's check again the problem statement for any missed clues.
-
-Wait: "Craft Creation Station is to the Southwest of Lynx's Luxury Linens." So C is SW of L, meaning L is NE of C.
-
-"Basil's Bakery is to the Southeast of Lynx's Luxury Linens. Basil's Bakery is to the Northeast of Miniature Golf Mart."
-
-"Molly's Music Store is to the Northeast of Miniature Golf Mart. Molly's Music Store is to the Southeast of Lynx's Luxury Linens."
-
-So both Ba and Mm are SE of L and NE of M.
-
-"Iris's Ice Skates is to the Northwest of Basil's Bakery. Iris's Ice Skates is to the Northwest of Molly's Music Store."
-
-So I is NW of both Ba and Mm.
-
-Now, to find north of C.
-
-We have L is NE of C (since C is SW of L), so L is north of C.
-
-Is there any other object north of C?
-
-Let's consider the positions:
-
-- M is SE of C (since C is NW of M), so M is south-east of C.
-
-- Ba is SE of L, which is NE of C. So Ba's position relative to C: L is NE of C, Ba is SE of L, so Ba is east and south of L, which is NE of C. So Ba could be east of C or west? Let's see.
-
-If C is at some point, L is NE of C. Ba is SE of L, so Ba is east and south of L. So from C, going NE to L, then SE to Ba: so Ba would be east of C (since L is east of C, and Ba is east of L), and south of L but possibly north or south of C.
-
-In our coordinate system, C was at (-1,1), L at (2,2), Ba at (3,1). So Ba's y=1, same as C. So Ba is not north of C.
-
-Mm is at (3,0.5), also south of C.
-
-I is at (1,2), which is north of C (y=2 >1).
-
-So only L and I.
-
-Wait, unless... Wait, is there a mistake in considering I's position?
-
-Wait, "Iris's Ice Skates is to the Northwest of Basil's Bakery." If Basil's Bakery is at (3,1), then Northwest would be x <3, y >1. So I must be north of Ba's y-coordinate. Ba's y=1, so I's y >1.
-
-Similarly, I is NW of Mm, which is at y=0.5, so I's y >0.5, but the stricter condition is y >1.
-
-Therefore, I has y >1, so north of C (y=1).
-
-L is at y=2, north of C.
-
-So two objects. But since 2 isn't an option, maybe the question considers "north" as directly north, same x-coordinate.
-
-In that case, are there any objects with the same x-coordinate as C and y >1?
-
-C is at (-1,1). Is there any object at x=-1, y>1?
-
-In our coordinate system, no. L is at (2,2), I at (1,2), etc. So no objects directly north. But the question says "in the North", not "directly north", so that shouldn't matter.
-
-Wait, but maybe I assigned the coordinates incorrectly. Let's try another arrangement.
-
-Alternative coordinate system:
-
-Let me place C at (0,0) to see if that helps.
-
-Wait, but maybe better to use a different method.
-
-Let's create a grid.
-
-Assume Craft Creation Station (C) is at the origin for simplicity.
-
-We need to find objects north of C, so with higher y.
-
-Given:
-
-- C is NW of M, so M is SE of C.
-
-- C is SW of L, so L is NE of C.
-
-Therefore, L is NE of C, so L is north and east of C.
-
-So L is north of C.
-
-Now, what about other objects.
-
-Basil's Bakery (Ba): SE of L and NE of M.
-
-Since L is NE of C, SE of L would be further east and south from L.
-
-But M is SE of C (since C is NW of M), so M is east and south of C.
-
-Ba is NE of M, so Ba is north and east of M.
-
-So relative to C:
-
-- C is origin.
-
-- L is NE of C.
-
-- M is SE of C.
-
-- Ba is NE of M, so Ba is east and north of M, which is east and south of C. So Ba could be east of C, north of M (which is south of C), so Ba might be north of C or not.
-
-Wait, this is getting confusing. Let's try.
-
-Let’s define C at (0,0).
-
-L is NE of C, so L = (a,b), a>0, b>0.
-
-M is SE of C (since C is NW of M), so M = (c,-d), c>0, d>0.
-
-Ba is SE of L and NE of M.
-
-SE of L: (a+e, b-f), e>0, f>0.
-
-NE of M: (c+g, -d+h), g>0, h>0.
-
-So Ba must satisfy both.
-
-Also, since Ba is SE of L, its coordinates are east and south of L.
-
-And NE of M, so east and north of M.
-
-This might not help. Let's use the first coordinate system where the answer came to two, but since that's not an option, perhaps the error is elsewhere.
-
-Wait, going back to the problem statement:
-
-"Craft Creation Station is to the Southwest of Lynx's Luxury Linens."
-
-So C is SW of L → L is NE of C.
-
-"Basil's Bakery is to the Southeast of Lynx's Luxury Linens."
-
-Ba is SE of L.
-
-"Molly's Music Store is to the Southeast of Lynx's Luxury Linens."
-
-Mm is SE of L.
-
-So both Ba and Mm are SE of L, meaning they are south and east of L.
-
-L is NE of C.
-
-Therefore, Ba and Mm are SE of L, so they are south of L, which is north of C. So Ba and Mm could be north or south of C.
-
-Wait, if L is north of C, and Ba is south of L, Ba could still be north of C if it's not too far south.
-
-Similarly for Mm.
-
-In my coordinate system, L was at (2,2), C at (-1,1). So L is north-east of C.
-
-Ba at (3,1): y=1, same as C's y=1. So not north.
-
-But if Ba were at (3,1.5), then y=1.5 >1, so north of C.
-
-Ah! Here's the mistake. I assumed Ba's y-coordinate was 1, but it could be higher.
-
-Wait, let's revisit the placement of Ba.
-
-Basil's Bakery is SE of L and NE of M.
-
-If L is at (2,2), SE of L means x >2 and y <2.
-
-NE of M (0,0) means x >0, y >0.
-
-So Ba's y can be between 0 and 2. If we place Ba at y=1.5, then it would be north of C (which is at y=1).
-
-Similarly, Mm is SE of L (y <2), so could have y=1.5 as well.
-
-Wait, this changes things.
-
-Let's correct this.
-
-Let's redo coordinates with more care.
-
-Set M at (0,0).
-
-L is NE of M: let's say L=(2,3) [higher y to allow room].
-
-C is NW of M: C=(-1,2) [y=2, which is north of M].
-
-Check C is SW of L: L=(2,3), C=(-1,2). SW from L would be west and south: yes, to x=-1 (3 west), y=2 (1 south). So yes.
-
-Now:
-
-- C=(-1,2)
-
-- M=(0,0)
-
-- L=(2,3)
-
-Basil's Bakery (Ba) is SE of L and NE of M.
-
-SE of L: x >2, y <3.
-
-NE of M: x >0, y >0.
-
-So Ba could be at (3, 2.5), for example.
-
-Molly's Music Store (Mm) is also SE of L and NE of M, so similarly at (3, 1.5), for example.
-
-Now, Iris's Ice Skates (I) is NW of Ba and NW of Mm.
-
-NW of Ba=(3,2.5): x <3, y >2.5.
-
-NW of Mm=(3,1.5): x <3, y >1.5.
-
-So I must have y >2.5.
-
-Let's place I at (1, 3).
-
-Now, list all y-coordinates:
-
-- M: 0
-
-- L: 3
-
-- C: 2
-
-- Ba: 2.5
-
-- Mm: 1.5
-
-- I: 3
-
-Now, objects north of C (y >2):
-
-- L: y=3 >2
-
-- Ba: y=2.5 >2
-
-- I: y=3 >2
-
-So that's three objects: L, Ba, I.
-
-Ah! This changes things. So depending on where we place Ba and Mm, their y-coordinates could be above C's y.
-
-In my first coordinate system, I incorrectly placed Ba at y=1, which was south of C, but actually, since Ba is SE of L (which is at y=3), SE would have y <3, but could still be greater than C's y=2.
-
-So if C is at y=2, and Ba is at y=2.5, then Ba is north of C.
-
-Similarly, I is at y=3, north of C.
-
-L is at y=3, north of C.
-
-So three objects: L, Ba, I.
-
-That matches option C.3.
-
-So where did I go wrong initially? I placed Ba at y=1, which was too low. Since C is at y=2 (in this corrected system), Ba just needs to be south of L (y <3), but can still be north of C (y >2).
-
-Let's verify all conditions with this new coordinate system.
-
-M=(0,0)
-
-L=(2,3) [NE of M]
-
-C=(-1,2) [NW of M: yes, x=-1 <0, y=2 >0; SW of L: from L(2,3), SW to (-1,2) is west 3, south 1: correct]
-
-Ba=(3, 2.5) [SE of L: x=3>2, y=2.5<3; NE of M: x=3>0, y=2.5>0: correct]
-
-Mm=(3,1.5) [SE of L: yes; NE of M: yes]
-
-I=(1,3) [NW of Ba(3,2.5): x=1<3, y=3>2.5; NW of Mm(3,1.5): x=1<3, y=3>1.5: correct]
-
-Now, check y-coordinates:
-
-- C is at y=2.
-
-Objects north of C (y >2):
-
-- L: 3 >2
-
-- Ba: 2.5 >2
-
-- I: 3 >2
-
-That's three objects.
-
-Ba is at 2.5, which is north of C (2). So yes.
-
-So the answer should be C.3.
-
-My initial mistake was placing Ba too far south (y=1), but since Ba only needs to be south of L (y=3), it can be at y=2.5, which is still north of C (y=2).
-
-Similarly, if we consider that "south" from L means y < L's y, but L's y could be higher, allowing Ba to still be north of C.
-
-Let's confirm once more:
-
-C is at (-1,2). So y=2.
-
-Ba at (3,2.5): y=2.5 >2, so north of C.
-
-L at (2,3): north.
-
-I at (1,3): north.
-
-So three objects. That's option C.
-
-Therefore, the correct answer is C.3.
-</think>
-Let me solve this step by step using the structured format:
-<format>
->>> STEP 1: PARSE RELATIONSHIPS
-    - A is to the DIRECTION of B
-    [... list all given relationships ...]
-
->>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
-    - Looking for: [target relationship / direction / count]
-    - [reasoning about the relationships]
-    - [use reversibility and transitivity as needed]
-
->>> STEP 3: ANSWER
-    - [state conclusion]
-
->>> FINAL ANSWER: [answer text]
-    \boxed{LETTER}
-</format>
->>> STEP 1: PARSE RELATIONSHIPS
-- Miniature Golf Mart (M) is at a reference point.
-- Lynx's Luxury Linens (L) is NE of M.
-- Craft Creation Station (C) is NW of M.
-- C is SW of L.
-- Basil's Bakery (Ba) is SE of L and NE of M.
-- Molly's Music Store (Mm) is NE of M and SE of L.
-- Iris's Ice Skates (I) is NW of Ba and NW of Mm.
-
->>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
-Let’s assign coordinates to clarify:
-- Place M at (0,0).
-- L is NE of M ⇒ L = (2, 3).
-- C is NW of M ⇒ C = (-1, 2).
-- C is SW of L ⇒ From L(2,3), SW to C(-1,2) (west 3, south 1) ✅.
-- Ba is SE of L and NE of M ⇒ Ba = (3, 2.5) (x > 2, y < 3, y > 0).
-- Mm is SE of L and NE of M ⇒ Mm = (3, 1.5).
-- I is NW of Ba and NW of Mm ⇒ I = (1, 3) (x < 3, y > 2.5 and y > 1.5).
-
-Now, identify objects **north of C** (C is at y = 2):
-- L = (2, 3) ⇒ y = 3 > 2 ✅
-- Ba = (3, 2.5) ⇒ y = 2.5 > 2 ✅
-- I = (1, 3) ⇒ y = 3 > 2 ✅
-- M = (0, 0), Mm = (3, 1.5) ⇒ y ≤ 2 ❌
-
-Thus, **three objects** are north of C: L, Ba, and I.
-
->>> STEP 3: ANSWER
-The correct answer is **C. 3**.
-
->>> FINAL ANSWER: \boxed{C}
\ No newline at end of file
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_374.txt b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_374.txt
deleted file mode 100644
index 943bde0e..00000000
--- a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_374.txt
+++ /dev/null
@@ -1,296 +0,0 @@
-Okay, let's try to figure out the direction of Jane's Jazz Club relative to Ostrich's Oils. Hmm, first I need to map out all the given information. Let me list out each location and their positions step by step.
-
-Starting with Eagle's Eclectic Emporium. Let's call that E for short. Then, Toucan's Tea is to the Southwest of E. So if E is the center, Toucan's Tea (T) is Southwest of E. Then Jane's Jazz Club (J) is to the Northwest of Toucan's Tea. Also, Jane's Jazz Club is to the Northwest of Eagle's Eclectic Emporium. Wait, so J is Northwest of both T and E? Wait, but T is Southwest of E, so Northwest of E would be a different direction. Let me visualize this.
-
-Let me sketch a coordinate system. Let's assume E is at the origin (0,0). Directions: North is up, East is right, South is down, West is left.
-
-Southwest of E would be down and left. So T is at, say, (-1, -1) if E is (0,0). Then Jane's Jazz Club is Northwest of T. Northwest of T would be up and left from T. So if T is at (-1, -1), then Northwest would be (-2, 0) or something. But also, J is Northwest of E. Northwest of E (0,0) would be (-1, 1). Wait, but the problem says both: J is Northwest of T and Northwest of E. So maybe J is Northwest of E, which would place it at (-x, y), but also Northwest of T. Let's see.
-
-Wait, maybe I should use relative directions step by step.
-
-First, E is the reference. Let's fix E's position. Then:
-
-- Toucan's Tea (T) is Southwest of E. So from E, go SW to get to T.
-
-- Jane's Jazz Club (J) is Northwest of T. So from T, go NW to get to J.
-
-- Also, J is Northwest of E. So from E, go NW to get to J. Wait, so both conditions must hold. So J must be NW of E, and also NW of T. But T is SW of E. So if E is center, T is SW, then NW of T would be towards the center or further NW?
-
-Wait, maybe it's better to draw a grid. Let's assign coordinates.
-
-Let me set E at (0,0).
-
-Southwest direction from E would be negative x and negative y. Let's assume each step is one unit. So T is at (-1, -1) if we consider SW as 45 degrees, but maybe for simplicity, just using the four cardinal directions. Wait, but Southwest is a diagonal, but in grid terms, maybe it's considered as both South and West. Hmm, but maybe the problem uses the standard compass directions where Southwest is a separate direction, not just South and West.
-
-Wait, the problem says "to the Southwest", so it's a single direction, not South and then West. So in terms of coordinates, Southwest would be a direction that's both South and West, so if E is at (0,0), then T is somewhere in the SW quadrant. Similarly, Northwest would be NW quadrant.
-
-But to make it easier, perhaps we can use a grid where:
-
-- North: +y
-
-- East: +x
-
-- South: -y
-
-- West: -x
-
-So Southwest would be -x and -y, Northwest would be -x and +y.
-
-So E is at (0,0).
-
-T is Southwest of E, so T's coordinates are (negative x, negative y). Let's say T is at (-1, -1).
-
-Now, Jane's Jazz Club (J) is Northwest of Toucan's Tea (T). So from T, Northwest direction. Northwest from T would be towards -x and +y. So if T is at (-1, -1), then moving Northwest would be to (-2, 0), for example. But also, J is Northwest of E. Northwest of E (0,0) would be (-x, +y). So J must be in the Northwest quadrant relative to E. So J's coordinates should be (-a, +b) where a and b are positive.
-
-So combining both: J is Northwest of T and Northwest of E.
-
-Let me see. If T is at (-1, -1), then Northwest of T would be moving West and North. So from T's position, going West (more negative x) and North (more positive y). So J could be at (-2, 0), which is Northwest of T (-1, -1), because from T, moving West (x decreases) and North (y increases). Also, (-2, 0) is Northwest of E (0,0)? Wait, Northwest of E would be a point with negative x and positive y. (-2, 0) is on the negative x-axis, so that's West, not Northwest. Hmm, maybe I need to adjust.
-
-Wait, maybe coordinates are better with both x and y positive in NW. Wait, maybe better to think in terms of relative positions.
-
-Alternatively, maybe using a different approach. Let's list all the given relations:
-
-1. T is SW of E.
-
-2. J is NW of T.
-
-3. J is NW of E.
-
-4. O is NW of T. (Ostrich's Oils, O)
-
-5. O is NW of J.
-
-6. M (Marina Market) is SW of E and SW of T.
-
-7. I (Iris's Ice Skates) is SW of E and NW of T.
-
-Wait, the problem says:
-
-"Ostrich's Oils is to the Northwest of Toucan's Tea. Ostrich's Oils is to the Northwest of Jane's Jazz Club."
-
-So O is NW of T and NW of J.
-
-Also, "Iris's Ice Skates is to the Southwest of Eagle's Eclectic Emporium. Iris's Ice Skates is to the Northwest of Toucan's Tea."
-
-So I is SW of E and NW of T.
-
-Wait, this is a bit confusing. Let's parse all the statements:
-
-- Eagle's Eclectic Emporium (E) is the reference.
-
-- Toucan's Tea (T) is SW of E.
-
-- Jane's Jazz Club (J) is NW of T.
-
-- J is NW of E.
-
-- Ostrich's Oils (O) is NW of T.
-
-- O is NW of J.
-
-- Marina Market (M) is SW of E and SW of T.
-
-- Iris's Ice Skates (I) is SW of E and NW of T.
-
-Wait, so I is SW of E and also NW of T. But T is SW of E. So if T is SW of E, then NW of T would be towards E and beyond. Let's try to place these.
-
-Let me try to create a mental map.
-
-Assume E is in the center.
-
-- T is SW of E.
-
-- J is NW of T and also NW of E. So since T is SW of E, moving NW from T would get towards E's NW direction. So J is NW of E, which is consistent.
-
-- O is NW of T and NW of J. So O is further NW than J.
-
-Wait, if O is NW of J, then J is SE of O.
-
-Wait, the question is: In which direction is Jane's Jazz Club relative to Ostrich's Oils? So we need to find J's direction from O.
-
-If O is NW of J, then J is SE of O. So the direction from O to J is Southeast. Wait, but the question is "Jane's Jazz Club relative to Ostrich's Oils", meaning from O's perspective, where is J? If O is NW of J, then J is SE of O. So the direction would be Southeast. But let's confirm.
-
-Wait, "Ostrich's Oils is to the Northwest of Jane's Jazz Club" means O is NW of J. Therefore, J is SE of O. So the direction of J relative to O is Southeast. So the answer would be A. Southeast.
-
-But let's make sure by mapping all points.
-
-Let me try to assign coordinates.
-
-Let’s place E at (0,0).
-
-T is SW of E. Let's assign T to (-1, -1).
-
-Now, J is NW of T. So from T (-1,-1), NW would be towards decreasing x and increasing y. Let's say J is at (-2, 0). Wait, but J must also be NW of E. NW of E (0,0) would be in the second quadrant (x negative, y positive). So (-2, 0) is on the negative x-axis, which is West, not Northwest. Hmm, maybe better to have J at (-1, 1), which is NW of E. Let's check if J (-1,1) is NW of T (-1,-1). From T (-1,-1), moving North (up) to (-1,1) would be directly North, not Northwest. Northwest would require moving West and North. So to go NW from T, you need to decrease x and increase y. So if T is at (-1,-1), then moving NW would go to (-2, 0), for example. But then J at (-2, 0) is NW of T, but is it NW of E?
-
-NW of E (0,0) would be x negative and y positive. (-2, 0) has y=0, so it's West, not Northwest. So maybe the problem doesn't require strict diagonals, just the general direction. Maybe the problem uses "Northwest" to mean North and West, but not necessarily both. Wait, but in standard terms, Northwest is a specific direction between North and West.
-
-This is getting confusing. Maybe another approach. Let's use relative positions without coordinates.
-
-We have:
-
-- E is the main point.
-
-- T is SW of E.
-
-- J is NW of T and also NW of E. So J is in the NW direction from E, and also NW of T. Since T is SW of E, the NW direction from T would overlap with the NW of E. So J is located NW of E, and T is SW of E, so J is above and to the left of E, T is below and to the left of E.
-
-- O is NW of T and NW of J. So O is further NW than J.
-
-Wait, if O is NW of J, then O is more to the Northwest than J. So the order from SW to NE would be E, T (SW of E), then J (NW of E and NW of T), then O (NW of J). Wait, but O is NW of J, so O is further NW than J.
-
-Wait, but J is NW of E, so O is even further NW than J.
-
-Now, the question is: J relative to O. If O is NW of J, then J is SE of O. So direction is Southeast. So answer A.
-
-But let's check the other points to make sure.
-
-Marina Market (M) is SW of E and SW of T. Since T is SW of E, SW of T would be further SW. So M is SW of both.
-
-Iris's Ice Skates (I) is SW of E and NW of T. Wait, I is SW of E, so like T is SW of E, but I is also SW of E. But I is NW of T. So T is SW of E, so NW of T would be towards E. So I is SW of E (so like, between E and T?), but NW of T. Wait, this might be a bit conflicting.
-
-Wait, if T is SW of E, then NW of T would be towards E's direction. So if I is NW of T, then I is closer to E than T is. But I is also SW of E. Wait, that seems contradictory. Wait, SW of E would be in the SW direction from E, but NW of T (which is SW of E) would be towards the North from T, so towards E's direction.
-
-Wait, maybe I is located such that from E, it's SW, and from T, it's NW. So imagine E at center. T is SW of E. I is SW of E, so also in SW direction from E, but closer to E than T? Or further?
-
-Wait, if T is SW of E, then to be NW of T, you have to go North and West from T. But if I is SW of E, then it's South and West from E. Hmm. Let's say E is (0,0), T is at (-2, -2) (SW). Then NW of T would be (-3, -1), but that's still SW of E. Wait, maybe coordinates are better.
-
-Let me try again with coordinates.
-
-Set E at (0,0).
-
-Define:
-
-- South: negative y
-
-- North: positive y
-
-- West: negative x
-
-- East: positive x
-
-So:
-
-- SW: negative x and negative y.
-
-- NW: negative x and positive y.
-
-So:
-
-T is SW of E: T = (-a, -b), where a,b >0.
-
-J is NW of T: so from T, go NW, so subtract from x (more west) and add to y (more north). So J = (-a - c, -b + d), where c,d >0.
-
-But J is also NW of E: so J must have negative x and positive y. So -a -c <0 (which it is, since a,c>0) and -b + d >0 → d > b. So J's y-coordinate is positive.
-
-Similarly, O is NW of T and NW of J.
-
-O is NW of T: so O = (-a - e, -b + f), e,f >0.
-
-O is NW of J: so O must be more NW than J. So O's x < J's x (more west), and O's y > J's y (more north).
-
-So if J is at (-a -c, -b + d), then O is at (-a -c - e, -b + d + f), e,f >0.
-
-Therefore, O is further NW than J.
-
-Therefore, J is to the SE of O.
-
-Hence, Jane's Jazz Club (J) is Southeast of Ostrich's Oils (O). So the direction is Southeast, answer A.
-
-Wait, but let's check the other points to confirm.
-
-Iris's Ice Skates (I) is SW of E and NW of T.
-
-SW of E: so I = (-g, -h), g,h >0.
-
-NW of T: T is (-a, -b), so NW of T would be (-a - i, -b + j), i,j >0.
-
-So I must equal (-a -i, -b + j). But also, I is SW of E: (-g, -h), so -g = -a -i → g = a +i, and -h = -b + j → h = b - j.
-
-Since h >0 (as it's SW of E), then b - j >0 → j < b.
-
-So j is positive but less than b. So I's y-coordinate is -h = -(b - j) = -b + j, which is greater than -b (since j >0), so I is North of T (since T's y is -b, I's y is -b + j > -b), which makes sense for NW of T (since NW from T would be North and West).
-
-So I is located NW of T (which is SW of E), so I is between T and E in the NW direction, but still SW of E.
-
-But maybe this detail isn't necessary for the question.
-
-Back to the main question: J relative to O.
-
-Since O is NW of J, J is SE of O. So answer A. Southeast.
-
-Wait, but let's make sure there's no mistake here.
-
-The problem states: "Ostrich's Oils is to the Northwest of Jane's Jazz Club." So O is NW of J → J is SE of O. Yes, that's correct.
-
-Therefore, the direction from O to J is Southeast. So Jane's Jazz Club is Southeast of Ostrich's Oils. So the answer should be A.
-
-Wait, but let's check the options:
-
-A. Southeast
-
-B. Northwest
-
-C. Southwest
-
-D. Northeast
-
-Yes, A is Southeast.
-
-But wait, another way to think: If O is NW of J, then to get from O to J, you go SE. So J is SE of O. So the direction of J relative to O is Southeast. So answer A.
-
-I think that's correct. Let me just make sure there's no other interpretation.
-
-Wait, the problem says "Ostrich's Oils is to the Northwest of Jane's Jazz Club." So O is NW of J. So J is SE of O. So the answer is A.
-
-Yes, that seems right. I don't see any contradictions in the other points affecting this. The other locations (M, I) are probably there to distract or for other questions, but for this specific question, the key is O is NW of J, so J is SE of O.
-</think>
-Let me solve this step by step using the structured format:
-<format>
->>> STEP 1: PARSE RELATIONSHIPS
-    - A is to the DIRECTION of B
-    [... list all given relationships ...]
-
->>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
-    - Looking for: [target relationship / direction / count]
-    - [reasoning about the relationships]
-    - [use reversibility and transitivity as needed]
-
->>> STEP 3: ANSWER
-    - [state conclusion]
-
->>> FINAL ANSWER: [answer text]
-    \boxed{LETTER}
-</format>
->>> STEP 1: PARSE RELATIONSHIPS
-- Eagle's Eclectic Emporium (E)
-- Toucan's Tea (T) is SW of E
-- Jane's Jazz Club (J) is NW of T
-- J is NW of E
-- Ostrich's Oils (O) is NW of T
-- O is NW of J
-- Marina Market (M) is SW of E and SW of T
-- Iris's Ice Skates (I) is SW of E and NW of T
-
->>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
-We're asked: **In which direction is Jane's Jazz Club (J) relative to Ostrich's Oils (O)?**
-
-From the given:
-> "Ostrich's Oils is to the Northwest of Jane's Jazz Club."
-
-This means **O is NW of J**.
-
-In directional terms:
-- If O is **NW of J**, then J is **SE of O**.
-
-This is a direct reversal of the stated relationship.
-
-Let's verify using transitivity and spatial logic:
-- O is NW of J → J is SE of O.
-
-No other relationships contradict this, as the other points (T, E, M, I) are not relevant to the J-O relationship.
-
->>> STEP 3: ANSWER
-The direction of Jane's Jazz Club **relative to Ostrich's Oils** is **Southeast**.
-
->>> FINAL ANSWER: 
-\boxed{A}
\ No newline at end of file
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_749.txt b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_749.txt
deleted file mode 100644
index 82e2f2c9..00000000
--- a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/Reasoning_output/reason_749.txt
+++ /dev/null
@@ -1,622 +0,0 @@
-First, I need to figure out the positions of all these objects based on the given directions. The question is: "Which object is in the Northwest of Ocelot's Office Ornaments?" with options A. Cheetah's Chocolates, B. Frost Funhouse, C. Cobbler's Corner, D. Miniature Golf Mart.
-
-I should start by listing all the objects mentioned:
-
-- Frost Funhouse
-
-- Ocelot's Office Ornaments
-
-- Miniature Golf Mart
-
-- Lighthouse Luminary
-
-- Cobbler's Corner
-
-- Cheetah's Chocolates
-
-Now, let's parse the directional clues one by one.
-
-1. "Ocelot's Office Ornaments is to the Northeast of Frost Funhouse."
-
-   So, Ocelot's Office Ornaments (let's call it OO for short) is northeast of Frost Funhouse (FF). That means FF is southwest of OO.
-
-2. "Miniature Golf Mart is to the Northwest of Ocelot's Office Ornaments."
-
-   Miniature Golf Mart (MGM) is northwest of OO. So, OO is southeast of MGM.
-
-3. "Miniature Golf Mart is to the Northwest of Frost Funhouse."
-
-   MGM is northwest of FF. So, FF is southeast of MGM.
-
-4. "Lighthouse Luminary is to the Northeast of Frost Funhouse."
-
-   Lighthouse Luminary (LL) is northeast of FF. So, FF is southwest of LL.
-
-5. "Lighthouse Luminary is to the Northeast of Miniature Golf Mart."
-
-   LL is northeast of MGM. So, MGM is southwest of LL.
-
-6. "Cobbler's Corner is to the Southeast of Miniature Golf Mart."
-
-   Cobbler's Corner (CC) is southeast of MGM. So, MGM is northwest of CC.
-
-7. "Cobbler's Corner is to the Southwest of Lighthouse Luminary."
-
-   CC is southwest of LL. So, LL is northeast of CC.
-
-8. "Cheetah's Chocolates is to the Northeast of Ocelot's Office Ornaments."
-
-   Cheetah's Chocolates (CCO) is northeast of OO. So, OO is southwest of CCO.
-
-9. "Cheetah's Chocolates is to the Southeast of Miniature Golf Mart."
-
-   CCO is southeast of MGM. So, MGM is northwest of CCO.
-
-Now, I need to find which object is northwest of OO. So, I need to find something that is directly northwest of OO.
-
-From clue 2, MGM is northwest of OO. So, Miniature Golf Mart is northwest of Ocelot's Office Ornaments.
-
-But let me confirm if there are others or if this is consistent.
-
-Also, option D is Miniature Golf Mart, so that might be it.
-
-But I should make sure about the positions.
-
-Let me try to sketch a small grid mentally.
-
-Assume Frost Funhouse is at some point. Let's set FF at the origin for simplicity.
-
-Set FF at (0,0).
-
-Now, from clue 1: OO is northeast of FF. Northeast means both x and y increase, so let's say OO is at (1,1) for simplicity, but since it's relative, I can use directions.
-
-To make it easier, I'll use a coordinate system where north is positive y, east is positive x.
-
-So, let FF be at (0,0).
-
-- OO is northeast of FF, so OO has higher x and higher y. Say OO is at (a,b) with a>0, b>0.
-
-But I don't need exact values, just relative positions.
-
-From clue 3: MGM is northwest of FF.
-
-Northwest means lower x and higher y? Let's define:
-
-- North: increase y
-
-- East: increase x
-
-- South: decrease y
-
-- West: decrease x
-
-So:
-
-- Northeast: increase x, increase y
-
-- Northwest: decrease x, increase y
-
-- Southeast: increase x, decrease y
-
-- Southwest: decrease x, decrease y
-
-FF at (0,0)
-
-Clue 1: OO is northeast of FF → OO has x > 0, y > 0
-
-Clue 3: MGM is northwest of FF → MGM has x < 0, y > 0 (since northwest: west is negative x, north is positive y)
-
-Clue 2: MGM is northwest of OO → so from OO, MGM is west and north? Northwest of OO means from OO, moving northwest to get to MGM.
-
-If MGM is northwest of OO, that means MGM is west and north of OO.
-
-So, if OO is at (x_o, y_o), then MGM is at (x_o - dx, y_o + dy) for some dx, dy >0.
-
-Similarly, from clue 3, MGM is northwest of FF, so if FF is at (0,0), MGM is at (x_m, y_m) with x_m < 0, y_m > 0.
-
-Now, clue 4: LL is northeast of FF → LL has x > 0, y > 0
-
-Clue 5: LL is northeast of MGM → so LL has higher x and higher y than MGM.
-
-Clue 6: CC is southeast of MGM → so CC has higher x and lower y than MGM.
-
-Clue 7: CC is southwest of LL → so CC has lower x and lower y than LL.
-
-Clue 8: CCO is northeast of OO → so CCO has higher x and higher y than OO.
-
-Clue 9: CCO is southeast of MGM → so CCO has higher x and lower y than MGM.
-
-Now, I need to find what is northwest of OO.
-
-From clue 2, MGM is northwest of OO, so MGM should be northwest of OO.
-
-But let me see if this is consistent with other clues.
-
-Also, option D is Miniature Golf Mart, which is MGM.
-
-But let me check if there's any other possibility.
-
-The question is "which object is in the Northwest of Ocelot's Office Ornaments", and clue 2 directly says MGM is northwest of OO.
-
-So, it should be D. Miniature Golf Mart.
-
-But let me make sure about the other options.
-
-Option A is Cheetah's Chocolates. Clue 8 says CCO is northeast of OO, so it's not northwest; it's northeast.
-
-Option B is Frost Funhouse. From clue 1, OO is northeast of FF, so FF is southwest of OO, not northwest.
-
-Option C is Cobbler's Corner. Now, where is CC relative to OO?
-
-I need to find the position of CC.
-
-From clue 6: CC is southeast of MGM.
-
-From clue 7: CC is southwest of LL.
-
-But I need relation to OO.
-
-Also, from clue 5, LL is northeast of MGM.
-
-And clue 4, LL is northeast of FF.
-
-Now, let me try to place all points.
-
-Set FF at (0,0).
-
-Since MGM is northwest of FF, let's say MGM is at (-1, 1) for simplicity. (x negative, y positive)
-
-Now, OO is northeast of FF, so OO has x>0, y>0. But from clue 2, MGM is northwest of OO.
-
-MGM is at (-1,1), and it's northwest of OO, so OO must be southeast of MGM.
-
-Since MGM is northwest of OO, OO is southeast of MGM.
-
-So, if MGM is at (-1,1), then OO is southeast, so OO has x > -1 and y < 1? Southeast means increase x, decrease y.
-
-So, OO should be at (x_o, y_o) with x_o > -1 (since east of MGM), and y_o < 1 (since south of MGM).
-
-But also, from clue 1, OO is northeast of FF, and FF is at (0,0), so OO has x_o > 0, y_o > 0.
-
-So, OO has x_o > 0, y_o > 0, and since it's southeast of MGM at (-1,1), x_o > -1 (which is true since x_o >0), and y_o < 1.
-
-y_o < 1 and y_o > 0.
-
-Now, clue 4: LL is northeast of FF, so LL has x >0, y>0.
-
-Clue 5: LL is northeast of MGM.
-
-MGM is at (-1,1), so LL is northeast of MGM, meaning x_ll > -1, y_ll > 1.
-
-But since LL is northeast of FF, x_ll >0, y_ll >0, so y_ll >1 is consistent.
-
-Now, clue 6: CC is southeast of MGM.
-
-MGM at (-1,1), so CC is southeast, so x_cc > -1, y_cc < 1.
-
-Clue 7: CC is southwest of LL.
-
-So, CC has x_cc < x_ll, y_cc < y_ll.
-
-Now, clue 8: CCO is northeast of OO.
-
-OO is at (x_o, y_o) with x_o >0, y_o <1, y_o >0.
-
-CCO has x_cco > x_o, y_cco > y_o.
-
-Clue 9: CCO is southeast of MGM.
-
-MGM at (-1,1), so CCO is southeast of MGM, meaning x_cco > -1, y_cco < 1.
-
-Now, back to finding northwest of OO.
-
-We have MGM at (-1,1), OO at (x_o, y_o) with x_o >0, y_o <1.
-
-Since MGM is northwest of OO, and OO is southeast of MGM, the vector from OO to MGM is northwest, so from OO, going northwest gets to MGM.
-
-So, MGM is directly northwest of OO, as per clue 2.
-
-Now, is there any other object that could be northwest of OO?
-
-For example, Frost Funhouse: FF is at (0,0). OO is northeast of FF, so FF is southwest of OO, not northwest.
-
-Cheetah's Chocolates: CCO is northeast of OO, so it's not northwest.
-
-Cobbler's Corner: CC is southeast of MGM, and MGM is northwest of OO, so CC might be southeast of OO or something.
-
-Let's see the position.
-
-MGM is northwest of OO.
-
-CC is southeast of MGM.
-
-Since MGM is northwest of OO, and CC is southeast of MGM, then CC could be southwest of OO or something.
-
-For example, from OO, to go to MGM, you go northwest.
-
-Then from MGM to CC, you go southeast.
-
-So, from OO to CC, it might not be direct.
-
-But specifically, is CC northwest of OO? Probably not.
-
-Let's assign coordinates.
-
-Set FF at (0,0).
-
-Set MGM at (-1,1) as I thought.
-
-Now, OO is southeast of MGM, and northeast of FF.
-
-So, OO must have x > -1 and y < 1 (from being southeast of MGM), and x >0, y>0 (from northeast of FF).
-
-So, let's say OO is at (0.5, 0.5) for example. (x=0.5>0, y=0.5<1 and >0)
-
-Now, LL is northeast of FF, so x>0, y>0, and northeast of MGM at (-1,1), so x > -1, y >1.
-
-Also, since LL is northeast of FF, but FF is (0,0), so y>0 is fine, but y>1 from above.
-
-So LL has y>1.
-
-Say LL is at (1,2), for example.
-
-Now, CC is southeast of MGM. MGM at (-1,1), so southeast: x > -1, y <1.
-
-Also, CC is southwest of LL. LL at (1,2), so southwest means x <1, y<2.
-
-Now, CC has y<1 from above, and y<2 is automatic since y<1<2.
-
-x > -1 and x <1.
-
-Also, from clue 7, CC is southwest of LL, which is consistent.
-
-Now, CCO: northeast of OO.
-
-OO at (0.5,0.5), so CCO has x >0.5, y>0.5.
-
-Also, southeast of MGM: MGM at (-1,1), so CCO has x > -1, y <1.
-
-So CCO has x >0.5, y>0.5, and y<1.
-
-Say CCO at (0.6, 0.6) for example.
-
-Now, the question: which is northwest of OO.
-
-OO is at (0.5,0.5)
-
-Northwest of OO would be points with x < 0.5 and y > 0.5.
-
-Now, look at the objects:
-
-- FF: (0,0) — x=0 <0.5, y=0 <0.5? y=0 <0.5, but northwest requires y>0.5, so y=0 <0.5, not north, it's south. So not northwest.
-
-- OO itself: not.
-
-- MGM: (-1,1) — x=-1 <0.5, y=1 >0.5, so yes, northwest.
-
-- LL: (1,2) — x=1 >0.5, y=2>0.5, so northeast, not northwest.
-
-- CC: let's say we have it. CC is southeast of MGM, so x > -1, y<1. And from above, x<1, y<1.
-
-For example, CC could be at (0,0.5), but y=0.5, which is not greater than 0.5 for northwest? Northwest of OO: y must be >0.5.
-
-If CC is at (0,0.5), y=0.5 not greater than 0.5, so not north, it's same latitude or south.
-
-But typically, "northwest" implies strictly northwest, so y > y_o and x < x_o.
-
-Similarly, in directions, it's usually not the same point.
-
-So for OO at (0.5,0.5), northwest: x < 0.5 and y > 0.5.
-
-CC: y <1, but y could be less than or equal to 0.5.
-
-For example, if CC is at (0,0.4), y=0.4 <0.5, so south, not north.
-
-If CC is at (0,0.6), but y<1, and y>0.5? But from clue, CC is southeast of MGM, MGM y=1, so y_cc <1, but it could be above or below 0.5.
-
-In my coordinate, OO y=0.5, so for CC to be northwest of OO, it needs y>0.5.
-
-But CC is southeast of MGM at y=1, so y_cc <1, but it could be 0.6, for example.
-
-However, from clue 7, CC is southwest of LL.
-
-LL is at (1,2), so y_cc <2, which is true, but also, since LL is northeast of MGM, and MGM y=1, LL y>1, so y_ll >1.
-
-CC is southwest of LL, so y_cc < y_ll, which is true since y_cc <1 < y_ll if y_ll>1.
-
-But y_cc could be, say, 0.6, which is greater than 0.5.
-
-But is it necessarily northwest of OO?
-
-Not necessarily, because OO is at (0.5,0.5), and CC could be at (0.4, 0.6), for example, which would be northwest of OO? x=0.4<0.5, y=0.6>0.5, so yes, it could be.
-
-But in this case, CC might be northwest of OO, but is it?
-
-In my example, if CC is at (0.4,0.6), and OO at (0.5,0.5), then yes, CC is northwest of OO.
-
-But is that consistent with all clues?
-
-Let me check the clues.
-
-We have clue 6: CC is southeast of MGM.
-
-MGM at (-1,1), CC at (0.4,0.6): x=0.4 > -1 (east), y=0.6 <1 (south), so southeast, yes.
-
-Clue 7: CC is southwest of LL. LL at (1,2), CC at (0.4,0.6): x=0.4<1, y=0.6<2, so southwest, yes.
-
-Now, is CC northwest of OO? OO at (0.5,0.5), CC at (0.4,0.6): x=0.4<0.5, y=0.6>0.5, so yes, northwest.
-
-But clue 2 says MGM is northwest of OO, and MGM is at (-1,1), which is also northwest.
-
-So both MGM and CC could be northwest of OO, but that can't be right for the question, as it should be one answer.
-
-Perhaps I have a mistake.
-
-In this setup, MGM is at (-1,1), which is more northwest than CC.
-
-But the question is "which object is in the Northwest", implying there might be one, or we need to see the direct one.
-
-But clue 2 directly states that MGM is northwest of OO, so it should be MGM.
-
-In my coordinate, CC is also northwest of OO in this example, but is CC necessarily northwest?
-
-Let me see the relative positions.
-
-From the clues, is there a constraint that prevents CC from being northwest of OO?
-
-For example, from clue 9: CCO is southeast of MGM.
-
-CCO is at (0.6,0.6) in my example.
-
-CC is at (0.4,0.6), CCO at (0.6,0.6), so same y, but CCO is southeast of MGM, which is fine.
-
-But for CC, it might not be directly northwest of OO; it depends on positions.
-
-However, the question is based on the given directions, and clue 2 explicitly says MGM is northwest of OO, so that should be the answer.
-
-Moreover, in the options, D is Miniature Golf Mart, which is MGM.
-
-Now, let's see why CC might not be northwest.
-
-Perhaps from the other clues.
-
-Another clue: "Cobbler's Corner is to the Southwest of Lighthouse Luminary." and "Lighthouse Luminary is to the Northeast of Miniature Golf Mart."
-
-But I think I have that.
-
-Perhaps I should list all the relations.
-
-Let me make a directed graph or something.
-
-From the clues:
-
-- OO is NE of FF → FF is SW of OO
-
-- MGM is NW of OO → OO is SE of MGM
-
-- MGM is NW of FF → FF is SE of MGM
-
-- LL is NE of FF → FF is SW of LL
-
-- LL is NE of MGM → MGM is SW of LL
-
-- CC is SE of MGM → MGM is NW of CC
-
-- CC is SW of LL → LL is NE of CC
-
-- CCO is NE of OO → OO is SW of CCO
-
-- CCO is SE of MGM → MGM is NW of CCO
-
-Now, for "northwest of OO", we need something that is NW of OO.
-
-From above, MGM is NW of OO, directly.
-
-Is there another?
-
-For example, is FF NW of OO? But FF is SW of OO, from clue 1.
-
-CC: CC is SE of MGM, and MGM is NW of OO, so CC is SE of something that is NW of OO.
-
-If MGM is NW of OO, then SE of MGM would be, relative to OO, it could be south or east, but not northwest.
-
-For instance, if OO is at origin, MGM is at (-1,1), then SE of MGM would be, say, (0,0) or (0,0.5), which for OO at (0,0), SE of MGM at (-1,1) is (x> -1, y<1), so for OO at (0,0), points like (0.5,0.5) would be NE of OO, not NW.
-
-In my earlier coordinate, OO at (0.5,0.5), MGM at (-1,1), SE of MGM: x> -1, y<1, so for example (0,0.4) or (0.4,0.6).
-
-(0.4,0.6) compared to OO (0.5,0.5): x=0.4<0.5, y=0.6>0.5, so NW.
-
-But is (0.4,0.6) directly NW? Yes, but is it necessarily the case?
-
-The issue is that the position of CC is not fixed; it could be at different places.
-
-For example, CC could be at (0.2, 0.4), which has y=0.4 < 0.5, so south of OO, not north.
-
-Or at (0.4, 0.6), which is north and west.
-
-But the clue doesn't specify that CC must be north of OO; it could be south.
-
-Similarly, for the question, we need to see what is definitely northwest.
-
-But clue 2 says MGM is northwest of OO, so it is definitely northwest.
-
-For CC, it might or might not be northwest, depending on positions.
-
-For example, if CC is at (0.2, 0.4), then it is southeast of OO, since x=0.2<0.5, y=0.4<0.5, so southwest.
-
-If at (0.4,0.6), it is northwest.
-
-But the clue doesn't constrain it to be north of OO, so it could be south.
-
-Therefore, CC is not necessarily northwest of OO; it could be in other directions.
-
-Whereas MGM is explicitly northwest of OO.
-
-Similarly, CCO is northeast of OO, so definitely not northwest.
-
-FF is southwest, as established.
-
-So only MGM is directly stated to be northwest of OO.
-
-Moreover, in the options, D is Miniature Golf Mart.
-
-But let me check the answer choices.
-
-Options:
-
-A. Cheetah's Chocolates — this is northeast, from clue 8.
-
-B. Frost Funhouse — southwest, from clue 1.
-
-C. Cobbler's Corner — could be various, but not necessarily northwest.
-
-D. Miniature Golf Mart — northwest, from clue 2.
-
-Also, the clue says "Miniature Golf Mart is to the Northwest of Ocelot's Office Ornaments." directly.
-
-So, it should be D.
-
-But in the problem, it says "Miniature Golf Mart is to the Northwest of Ocelot's Office Ornaments." which is clue 2.
-
-And the question is "which object is in the Northwest of Ocelot's Office Ornaments?"
-
-So, it's directly given.
-
-Perhaps I misread the clues.
-
-Let me read the original text.
-
-"Ocelot's Office Ornaments is to the Northeast of Frost Funhouse. Miniature Golf Mart is to the Northwest of Ocelot's Office Ornaments. Miniature Golf Mart is to the Northwest of Frost Funhouse."
-
-Yes, so Miniature Golf Mart is northwest of Ocelot's Office Ornaments.
-
-So, the object northwest of OO is Miniature Golf Mart.
-
-Therefore, the answer should be D.
-
-But why are there other clues? Maybe to distract or to confirm.
-
-For example, "Miniature Golf Mart is to the Northwest of Frost Funhouse" is consistent, as we have.
-
-And the other clues about Lighthouse and Cobbler's might be for other questions, but not relevant here.
-
-Similarly, Cheetah's Chocolates is northeast, not northwest.
-
-So, I think D is correct.
-
-But let me see if there's a trick.
-
-Another thing: the question says "in the Northwest", and "to the Northwest" might be interpreted as the direction, but in context, it's clear.
-
-Perhaps for Cobbler's Corner, but no.
-
-Let me list all the "is to the" statements.
-
-- OO is NE of FF
-
-- MGM is NW of OO
-
-- MGM is NW of FF
-
-- LL is NE of FF
-
-- LL is NE of MGM
-
-- CC is SE of MGM
-
-- CC is SW of LL
-
-- CCO is NE of OO
-
-- CCO is SE of MGM
-
-Now, for OO, the objects relative to it:
-
-- FF: SW (since OO is NE of FF)
-
-- MGM: NW (directly)
-
-- LL: ? Not directly given, but from LL is NE of FF and FF is SW of OO, so LL could be NE or something.
-
-For example, if FF is SW of OO, and LL is NE of FF, then LL could be NE of OO or not, depending.
-
-But not relevant.
-
-- CC: not directly given relative to OO.
-
-- CCO: NE
-
-So only MGM is specified as NW of OO.
-
-Therefore, the answer must be Miniature Golf Mart.
-
-So, option D.
-
-But in the options, it's listed as D. Miniature Golf Mart.
-
-Now, just to be thorough, let's see if from the clues, CC could be NW, but as I thought, it's not necessary.
-
-For example, from LL is NE of MGM, and CC is SW of LL, so CC is SW of LL, and LL is NE of MGM, so CC is not necessarily related to OO directly.
-
-But in terms of direction, since MGM is NW of OO, and CC is SE of MGM, then CC is generally south-east of OO, not north-west.
-
-For instance, if you go from OO to MGM (NW), then from MGM to CC (SE), so from OO to CC, you go NW then SE, which might cancel out, but typically, it would be roughly in the same area or south-east.
-
-In vector terms, if OO to MGM is vector (-a,b) for a>0,b>0, then MGM to CC is vector (c,-d) for c>0,d>0, so OO to CC is (-a+c, b-d), which could have x negative or positive, y positive or negative, so it could be NW, NE, SE, SW, depending on values.
-
-But in the context of the map, with the given clues, it might not be specified, but since the question has a specific answer, and MGM is directly given, it should be D.
-
-Moreover, in multiple-choice, D is the direct one.
-
-Perhaps the problem is that "Miniature Golf Mart" is listed, and it's correct.
-
-Another thing: in the text, it says "Miniature Golf Mart is to the Northwest of Ocelot's Office Ornaments." so it's clear.
-
-I think I'm overcomplicating.
-
-So, the answer should be D. Miniature Golf Mart.
-</think>
-Let me solve this step by step using the structured format:
-<format>
->>> STEP 1: PARSE RELATIONSHIPS
-    - A is to the DIRECTION of B
-    [... list all given relationships ...]
-
->>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
-    - Looking for: [target relationship / direction / count]
-    - [reasoning about the relationships]
-    - [use reversibility and transitivity as needed]
-
->>> STEP 3: ANSWER
-    - [state conclusion]
-
->>> FINAL ANSWER: [answer text]
-    \boxed{LETTER}
-</format>
->>> STEP 1: PARSE RELATIONSHIPS
-- Ocelot's Office Ornaments is to the Northeast of Frost Funhouse → Frost Funhouse is to the Southwest of Ocelot's Office Ornaments.
-- Miniature Golf Mart is to the Northwest of Ocelot's Office Ornaments → Ocelot's Office Ornaments is to the Southeast of Miniature Golf Mart.
-- Miniature Golf Mart is to the Northwest of Frost Funhouse → Frost Funhouse is to the Southeast of Miniature Golf Mart.
-- Lighthouse Luminary is to the Northeast of Frost Funhouse → Frost Funhouse is to the Southwest of Lighthouse Luminary.
-- Lighthouse Luminary is to the Northeast of Miniature Golf Mart → Miniature Golf Mart is to the Southwest of Lighthouse Luminary.
-- Cobbler's Corner is to the Southeast of Miniature Golf Mart → Miniature Golf Mart is to the Northwest of Cobbler's Corner.
-- Cobbler's Corner is to the Southwest of Lighthouse Luminary → Lighthouse Luminary is to the Northeast of Cobbler's Corner.
-- Cheetah's Chocolates is to the Northeast of Ocelot's Office Ornaments → Ocelot's Office Ornaments is to the Southwest of Cheetah's Chocolates.
-- Cheetah's Chocolates is to the Southeast of Miniature Golf Mart → Miniature Golf Mart is to the Northwest of Cheetah's Chocolates.
-
->>> STEP 2: ANALYZE SPATIAL RELATIONSHIPS
-- The question asks: "Which object is in the Northwest of Ocelot's Office Ornaments?"
-- From the parsed relationships, the direct statement is: "Miniature Golf Mart is to the Northwest of Ocelot's Office Ornaments."
-- This means Miniature Golf Mart is positioned directly northwest of Ocelot's Office Ornaments.
-- Now, check the options:
-  - A. Cheetah's Chocolates: Cheetah's Chocolates is to the Northeast of Ocelot's Office Ornaments (from "Cheetah's Chocolates is to the Northeast of Ocelot's Office Ornaments"), so it is northeast, not northwest.
-  - B. Frost Funhouse: Frost Funhouse is to the Southwest of Ocelot's Office Ornaments (from "Ocelot's Office Ornaments is to the Northeast of Frost Funhouse"), so it is southwest, not northwest.
-  - C. Cobbler's Corner: Cobbler's Corner is to the Southeast of Miniature Golf Mart (from "Cobbler's Corner is to the Southeast of Miniature Golf Mart"). Since Miniature Golf Mart is northwest of Ocelot's Office Ornaments, Cobbler's Corner is southeast of Miniature Golf Mart, meaning it is generally southeast or south of Ocelot's Office Ornaments, not northwest. It is not directly northwest.
-  - D. Miniature Golf Mart: Directly stated as northwest of Ocelot's Office Ornaments.
-- The other clues (e.g., about Lighthouse Luminary, Cobbler's Corner, Cheetah's Chocolates) are consistent with the positions but do not change the direct relationship given for Miniature Golf Mart.
-
->>> STEP 3: ANSWER
-- The object directly in the Northwest of Ocelot's Office Ornaments is Miniature Golf Mart.
-
->>> FINAL ANSWER: D
-\boxed{D}
\ No newline at end of file
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_2examples.csv b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_2examples.csv
deleted file mode 100644
index f7b5eb14..00000000
--- a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_2examples.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-index,question_type,correct,attempted,sol,gt,tokens,num_relations,verified_claims,message
-0,direction,True,True,A,Northeast,4736,9,11,Correct: option A -> Northeast
-1499,counting,False,True,C,0,8799,9,17,"Incorrect: expected '0', got '3' (option C)"
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_5examples.csv b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_5examples.csv
deleted file mode 100644
index 4400c9f9..00000000
--- a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/csv_saved/results_5examples.csv
+++ /dev/null
@@ -1,6 +0,0 @@
-index,question_type,correct,attempted,sol,gt,tokens,num_relations,verified_claims,message
-0,direction,True,True,A,Northeast,3503,9,1,Correct: option A -> Northeast
-374,direction,True,True,A,Southeast,4142,9,1,Correct: option A -> Southeast
-749,object,True,True,D,Miniature Golf Mart,6550,9,11,Correct: option D -> Miniature Golf Mart
-1124,counting,True,True,A,0,8820,9,0,Correct: option A -> 0
-1499,counting,False,True,C,0,6193,9,0,"Incorrect: expected '0', got '3' (option C)"
diff --git a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/summary.json b/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/summary.json
deleted file mode 100644
index 2b63a65a..00000000
--- a/Outputs_TTS/SpatialMapResults/Qwen3-30B-A3B-Thinking-2507/summary.json
+++ /dev/null
@@ -1,49 +0,0 @@
-{
-  "model": "Qwen/Qwen3-30B-A3B-Thinking-2507",
-  "total_examples": 2,
-  "correct": 1,
-  "attempted": 2,
-  "accuracy": 0.5,
-  "soundness": 0.5,
-  "total_reasoning_tokens": 13535,
-  "avg_reasoning_tokens": 6767.5,
-  "max_corrections": 3,
-  "stats_by_type": {
-    "direction": {
-      "total": 1,
-      "correct": 1
-    },
-    "object": {
-      "total": 0,
-      "correct": 0
-    },
-    "counting": {
-      "total": 1,
-      "correct": 0
-    }
-  },
-  "results": [
-    {
-      "idx": 0,
-      "question_type": "direction",
-      "correct": true,
-      "attempted": true,
-      "sol": "A",
-      "gt": "Northeast",
-      "reasoning_tokens": 4736,
-      "num_relations": 9,
-      "verified_claims": 11
-    },
-    {
-      "idx": 1499,
-      "question_type": "counting",
-      "correct": false,
-      "attempted": true,
-      "sol": "C",
-      "gt": "0",
-      "reasoning_tokens": 8799,
-      "num_relations": 9,
-      "verified_claims": 17
-    }
-  ]
-}
\ No newline at end of file

From a9ca217537bae160538a55e1984322c96dca2dcc Mon Sep 17 00:00:00 2001
From: root
 <root@hp-reasoning-t-vishakbhat-interwhen1-8gpus-trainer-0.hp-reasoning-t-vishakbhat-interwhen1-8gpus.bonete11.svc.cluster.local>
Date: Tue, 17 Mar 2026 11:17:01 +0000
Subject: [PATCH 16/17] resolved comments

---
 examples/EarlyStopping/game24_example.py     |  6 +++---
 examples/EarlyStopping/maze_example.py       |  6 +++---
 examples/EarlyStopping/spatialmap_example.py |  6 +++---
 examples/TTSwithVerification/mazemeta.py     |  2 +-
 examples/TTSwithVerification/spatialmeta.py  |  2 +-
 interwhen/utils/spatialmap_verifier.py       | 13 ++++++-------
 6 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/examples/EarlyStopping/game24_example.py b/examples/EarlyStopping/game24_example.py
index af033b7e..53e8ff84 100644
--- a/examples/EarlyStopping/game24_example.py
+++ b/examples/EarlyStopping/game24_example.py
@@ -25,7 +25,7 @@ def get_model_short_name(model_name: str) -> str:
     short_name = short_name.replace(" ", "_").replace(":", "-")
     return short_name
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY_FULL/Gameof24_results"):
+def get_output_dirs(main_model: str, base_dir: str = "../../Outputs/Gameof24_results"):
     """Create and return output directory paths based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -42,14 +42,14 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY_FULL/
     
     return dirs
 
-def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY_FULL/Gameof24_results") -> str:
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs/Gameof24_results") -> str:
     """Generate log filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     os.makedirs(output_base, exist_ok=True)
     return os.path.join(output_base, f"EAT_{num_examples}examples.log")
 
-def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY_FULL/Gameof24_results") -> str:
+def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs/Gameof24_results") -> str:
     """Generate token CSV filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
diff --git a/examples/EarlyStopping/maze_example.py b/examples/EarlyStopping/maze_example.py
index c355bb46..8cccfdab 100644
--- a/examples/EarlyStopping/maze_example.py
+++ b/examples/EarlyStopping/maze_example.py
@@ -28,7 +28,7 @@ def get_model_short_name(model_name: str) -> str:
     short_name = short_name.replace(" ", "_").replace(":", "-")
     return short_name
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY_FULL/MazeResults"):
+def get_output_dirs(main_model: str, base_dir: str = "../Outputs/MazeResults"):
     """Create and return output directory paths based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -46,14 +46,14 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY_FULL/
     
     return dirs
 
-def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY_FULL/MazeResults") -> str:
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../Outputs/MazeResults") -> str:
     """Generate log filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     os.makedirs(output_base, exist_ok=True)
     return os.path.join(output_base, f"EAT_{num_examples}examples.log")
 
-def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY_FULL/MazeResults") -> str:
+def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../Outputs/MazeResults") -> str:
     """Generate token CSV filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
diff --git a/examples/EarlyStopping/spatialmap_example.py b/examples/EarlyStopping/spatialmap_example.py
index 0aadbc48..195ea3f8 100644
--- a/examples/EarlyStopping/spatialmap_example.py
+++ b/examples/EarlyStopping/spatialmap_example.py
@@ -28,7 +28,7 @@ def get_model_short_name(model_name: str) -> str:
     short_name = short_name.replace(" ", "_").replace(":", "-")
     return short_name
 
-def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY_FULL/SpatialMap_results"):
+def get_output_dirs(main_model: str, base_dir: str = "../Outputs/SpatialMap_results"):
     """Create and return output directory paths based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
@@ -46,14 +46,14 @@ def get_output_dirs(main_model: str, base_dir: str = "../../Outputs_SANITY_FULL/
     
     return dirs
 
-def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY_FULL/SpatialMap_results") -> str:
+def get_log_filename(main_model: str, num_examples: int, base_dir: str = "../Outputs/SpatialMap_results") -> str:
     """Generate log filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
     os.makedirs(output_base, exist_ok=True)
     return os.path.join(output_base, f"EAT_{num_examples}examples.log")
 
-def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../../Outputs_SANITY_FULL/SpatialMap_results") -> str:
+def get_token_filename(main_model: str, num_examples: int, base_dir: str = "../Outputs/SpatialMap_results") -> str:
     """Generate token CSV filename based on model name."""
     model_short_name = get_model_short_name(main_model)
     output_base = os.path.join(base_dir, model_short_name)
diff --git a/examples/TTSwithVerification/mazemeta.py b/examples/TTSwithVerification/mazemeta.py
index f058c95a..94704af1 100644
--- a/examples/TTSwithVerification/mazemeta.py
+++ b/examples/TTSwithVerification/mazemeta.py
@@ -377,7 +377,7 @@ def evaluate_maze_answer(answer, options, ground_truth):
             answer = asyncio.run(stream_completion(
                 full_prompt,
                 llm_server=llm_server,
-                monitors=(),
+                monitors=(monitor,),  # trailing comma makes a 1-tuple; "(monitor)" is just the bare object
                 add_delay=False,
                 termination_requires_validation=False,
                 async_execution=True
diff --git a/examples/TTSwithVerification/spatialmeta.py b/examples/TTSwithVerification/spatialmeta.py
index 78e8e30d..dda2a5dd 100644
--- a/examples/TTSwithVerification/spatialmeta.py
+++ b/examples/TTSwithVerification/spatialmeta.py
@@ -439,7 +439,7 @@ def evaluate_spatialmap_answer(answer, options, ground_truth):
             answer = asyncio.run(stream_completion(
                 full_prompt,
                 llm_server=llm_server,
-                monitors=(),
+                monitors=(monitor,),
                 add_delay=False,
                 termination_requires_validation=False,
                 async_execution=True
diff --git a/interwhen/utils/spatialmap_verifier.py b/interwhen/utils/spatialmap_verifier.py
index f7d02291..23e0c8fe 100644
--- a/interwhen/utils/spatialmap_verifier.py
+++ b/interwhen/utils/spatialmap_verifier.py
@@ -13,7 +13,7 @@
 
 import re
 from typing import Dict, List, Tuple, Optional, Set
-from z3 import Solver, Real, And, sat
+from z3 import Solver, Real, And, Not, sat, unsat
 
 
 class SpatialMapZ3Solver:
@@ -259,9 +259,8 @@ def count_objects_in_direction(
             # Check if this entity MUST be in that direction
             # (i.e. the negation is unsatisfiable)
             self.solver.push()
-            from z3 import Not
             self.solver.add(Not(constraint))
-            must_be = self.solver.check() != sat
+            must_be = self.solver.check() == unsat
             self.solver.pop()
 
             if must_be:
@@ -487,7 +486,7 @@ def verify_spatialmap_step(
     Returns:
         (is_valid, errors)
     """
-    from z3 import Not as Z3Not, sat as z3sat
+    from z3 import Not as Z3Not, unsat as z3unsat
 
     errors = []
     
@@ -508,7 +507,7 @@ def verify_spatialmap_step(
         if compiled is not None:
             z3_solver.solver.push()
             z3_solver.solver.add(Z3Not(compiled))
-            is_entailed = z3_solver.solver.check() != z3sat
+            is_entailed = z3_solver.solver.check() == z3unsat
             z3_solver.solver.pop()
             if is_entailed:
                 z3_solver.apply_ir(claim)
@@ -725,7 +724,7 @@ def get_possible_count_range(
 
     Returns ``None`` if the reference entity cannot be found.
     """
-    from z3 import And as Z3And, Not as Z3Not, sat as z3sat
+    from z3 import And as Z3And, Not as Z3Not, sat as z3sat, unsat as z3unsat
 
     direction = direction.lower().strip()
     if direction in ('north', 'south', 'east', 'west'):
@@ -782,7 +781,7 @@ def _find(name):
         # Must it be?
         solver.solver.push()
         solver.solver.add(Z3Not(c))
-        must_be = solver.solver.check() != z3sat
+        must_be = solver.solver.check() == z3unsat
         solver.solver.pop()
 
         if must_be:

From 15d55afb745710c0c1ae5757df3d7ee402d96402 Mon Sep 17 00:00:00 2001
From: root
 <root@hp-reasoning-t-vishakbhat-interwhen1-8gpus-trainer-0.hp-reasoning-t-vishakbhat-interwhen1-8gpus.bonete11.svc.cluster.local>
Date: Tue, 17 Mar 2026 15:42:03 +0000
Subject: [PATCH 17/17] added baselines

---
 .../TTSwithVerification/bestofk_baseline.py   | 1070 +++++++++++++++++
 examples/TTSwithVerification/tot_baseline.py  |  784 ++++++++++++
 2 files changed, 1854 insertions(+)
 create mode 100644 examples/TTSwithVerification/bestofk_baseline.py
 create mode 100644 examples/TTSwithVerification/tot_baseline.py

diff --git a/examples/TTSwithVerification/bestofk_baseline.py b/examples/TTSwithVerification/bestofk_baseline.py
new file mode 100644
index 00000000..9ed0950b
--- /dev/null
+++ b/examples/TTSwithVerification/bestofk_baseline.py
@@ -0,0 +1,1070 @@
+import asyncio
+import argparse
+from datetime import datetime
+import json
+import logging
+import os
+import re
+import sys
+import shutil
+import subprocess
+from multiprocessing.pool import ThreadPool
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+import aiohttp
+from datasets import load_dataset
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+from interwhen.utils.zebralogic_helper import SYSTEM_PROMPT_VANILLA, USER_PROMPT_TEMPLATE, get_zebralogic_dataset, extract_last_json, zebra_correctness
+
+from interwhen import stream_completion
+from verina_utils import *
+
+# ============== MODEL CONFIGURATION ==============
+MAIN_MODEL = "Qwen/Qwen3-30B-A3B-Thinking-2507"
+# Multi-process vLLM configuration
+VLLM_PORTS = [8000, 8001, 8002]  # 3 instances with tensor-parallel-size 2 each
+REQUEST_COUNTER = {"main": 0, "critic": 0}  # Per-server-type request counts, read and advanced by get_next_port
+# Verina paths, resolved relative to this script's directory
+_SCRIPT_DIR = Path(__file__).parent.resolve()
+VERINA_ROOT = (_SCRIPT_DIR / "../../../verina").resolve()
+VERINA_DATASETS_PATH = VERINA_ROOT / "datasets" / "verina"
+LEAN_PLAYGROUND_DIR = VERINA_ROOT / "lean-playground"
+
+logger = logging.getLogger(__name__)
+
+# Save the real stderr so tqdm always works even if suppress_output is active
+_real_stderr = sys.stderr
+
+class NumpyEncoder(json.JSONEncoder):
+    """JSON encoder that turns numpy scalars and arrays into built-in Python values."""
+    def default(self, obj):
+        # First matching numpy type wins; anything else falls through to the base class.
+        converters = (
+            (np.bool_, bool), (np.integer, int),
+            (np.floating, float), (np.ndarray, lambda arr: arr.tolist()),
+        )
+        for numpy_type, convert in converters:
+            if isinstance(obj, numpy_type):
+                return convert(obj)
+        return super().default(obj)
+
+
+@contextmanager
+def suppress_output():
+    """Silence stdout and stderr for the duration of the ``with`` block.
+
+    Both streams are restored on exit, including when the block raises.
+    """
+    saved_streams = (sys.stdout, sys.stderr)
+    with open(os.devnull, 'w') as devnull:
+        sys.stdout = sys.stderr = devnull
+        try:
+            yield
+        finally:
+            sys.stdout, sys.stderr = saved_streams
+
+
+@dataclass
+class SampleResult:  # one sampled completion plus its evaluation metadata
+    output: str  # raw sample text; save_outputs writes it verbatim
+    correct: bool
+    extracted: Optional[str]  # presumably the parsed answer, None when absent — confirm with callers
+    message: str
+    tokens: int
+    critic_correct: Optional[bool] = None  # critic fields are optional and default to None
+    critic_feedback: Optional[str] = None  # save_outputs only writes this when it is truthy
+
+
+def get_model_short_name(model_name: str) -> str:
+    """Return the part of *model_name* after its last "/", with " " -> "_" and ":" -> "-"."""
+    return model_name.rsplit("/", 1)[-1].replace(" ", "_").replace(":", "-")
+
+
+def get_next_port(server_type: str = "main") -> int:
+    """Return the next port from VLLM_PORTS for *server_type*, cycling round-robin."""
+    # REQUEST_COUNTER is only mutated in place, so no ``global`` declaration is needed.
+    port_index = REQUEST_COUNTER[server_type] % len(VLLM_PORTS)
+    REQUEST_COUNTER[server_type] += 1
+    return VLLM_PORTS[port_index]
+
+
+def get_output_dirs(task: str, main_model: str, use_critic: bool, critic_early_stop: bool, base_dir: str = "../../b-pchanda/Outputs_TTS_temp/BestOfKResults"):
+    """Create and return the "base"/"reasoning"/"critic" output dirs ("critic" is None when unused)."""
+    critic_tag = f"critic_{'on' if use_critic else 'off'}"
+    earlystop_tag = f"earlystop_{'on' if critic_early_stop else 'off'}"
+    output_base = os.path.join(base_dir, task, get_model_short_name(main_model), critic_tag, earlystop_tag)
+    dirs = {
+        "base": output_base,
+        "reasoning": os.path.join(output_base, "Reasoning_output"),
+        "critic": os.path.join(output_base, "Critic_output") if use_critic else None,
+    }
+    # Only create the directories that are in use (the critic entry may be None).
+    for dir_path in filter(None, dirs.values()):
+        os.makedirs(dir_path, exist_ok=True)
+    return dirs
+
+def init_llm_server(modelname, max_tokens=200, port=8000, temperature=0.6, seed=42):
+    """Return the url, JSON payload and headers for a localhost /v1/completions request."""
+    endpoint = f"http://localhost:{port}/v1/completions"
+    sampling_payload = {
+        "model": modelname,
+        "max_tokens": max_tokens,
+        "top_k": 20,
+        "top_p": 0.95,
+        "min_p": 0.0,
+        "do_sample": True,
+        "temperature": temperature,
+        "stream": False,
+        "logprobs": 20,
+        "use_beam_search": False,
+        "prompt_cache": True,
+        "seed": seed,
+    }
+    return {"url": endpoint, "payload": sampling_payload, "headers": {"Content-Type": "application/json"}}
+ 
+
+def count_tokens(text: str, tokenizer) -> int:
+    """Count the tokens in *text*; if tokenization fails, estimate from its length."""
+    try:
+        # Empty or whitespace-only text is 0 tokens; the tokenizer is not called.
+        if not (text and text.strip()):
+            return 0
+        return len(tokenizer.encode(text, add_special_tokens=False))
+    except Exception as e:
+        logger.warning(f"Tokenization failed: {e}, using character count estimate")
+        # Fallback heuristic: about four characters per token, never less than 1.
+        return max(1, len(text) // 4)
+
+
+def save_outputs(idx: int, outputs: List[SampleResult], best_idx: int, output_dir: str):
+    """Write all samples for example *idx*, plus the chosen best index, to ``output_{idx}.txt``."""
+    os.makedirs(output_dir, exist_ok=True)
+    separator = "\n" + "=" * 80 + "\n"
+    with open(os.path.join(output_dir, f"output_{idx}.txt"), "w", encoding="utf-8") as f:
+        f.write(f"BEST_INDEX={best_idx}\n")
+        for sample_no, sample in enumerate(outputs):
+            f.write(separator)
+            f.write(f"SAMPLE {sample_no}\n")
+            f.write(f"CORRECT={sample.correct}\n")
+            f.write(f"CRITIC_CORRECT={sample.critic_correct}\n")
+            f.write(f"EXTRACTED={sample.extracted}\n")
+            f.write(f"TOKENS={sample.tokens}\n")
+            f.write(f"MESSAGE={sample.message}\n")
+            if sample.critic_feedback:  # optional line, omitted when empty or None
+                f.write(f"CRITIC_FEEDBACK={sample.critic_feedback}\n")
+            f.write("\n")
+            f.write(sample.output)
+            f.write("\n")
+
+
+# --------------------- Game24 helpers ---------------------
+
+def build_game24_prompt(nums):
+    """Return the user prompt asking for a 24 expression over the four numbers in ``nums``."""
+    a, b, c, d = nums
+    boxed = r"\boxed{}"  # raw string: one backslash, so the prompt shows \boxed{}
+    return f"""
+You are solving the Game of 24.
+
+You are given four numbers: {a}, {b}, {c}, {d}
+
+Your job is to produce a valid arithmetic expression using:
+- ALL four numbers exactly once
+- ONLY +, -, *, /
+- The expression must evaluate to exactly 24.
+
+Please reason step by step, and put your final answer containing only the expression within {boxed}.
+""".strip()
+
+
+def extract_solution_game24(text):
+    """Return the last boxed answer in ``text`` as a Python-style expression, or None."""
+    matches = list(re.finditer(r"\\boxed\{", text))
+    if not matches:
+        return None
+    start = matches[-1].end()
+    # Walk to the brace that closes the box; nested braces (e.g. from \frac) are balanced.
+    depth = 1
+    end = start
+    while end < len(text) and depth > 0:
+        if text[end] == "{":
+            depth += 1
+        elif text[end] == "}":
+            depth -= 1
+        end += 1
+    # Exclude the closing brace only when it exists; an unterminated box keeps its last character.
+    expr = (text[start:end - 1] if depth == 0 else text[start:end]).strip()
+
+    # Rewrite \frac{a}{b} as (a/b), innermost fractions first.
+    frac_pattern = r"\\frac\{([^{}]+)\}\{([^{}]+)\}"
+    while re.search(frac_pattern, expr):
+        expr = re.sub(frac_pattern, r"(\1/\2)", expr)
+
+    # LaTeX operators -> Python operators.
+    for latex, op in ((r"\times", "*"), (r"\cdot", "*"), (r"\div", "/")):
+        expr = expr.replace(latex, op)
+
+    # Remove LaTeX spacing commands; a single backslash is what actually appears in model output.
+    expr = expr.replace(r"\,", "").replace(r"\ ", "")
+    # Make implicit multiplication explicit: ")(", ")3" and "3(".
+    expr = re.sub(r"\)\s*\(", ")*(", expr)
+    expr = re.sub(r"\)\s*(\d)", r")*\1", expr)
+    expr = re.sub(r"(\d)\s*\(", r"\1*(", expr)
+
+    return expr
+
+
+def extract_numbers_from_expr(expr):
+    """List the numeric literals in ``expr``; integral values become ``int``, others stay ``float``."""
+    return [int(v) if v.is_integer() else v for v in map(float, re.findall(r"\d+\.?\d*", expr))]
+
+
+def validate_numbers_used(expr, expected_nums):
+    """Check that the numbers appearing in ``expr`` are exactly ``expected_nums`` (order ignored)."""
+    return sorted(extract_numbers_from_expr(expr)) == sorted(expected_nums)
+
+
+def evaluate_expression(expr, expected_nums=None):
+    """True iff ``expr`` is plain + - * / arithmetic equal to 24 (and uses ``expected_nums`` when given)."""
+    try:
+        ok = expected_nums is None or validate_numbers_used(expr, expected_nums)
+        ok = ok and "**" not in expr and "//" not in expr and re.fullmatch(r"[\d\s.+\-*/()]+", expr)
+        return bool(ok) and abs(eval(expr, {"__builtins__": None}, {}) - 24) < 1e-6  # SECURITY: expr is model output; eval only sees the whitelisted arithmetic
+    except Exception:
+        return False
+
+
+def evaluate_game24_answer(answer, nums):
+    """Grade a Game of 24 response; returns ``(is_correct, expression_or_None, message)``."""
+    expr = extract_solution_game24(answer)
+    if not expr:
+        return False, None, "No expression found"
+    if evaluate_expression(expr, expected_nums=nums):
+        return True, expr, "Correct solution (evaluates to 24 using exactly the given numbers)"
+    used_nums = extract_numbers_from_expr(expr)
+    wrong_numbers = sorted(used_nums) != sorted(nums)
+    return False, expr, (f"Incorrect: Expression uses {used_nums}, expected {nums}" if wrong_numbers else "Expression does not evaluate to 24")
+
+
+# --------------------- Maze/SpatialMap helpers ---------------------
+
+def remove_last_paragraph(s: str) -> str:
+    return s if len(s) <= 143 else s[:-143]  # cuts a fixed 143-character tail; no paragraph parsing
+
+
+def build_maze_prompt(example):
+    """Return ``(instruction, text)`` where text is the example's "prompt" field minus its fixed-length tail."""
+    pre_prompt = (
+        "You are an expert problem solver. Carefully read the following multiple-choice question "
+        "and think through the solution step-by-step before providing your final answer. "
+        "Provide your final answer option by enclosing it within \\boxed{A/B/C/D}.:"
+    )
+    return pre_prompt, remove_last_paragraph(str(example.get("prompt")))
+
+
+def build_spatialmap_prompt(example):
+    """Return ``(instruction, text)`` where text is the example's "prompt" field minus its fixed-length tail."""
+    pre_prompt = (
+        "You are an expert problem solver. Carefully read the following multiple-choice question "
+        "and think through the solution step-by-step before providing your final answer. "
+        "Provide your final answer option by enclosing it within \\boxed{A/B/C/D}.:"
+    )
+    return pre_prompt, remove_last_paragraph(str(example.get("prompt")))
+
+
+def extract_solution_mcq(text):
+    """Extract the chosen option letter (A-D) from model output, or None if none is found."""
+    # Ordered from most to least explicit; the first pattern that yields a letter wins.
+    patterns = [
+        r"\\boxed\{([^}]*)\}",  # \boxed{...}
+        r"boxed\{([^}]*)\}",     # boxed{...} without escape
+        r"\*\*([A-D])\*\*",      # **A** format
+        r"answer[:\s]*([A-D])\b",  # answer: A format; \b keeps "answer can ..." from yielding C
+        r"(?:^|\n)([A-D])(?:\s|$|\.)",  # Standalone letter
+    ]
+
+    for pattern in patterns:
+        matches = re.findall(pattern, text, re.IGNORECASE)
+        if not matches:
+            continue
+        # Only the last occurrence is considered for each pattern.
+        choice_match = re.search(r"\b([ABCD])\b", matches[-1].strip(), flags=re.IGNORECASE)
+        if choice_match:
+            return choice_match.group(1).upper()
+
+    # Last resort: the final standalone capital A, B, C, or D anywhere in the text.
+    standalone = re.findall(r"\b([ABCD])\b", text)
+    if standalone:
+        return standalone[-1].upper()
+    return None
+
+
+def extract_options_from_prompt(prompt_text, target_options):
+    """Map option letters to their text, parsed from "A. ..." style choices; keep only ``target_options`` if given."""
+    found = re.findall(r"\b([A-D])\.\s*(.*?)(?=\s*[A-D]\.\s*|$)", prompt_text, flags=re.DOTALL)
+    options = {letter: body.strip().rstrip(".") for letter, body in found}
+    if not target_options:
+        return options
+    return {letter: body for letter, body in options.items() if letter in target_options}
+
+
+def evaluate_mcq_answer(answer, options, ground_truth):
+    """Grade an MCQ response; returns ``(is_correct, extracted_choice_or_None, message)``."""
+    sol = extract_solution_mcq(answer)
+    gt_sol = str(ground_truth).strip()
+    if not sol:
+        return False, None, "No expression found"
+    sol = sol.strip()
+    if sol in options:
+        if options[sol] != gt_sol:
+            return False, sol, f"Incorrect: expected '{gt_sol}', got '{options[sol]}' (option {sol})"
+        return True, sol, f"Correct: option {sol} -> {options[sol]}"
+    if sol.lower() == gt_sol.lower():
+        return True, sol, f"Correct: answer text matches ground truth: {sol}"
+    for opt_letter, opt_value in options.items():
+        if sol.lower() == opt_value.lower():
+            verdict = opt_value == gt_sol
+            return verdict, sol, (f"Correct: answer text {sol} (option {opt_letter})" if verdict else f"Incorrect: expected '{gt_sol}', got '{opt_value}' (option {opt_letter})")
+    return False, sol, f"Solution '{sol}' not found in options or ground truth"
+
+# --------------------- ZebraLogic helpers ---------------------
+
+def evaluate_zebralogic_answer(answer, example):
+    """Score the candidate returned by ``extract_last_json(answer)`` against ``example`` via ``zebra_correctness``."""
+    candidate = extract_last_json(answer)
+    if not candidate:
+        return False, None, "No valid JSON solution found"
+    correct, skipped, missing, total = zebra_correctness(example, candidate)
+    # The answer counts as solved only when ``correct`` equals ``total``.
+    msg = f"Correct={correct}/{total}, skipped={skipped}, missing={missing}"
+    return correct == total, candidate, msg
+
+
+def build_zebralogic_prompt(example):
+    """Return ``(system_prompt, user_prompt)``; the user prompt is the template filled with ``example['puzzle_clean']``."""
+    user_prompt = USER_PROMPT_TEMPLATE.format(problem_text=example['puzzle_clean'])
+    return SYSTEM_PROMPT_VANILLA, user_prompt
+
+# verina helpers
+def evaluate_verina_answer(output: str, data: BenchmarkData, task_idx: int) -> Tuple[bool, str, str]:
+    """Grade one Verina response; returns ``(is_correct, generated_code, message)``."""
+    generated_code = extract_code_from_response(output)
+    if not generated_code.strip():
+        return False, "", "No code extracted from response"
+
+    compiles, all_tests_pass, compile_output, test_results = evaluate_generated_code(data, generated_code, task_idx)
+
+    # Test statistics are gathered up front; they only appear in the messages below.
+    num_tests = len(data.tests) if data.tests else 0
+    num_passed = sum(1 for v in test_results.values() if v == "pass")
+
+    if not compiles:
+        error_preview = compile_output[:300] if compile_output else "Unknown error"
+        return False, generated_code, f"Compilation failed: {error_preview}"
+    if all_tests_pass:
+        return True, generated_code, f"Code compiles and all {num_tests} tests pass"
+    # Compiled, but not every test passed.
+    return False, generated_code, f"Compilation succeeded but {num_tests - num_passed}/{num_tests} tests failed"
+
+
+def build_full_prompt(task, example, nums=None):
+    """Render the full prompt string for ``task`` using <|im_start|>/<|im_end|> chat markers."""
+    # Game of 24 is a user-only turn with no system message.
+    if task == "game24":
+        prompt = build_game24_prompt(nums)
+        return f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
+    if task == "verina":
+        return build_verina_prompt(example)
+
+    # Every task other than maze/zebralogic falls back to the spatial-map builder.
+    builder = {"maze": build_maze_prompt, "zebralogic": build_zebralogic_prompt}.get(task, build_spatialmap_prompt)
+    system_prompt, user_prompt = builder(example)
+    return (
+        f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
+        f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
+        f"<|im_start|>assistant\n"
+    )
+
+
+def load_dataset_for_task(task):
+    """Load the evaluation dataset for ``task``; raises ValueError for an unknown task."""
+    if task == "game24":
+        return load_dataset("nlile/24-game", split="train")
+    if task in ("maze", "spatialmap"):
+        config = "maze_text_only" if task == "maze" else "spatial_map_text_only"
+        return load_dataset("microsoft/VISION_LANGUAGE", config, split="val")
+    if task == "zebralogic":
+        return get_zebralogic_dataset()
+    if task == "verina":
+        return load_verina_dataset()
+    raise ValueError(f"Unsupported task: {task}")
+
+
+def resolve_indices(task, dataset_len, args):
+    """Choose example indices; precedence is --indices, --xrange, --num_examples, then --start/--end."""
+    if args.indices:
+        return [int(x.strip()) for x in args.indices.split(",")]
+    if args.xrange:
+        try:
+            start, end = (int(part.strip()) for part in args.xrange.split("-"))
+        except ValueError as err:
+            # Covers non-integer bounds as well as a wrong number of '-'-separated parts.
+            raise ValueError(f"Invalid xrange format: {args.xrange}. Use 'start-end'") from err
+        return range(start, end)
+    if args.num_examples:
+        max_idx = dataset_len - 1
+        # Evenly spaced indices, capped at the fixed per-task upper bounds 1362 / 1499.
+        upper_bound = min(max_idx, 1362) if task == "game24" else min(max_idx, 1499)
+        return list((np.linspace(0, upper_bound, args.num_examples)).astype(int))
+    # Default: use full range
+    start = args.start if args.start is not None else 0
+    end = args.end if args.end is not None else dataset_len
+    return range(start, end)
+
+
+def batch_generate_samples(prompt, llm_server, k, seed, quiet=True):
+    """Request k completions of ``prompt`` concurrently over HTTP; a failed or empty request leaves "" in its slot."""
+    payload_template = llm_server["payload"].copy()
+    headers = llm_server["headers"]
+    
+    # Build k payloads that differ only in their seed (seed, seed + 1, ..., seed + k - 1)
+    batch_payloads = []
+    for i in range(k):
+        payload = payload_template.copy()
+        payload["prompt"] = prompt
+        payload["seed"] = seed + i
+        batch_payloads.append(payload)
+    
+    # POST one payload and return (idx, text); every failure path logs a warning and returns (idx, "")
+    async def _fetch_one(session, sem, idx, url, payload):
+        async with sem:
+            try:
+                async with session.post(url, json=payload, headers=headers, timeout=300) as resp:
+                    text = await resp.text()
+                    if resp.status >= 400:
+                        logger.warning(f"HTTP error for seed {seed + idx} on {url}: {resp.status} - {text[:200]}")
+                        return idx, ""
+                    try:
+                        result = json.loads(text)
+                    except json.JSONDecodeError:
+                        logger.warning(f"Invalid JSON for seed {seed + idx} on {url}")
+                        return idx, ""
+            except Exception as e:
+                logger.warning(f"Batch generation failed for seed {seed + idx} on {url}: {e}")
+                return idx, ""
+
+            if "choices" in result and len(result["choices"]) > 0:
+                choice = result["choices"][0]  # only the first choice is read
+                if isinstance(choice, dict):
+                    output_text = choice.get("text") or choice.get("message", {}).get("content", "")  # completion-style "text" first, then chat-style "message.content"
+                else:
+                    output_text = str(choice)
+                if output_text and len(output_text.strip()) > 0:
+                    return idx, output_text
+                logger.warning(f"Empty output for seed {seed + idx} on {url}")
+                return idx, ""
+
+            logger.warning(f"No choices in response for seed {seed + idx} on {url}: {result.keys() if isinstance(result, dict) else type(result)}")
+            return idx, ""
+
+    async def _run_parallel():
+        sem = asyncio.Semaphore(len(VLLM_PORTS))  # at most len(VLLM_PORTS) requests in flight at once
+        async with aiohttp.ClientSession() as session:
+            tasks = []
+            for i, payload in enumerate(batch_payloads):
+                port = get_next_port(server_type="main")  # port comes from get_next_port; llm_server["url"] is not used here
+                url = f"http://localhost:{port}/v1/completions"
+                tasks.append(asyncio.create_task(_fetch_one(session, sem, i, url, payload)))
+            results = await asyncio.gather(*tasks)
+            return results
+
+    if quiet:
+        with suppress_output():
+            results = asyncio.run(_run_parallel())
+    else:
+        results = asyncio.run(_run_parallel())
+
+    outputs = [""] * k  # results are placed by request index, so outputs[i] belongs to seed + i
+    for idx, output_text in results:
+        outputs[idx] = output_text
+        if output_text and not quiet:
+            print(f"[Generated sample {idx}] {len(output_text)} chars, {len(output_text.split())} words")
+
+    return outputs
+
+
+# --------------------- Critic model helpers ---------------------
+
+def build_game24_critic_prompt(nums, reasoning_output):
+    """Return the critic prompt that asks for a VERDICT/REASONING review of a Game of 24 attempt on ``nums``."""
+    return f"""You are a math verifier. Evaluate the following Game of 24 solution.
+
+Numbers: {nums}
+Target: 24
+
+Student's reasoning and answer:
+{reasoning_output}
+
+Verify:
+1. Does it use ALL four numbers exactly once?
+2. Does each step follow correct arithmetic?
+3. Does the final expression evaluate to exactly 24?
+
+Respond in the following format:
+VERDICT: CORRECT or INCORRECT
+REASONING: Your detailed explanation
+
+If CORRECT, briefly explain why.
+If INCORRECT, explain what went wrong and how to fix it.
+"""
+
+
+def build_zebralogic_critic_prompt(task_description, reasoning_output):
+    """Return the critic prompt that asks for a VERDICT/REASONING review of a ZebraLogic attempt."""
+    return f"""You are an expert logic puzzle verifier. Evaluate the following ZebraLogic solution.
+
+Task:
+{task_description}
+
+Student's reasoning and answer:
+{reasoning_output}
+
+Verify:
+1. Does the solution assign exactly one value per feature per house?
+2. Are all constraints/clues satisfied?
+3. Is the JSON output well-formed and complete?
+
+Respond in the following format:
+VERDICT: CORRECT or INCORRECT
+REASONING: Your detailed explanation
+
+If CORRECT, briefly explain why.
+If INCORRECT, explain what went wrong and suggest corrections.
+"""
+
+
+def build_mcq_critic_prompt(task, task_description, reasoning_output):
+    """Return the critic prompt that asks for a VERDICT/REASONING review of a multiple-choice attempt."""
+    task_name = "Maze" if task == "maze" else "Spatial Reasoning"  # any task other than "maze" is labelled Spatial Reasoning
+    return f"""You are an expert {task_name} verifier. Evaluate the following solution.
+
+Task:
+{task_description}
+
+Student's reasoning and answer:
+{reasoning_output}
+
+Verify the correctness of the step-by-step reasoning and final answer.
+
+Respond in the following format:
+VERDICT: CORRECT or INCORRECT
+REASONING: Your detailed explanation
+
+If CORRECT, briefly explain why.
+If INCORRECT, explain what went wrong and suggest the correct approach.
+"""
+
+def build_verina_critic_prompt(data: BenchmarkData, reasoning_output: str) -> str:
+    """Return the critic prompt that asks for a VERDICT/REASONING review of a Lean 4 code generation attempt."""
+    signature = data.signature  # read with .get() below, so presumably dict-like; confirm against BenchmarkData
+    func_name = signature.get("name", "solution")
+    return_type = signature.get("return_type", "Bool")
+    param_list = render_param_list(signature)
+    
+    precond = data.lean_data.get("precond", "True").strip()  # defaults to "True" when no precondition is stored
+    postcond = data.lean_data.get("postcond", "").strip()
+    
+    return f"""You are an expert Lean 4 code verifier. Evaluate the following code generation attempt.
+
+## Task Description
+{data.description}
+
+## Function Signature
+```lean4
+def {func_name} {param_list} (h_precond : {func_name}_precond ...) : {return_type}
+```
+
+## Precondition
+```lean4
+{precond}
+```
+
+## Postcondition
+```lean4
+{postcond}
+```
+
+## Student's Reasoning and Generated Code
+{reasoning_output}
+
+Verify:
+1. Is the generated code syntactically valid Lean 4?
+2. Does it match the expected function signature and return type ({return_type})?
+3. Does the logic appear to satisfy the postcondition given the precondition?
+4. Are there any obvious bugs, infinite loops, or incorrect base cases?
+
+Respond in the following format:
+VERDICT: CORRECT or INCORRECT
+REASONING: Your detailed explanation
+
+If CORRECT, briefly explain why
+If INCORRECT, explain what went wrong and suggest how to fix it.
+"""
+
+def batch_evaluate_with_critic(outputs_df, task, example, critic_llm_server, tokenizer, nums=None, quiet=True):
+    """Ask the critic for a verdict on each row of ``outputs_df`` (reads its 'output' column); returns a DataFrame with sample_idx, critic_correct, critic_feedback. ``tokenizer`` is accepted but unused."""
+    payload_template = critic_llm_server["payload"].copy()
+    headers = critic_llm_server["headers"]
+
+    async def _fetch_one(session, sem, idx, url, payload):
+        async with sem:
+            try:
+                async with session.post(url, json=payload, headers=headers, timeout=300) as resp:
+                    text = await resp.text()
+                    if resp.status >= 400:
+                        logger.warning(f"HTTP error for critic sample {idx} on {url}: {resp.status} - {text[:200]}")
+                        return idx, "", False
+                    try:
+                        result = json.loads(text)
+                    except json.JSONDecodeError:
+                        logger.warning(f"Invalid JSON for critic sample {idx} on {url}")
+                        return idx, "", False
+            except Exception as e:
+                logger.warning(f"Critic evaluation failed for sample {idx} on {url}: {e}")
+                return idx, "", False
+
+            critic_output = ""
+            if "choices" in result and len(result["choices"]) > 0:
+                choice = result["choices"][0]
+                critic_output = choice.get("text") or choice.get("message", {}).get("content", "") or ""
+
+            # Read the verdict word itself: "INCORRECT" contains "CORRECT", so a substring test accepts everything.
+            verdict = re.search(r"\b(IN)?CORRECT\b", critic_output.upper().rsplit("VERDICT:", 1)[-1])
+            is_correct = verdict is not None and verdict.group(1) is None
+            reasoning = ""
+            if "REASONING:" in critic_output:
+                reasoning = critic_output.split("REASONING:", 1)[1].strip()
+            elif "VERDICT:" not in critic_output:
+                reasoning = critic_output
+            return idx, reasoning, is_correct
+
+    async def _run_parallel():
+        sem = asyncio.Semaphore(len(VLLM_PORTS))
+        async with aiohttp.ClientSession() as session:
+            tasks = []
+            for idx, row in outputs_df.iterrows():
+                output_text = row["output"]
+                if task == "game24":
+                    critic_prompt = build_game24_critic_prompt(nums, output_text)
+                elif task == "zebralogic":
+                    _, task_desc = build_zebralogic_prompt(example)
+                    critic_prompt = build_zebralogic_critic_prompt(task_desc, output_text)
+                elif task == "verina":
+                    critic_prompt = build_verina_critic_prompt(example, output_text)
+                else:
+                    if task == "maze":
+                        _, task_desc = build_maze_prompt(example)
+                    else:
+                        _, task_desc = build_spatialmap_prompt(example)
+                    critic_prompt = build_mcq_critic_prompt(task, task_desc, output_text)
+
+                critic_system = "You are a strict academic verifier."
+                full_prompt = f"<|im_start|>system\n{critic_system}<|im_end|>\n<|im_start|>user\n{critic_prompt}<|im_end|>\n<|im_start|>assistant\n"
+
+                payload = payload_template.copy()
+                payload["prompt"] = full_prompt
+                payload["seed"] = row.get("critic_seed", idx)  # falls back to the row index when no critic_seed column exists
+
+                port = get_next_port(server_type="critic")
+                url = f"http://localhost:{port}/v1/completions"
+                tasks.append(asyncio.create_task(_fetch_one(session, sem, idx, url, payload)))
+
+            return await asyncio.gather(*tasks)
+
+    if quiet:
+        with suppress_output():
+            results = asyncio.run(_run_parallel())
+    else:
+        results = asyncio.run(_run_parallel())
+
+    rows = []
+    for sample_idx, reasoning, is_correct in results:
+        rows.append({
+            "sample_idx": sample_idx,
+            "critic_correct": is_correct,
+            "critic_feedback": reasoning,
+        })
+
+    return pd.DataFrame(rows)
+
+
+def run_k_samples_with_critic(
+    prompt,
+    llm_server,
+    critic_llm_server,
+    k,
+    seed,
+    task,
+    example,
+    tokenizer,
+    eval_fn,
+    nums=None,
+    early_stop=False,
+    critic_feedback_baseline=False,
+    quiet=True,
+):
+    """Sample up to k outputs, judge each with the critic and with ``eval_fn``, and return a list of SampleResult; modes: feedback chaining, early stop, or full batch."""
+    # Feedback-chaining mode: one sample at a time, each critic rejection is appended to the next prompt
+    if critic_feedback_baseline:
+        sample_results = []
+        current_prompt = prompt
+        
+        for i in range(k):
+            # Generate single sample
+            output = batch_generate_samples(current_prompt, llm_server, 1, seed + i, quiet=quiet)[0]  # k=1 per call; the seed advances with i
+            
+            # Evaluate with critic
+            df_critic = batch_evaluate_with_critic(
+                pd.DataFrame([{"output": output, "seed_idx": i}]),
+                task, example, critic_llm_server, tokenizer, nums=nums, quiet=quiet
+            )
+            critic_correct = df_critic.iloc[0]["critic_correct"] if len(df_critic) > 0 else False
+            critic_feedback = df_critic.iloc[0]["critic_feedback"] if len(df_critic) > 0 else ""
+            
+            # Evaluate with ground truth
+            is_correct, extracted, message = eval_fn(output)
+            token_count = count_tokens(output, tokenizer)
+            
+            sample_results.append(SampleResult(
+                output=output,
+                correct=is_correct,
+                extracted=extracted,
+                message=f"Critic verdict: {'CORRECT' if critic_correct else 'INCORRECT'} | {message}",
+                tokens=token_count,
+                critic_correct=critic_correct,
+                critic_feedback=critic_feedback,
+            ))
+            
+            # If critic says INCORRECT and not the last sample, add feedback to prompt
+            if not critic_correct and i < k - 1 and critic_feedback:
+                feedback_text = f"\n\nPrevious attempt was incorrect:\n{output}\n\nCritic feedback:\n{critic_feedback}\n\nPlease address the feedback and try again:"
+                # Insert feedback before the assistant tag
+                if "<|im_end|>\n<|im_start|>assistant\n" in current_prompt:
+                    current_prompt = current_prompt.replace(  # str.replace rewrites every occurrence of the marker
+                        "<|im_end|>\n<|im_start|>assistant\n",
+                        f"{feedback_text}<|im_end|>\n<|im_start|>assistant\n"
+                    )
+                else:
+                    # Fallback: append to end of prompt
+                    current_prompt = current_prompt + feedback_text
+        
+        return sample_results  # always k entries: this mode never stops early
+    
+    # Generate all k samples up front (this also happens in early-stop mode)
+    outputs = batch_generate_samples(prompt, llm_server, k, seed, quiet=quiet)
+    
+    # Create dataframe with outputs
+    df_samples = pd.DataFrame({
+        "sample_idx": range(k),
+        "output": outputs,
+        "seed": [seed + i for i in range(k)],
+    })
+    
+    # Early-stop mode: query the critic one sample at a time and stop at the first critic-correct one
+    if early_stop:
+        sample_results = []
+        for idx, row in df_samples.iterrows():
+            output = row["output"]
+            
+            # Evaluate with critic
+            df_critic = batch_evaluate_with_critic(
+                pd.DataFrame([{"output": output, "seed_idx": idx}]),
+                task, example, critic_llm_server, tokenizer, nums=nums, quiet=quiet
+            )
+            critic_correct = df_critic.iloc[0]["critic_correct"] if len(df_critic) > 0 else False
+            critic_feedback = df_critic.iloc[0]["critic_feedback"] if len(df_critic) > 0 else ""
+            
+            # Evaluate with ground truth
+            is_correct, extracted, message = eval_fn(output)
+            token_count = count_tokens(output, tokenizer)
+            
+            sample_results.append(SampleResult(
+                output=output,
+                correct=is_correct,
+                extracted=extracted,
+                message=f"Critic verdict: {'CORRECT' if critic_correct else 'INCORRECT'} | {message}",
+                tokens=token_count,
+                critic_correct=critic_correct,
+                critic_feedback=critic_feedback,
+            ))
+            
+            if critic_correct:
+                break  # samples after this one are dropped from the returned list
+        
+        return sample_results
+    else:
+        # Batch critic evaluation
+        df_critic = batch_evaluate_with_critic(
+            df_samples, task, example, critic_llm_server, tokenizer, nums=nums, quiet=quiet
+        )
+        
+        # Merge critic results: df_samples' index is matched against df_critic's sample_idx column
+        df_samples = df_samples.merge(df_critic, left_index=True, right_on="sample_idx", how="left")
+        
+        # Process all results
+        sample_results = []
+        for idx, row in df_samples.iterrows():
+            output = row["output"]
+            critic_correct = row.get("critic_correct", False)
+            critic_feedback = row.get("critic_feedback", "")
+            
+            is_correct, extracted, message = eval_fn(output)
+            token_count = count_tokens(output, tokenizer)
+            
+            sample_results.append(SampleResult(
+                output=output,
+                correct=is_correct,
+                extracted=extracted,
+                message=f"Critic verdict: {'CORRECT' if critic_correct else 'INCORRECT'} | {message}",
+                tokens=token_count,
+                critic_correct=critic_correct,
+                critic_feedback=critic_feedback,
+            ))
+        
+        return sample_results
+
+
+
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Best-of-K baseline (standard CoT) for TTSwithVerification datasets")
+    parser.add_argument("--task", type=str, required=True, choices=["game24", "maze", "spatialmap", "zebralogic","verina"],
+                        help="Task to run")
+    parser.add_argument("--k", type=int, default=1, help="Number of samples per example")
+    parser.add_argument("--num_examples", "-n", type=int, default=100,
+                        help="Number of examples to run (overrides start/end)")
+    parser.add_argument("--indices", type=str, default=None,
+                        help="Comma-separated indices to run")
+    parser.add_argument("--xrange", type=str, default=None,
+                        help="Range of indices to run (format: 'start-end')")
+    parser.add_argument("--start", type=int, default=None, help="Start index")
+    parser.add_argument("--end", type=int, default=None, help="End index")
+    parser.add_argument("--main_model", type=str, default=MAIN_MODEL, help="Main model to use for generation")
+    parser.add_argument("--port", type=int, default=8000, help="vLLM server port")
+    parser.add_argument("--use_critic", action="store_true", help="Use critic model for evaluation instead of ground truth")
+    parser.add_argument("--critic_model", type=str, default=MAIN_MODEL, help="Critic model to use for evaluation")
+    parser.add_argument("--critic_port", type=int, default=8000, help="vLLM server port for critic model (default: same as main model port)")
+    parser.add_argument("--critic_early_stop", action="store_true", help="Stop sampling after first critic-correct trace")
+    parser.add_argument("--critic_feedback_baseline", action="store_true", help="Use critic feedback as a separate baseline for post-hoc correction")
+    parser.add_argument("--seed", type=int, default=42, help="Base random seed")
+    parser.add_argument("--max_tokens", type=int, default=32768, help="Max tokens for generation")
+    parser.add_argument("--temperature", type=float, default=0.6, help="Sampling temperature")
+    parser.add_argument("--processes", "-p", type=int, default=1, help="Number of examples to process in parallel (default: 1, sequential)")
+    parser.add_argument("--debug", "-d", action="store_true", help="Enable debug logging")
+    args = parser.parse_args()
+
+    log_level = logging.DEBUG if args.debug else logging.ERROR
+    logging.basicConfig(level=log_level, format="%(message)s")
+
+    quiet_mode = not args.debug
+
+    if quiet_mode:
+        with suppress_output():
+            dataset = load_dataset_for_task(args.task)
+    else:
+        dataset = load_dataset_for_task(args.task)
+    indices = resolve_indices(args.task, len(dataset), args)
+
+    llm_server = init_llm_server(
+        args.main_model,
+        max_tokens=args.max_tokens,
+        port=args.port,
+        temperature=args.temperature,
+        seed=args.seed,
+    )
+
+    critic_llm_server = None
+    if args.use_critic:
+        critic_llm_server = init_llm_server(
+            args.critic_model,
+            max_tokens=512,
+            port=args.critic_port,
+            temperature=0.2,
+            seed=args.seed,
+        )
+        # logger.info(f"Using critic model: {args.critic_model} on port {args.critic_port}")
+
+    # logger.info(f"Loading tokenizer for {args.main_model}...")
+    if quiet_mode:
+        with suppress_output():
+            tokenizer = AutoTokenizer.from_pretrained(args.main_model, trust_remote_code=True)
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(args.main_model, trust_remote_code=True)
+    # logger.info("Tokenizer loaded successfully.")
+
+    output_dirs = get_output_dirs(args.task, args.main_model, args.use_critic, args.critic_early_stop)
+
+    total_examples = 0
+    total_correct = 0
+    total_correct_samples = 0
+    total_samples = 0
+    critic_correct_samples = 0
+    critic_total_samples = 0
+    total_tokens = 0
+    total_tokens_all_samples = 0
+    results = []
+
+    def process_example(idx):
+        """Process a single example: generate k samples, evaluate, return result dict."""
+        example = dataset[int(idx)]
+        if args.task == "game24":
+            nums = example["numbers"]
+            prompt = build_full_prompt(args.task, example, nums=nums)
+            eval_fn = lambda output: evaluate_game24_answer(output, nums)
+            options = None
+        
+        elif args.task == "zebralogic":
+            prompt = build_full_prompt(args.task, example)
+            eval_fn = lambda output, ex=example: evaluate_zebralogic_answer(output, ex)
+            options = None
+        elif args.task == "verina":
+            # For verina, example is a BenchmarkData object
+            prompt = build_full_prompt(args.task, example)
+            current_idx = int(idx)
+            current_data = example
+            eval_fn = lambda output, data=current_data, task_idx=current_idx: evaluate_verina_answer(output, data, task_idx)
+            options = None
+        
+        else:
+            prompt = build_full_prompt(args.task, example)
+            gt = str(example.get("ground_truth", "")).strip()
+            if gt == "Q4":
+                target_options = ["A", "B"]
+            else:
+                target_options = ["A", "B", "C", "D"]
+            if args.task == "maze":
+                _, user_prompt = build_maze_prompt(example)
+            else:
+                _, user_prompt = build_spatialmap_prompt(example)
+            options = extract_options_from_prompt(user_prompt, target_options)
+            eval_fn = lambda output: evaluate_mcq_answer(output, options, gt)
+
+        #logger.info(f"---- Example {idx} ----")
+        
+        quiet_mode = not args.debug
+        
+        if args.use_critic:
+            sample_results = run_k_samples_with_critic(
+                prompt, llm_server, critic_llm_server, args.k, args.seed,
+                args.task, example, tokenizer, eval_fn, nums=(nums if args.task == "game24" else None),
+                early_stop=args.critic_early_stop, critic_feedback_baseline=args.critic_feedback_baseline, quiet=quiet_mode
+            )
+        else:
+            outputs = batch_generate_samples(prompt, llm_server, args.k, args.seed, quiet=quiet_mode)
+            sample_results = []
+            for output in outputs:
+                is_correct, extracted, message = eval_fn(output)
+                token_count = count_tokens(output, tokenizer)
+                sample_results.append(SampleResult(
+                    output=output,
+                    correct=is_correct,
+                    extracted=extracted,
+                    message=message,
+                    tokens=token_count,
+                    critic_correct=None,
+                ))
+
+        if args.use_critic:
+            best_idx = next((i for i, r in enumerate(sample_results) if r.critic_correct), 0)
+        else:
+            best_idx = next((i for i, r in enumerate(sample_results) if r.correct), 0)
+        best_result = sample_results[best_idx]
+        any_correct = any(r.correct for r in sample_results)
+        correct_samples = sum(1 for r in sample_results if r.correct)
+        critic_correct_samples_example = sum(1 for r in sample_results if r.critic_correct)
+
+        save_outputs(idx, sample_results, best_idx, output_dirs["reasoning"])
+
+        return {
+            "idx": int(idx),
+            "best_idx": best_idx,
+            "any_correct": any_correct,
+            "best_correct": best_result.correct,
+            "best_critic_correct": best_result.critic_correct,
+            "best_extracted": best_result.extracted,
+            "best_message": best_result.message,
+            "best_critic_feedback": best_result.critic_feedback,
+            "best_tokens": best_result.tokens,
+            "all_tokens": [r.tokens for r in sample_results],
+            "all_correct": [r.correct for r in sample_results],
+            "all_critic_correct": [r.critic_correct for r in sample_results],
+            "all_critic_feedback": [r.critic_feedback for r in sample_results],
+            "options": options,
+            "_any_correct": any_correct,
+            "_correct_samples": correct_samples,
+            "_critic_correct_samples": critic_correct_samples_example,
+            "_n_samples": len(sample_results),
+            "_best_tokens": best_result.tokens,
+            "_all_tokens_sum": sum(r.tokens for r in sample_results),
+        }
+
+    with ThreadPool(processes=args.processes) as pool:
+        for result in tqdm(pool.imap_unordered(process_example, indices), total=len(indices), desc="Processing examples", unit="example", file=_real_stderr):
+            total_examples += 1
+            if result["_any_correct"]:
+                total_correct += 1
+            total_correct_samples += result["_correct_samples"]
+            total_samples += result["_n_samples"]
+            critic_correct_samples += result["_critic_correct_samples"]
+            critic_total_samples += result["_n_samples"]
+            total_tokens += result["_best_tokens"]
+            total_tokens_all_samples += result["_all_tokens_sum"]
+
+            # Remove internal keys before appending
+            for k in list(result.keys()):
+                if k.startswith("_"):
+                    del result[k]
+            results.append(result)
+
+    accuracy = total_correct / total_examples if total_examples else 0
+    avg_best_tokens = total_tokens / total_examples if total_examples else 0
+    avg_all_tokens = total_tokens_all_samples / total_examples if total_examples else 0
+
+    summary = {
+        "task": args.task,
+        "model": args.main_model,
+        "k": args.k,
+        "use_critic": args.use_critic,
+        "total_examples": total_examples,
+        "correct": total_correct,
+        "correct_samples": total_correct_samples,
+        "total_samples": total_samples,
+        "critic_correct_samples": critic_correct_samples,
+        "critic_total_samples": critic_total_samples,
+        "critic_accuracy": (critic_correct_samples / critic_total_samples) if critic_total_samples else 0,
+        "accuracy": accuracy,
+        "avg_best_tokens": avg_best_tokens,
+        "avg_all_tokens": avg_all_tokens,
+        "total_tokens_best": total_tokens,
+        "total_tokens_all_samples": total_tokens_all_samples,
+        "results": results,
+    }
+    
+    if args.use_critic:
+        summary["critic_model"] = args.critic_model
+        summary["critic_port"] = args.critic_port
+        summary["critic_early_stop"] = args.critic_early_stop
+        summary["critic_feedback_baseline"] = args.critic_feedback_baseline
+
+    summary_path = os.path.join(output_dirs["base"], "summary.json")
+    with open(summary_path, "w", encoding="utf-8") as f:
+        json.dump(summary, f, indent=2, cls=NumpyEncoder)
+    # logger.info(f"Saved summary to {summary_path}")
\ No newline at end of file
diff --git a/examples/TTSwithVerification/tot_baseline.py b/examples/TTSwithVerification/tot_baseline.py
new file mode 100644
index 00000000..58b2171e
--- /dev/null
+++ b/examples/TTSwithVerification/tot_baseline.py
@@ -0,0 +1,784 @@
+#!/usr/bin/env python3
+"""Command-line Tree-of-Thought baseline runner for interwhen datasets."""
+ 
+import argparse
+import asyncio
+import json
+import logging
+import os
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+ 
+from tqdm.asyncio import tqdm_asyncio
+ 
+import httpx
+import numpy as np
+from datasets import load_dataset
+ 
+from interwhen.tree_of_thought import (
+    SearchMethod,
+    ToTSearchConfig,
+    TreeOfThoughtSearch,
+    build_tot_problem,
+    # build_verina_synthesis_prompt,
+    # build_verina_spec_synthesis_prompt,
+)
+from interwhen.utils.zebralogic_helper import extract_last_json, zebra_correctness
+# from verina_utils import (
+#     load_verina_dataset,
+#     extract_code_from_response,
+#     evaluate_generated_code,
+# )
+# from verina_spec_utils import (
+#     load_verina_dataset as load_verina_spec_dataset,
+#     extract_spec_from_response,
+#     evaluate_generated_spec,
+# )
+ 
+LOGGER = logging.getLogger("tot_baseline")
+ 
+ 
+# ============== Helper Functions ==============
+ 
+def remove_last_paragraph(s: str) -> str:
+    """Drop the fixed-length trailer from *s*.
+
+    Despite the name, no paragraph detection takes place: the final 143
+    characters are removed. Strings of 143 characters or fewer are returned
+    unchanged.
+    """
+    trailer_len = 143
+    if len(s) <= trailer_len:
+        return s
+    return s[: len(s) - trailer_len]
+ 
+ 
+def build_maze_prompt(example):
+    """Return ``(instruction, description)`` for a maze example.
+
+    The instruction is a fixed multiple-choice preamble. The description is
+    the example's ``prompt`` field, stringified and trimmed by
+    ``remove_last_paragraph``.
+    """
+    instruction_parts = (
+        "You are an expert problem solver. Carefully read the following multiple-choice question ",
+        "and think through the solution step-by-step before providing your final answer. ",
+        "Provide your final answer option by enclosing it within \\boxed{A/B/C/D}.:",
+    )
+    raw_prompt = str(example.get("prompt"))
+    return "".join(instruction_parts), remove_last_paragraph(raw_prompt)
+ 
+ 
+def build_spatialmap_prompt(example):
+    """Return ``(instruction, description)`` for a spatial-map example.
+
+    The instruction is a fixed multiple-choice preamble. The description is
+    the example's ``prompt`` field, stringified and trimmed by
+    ``remove_last_paragraph``.
+    """
+    pre_prompt = (
+        "You are an expert problem solver. Carefully read the following multiple-choice question "
+        # The trailing space keeps this sentence from running into the next
+        # literal ("...final answer.Provide...") when they are concatenated.
+        "and think through the solution step-by-step before providing your final answer. "
+        "Provide your final answer option by enclosing it within \\boxed{A/B/C/D}.:"
+    )
+    description = remove_last_paragraph(str(example.get("prompt")))
+    return pre_prompt, description
+ 
+ 
+def extract_solution_game24(text):
+    r"""Extract a Python-evaluable expression from the last ``\boxed{...}``.
+
+    The content of the final ``\boxed{`` group in *text* is located by brace
+    matching and then normalised:
+
+    * ``\frac{a}{b}`` becomes ``((a)/(b))`` (innermost fractions first),
+    * ``\times`` / ``\cdot`` become ``*`` and ``\div`` becomes ``/``,
+    * the LaTeX spacing commands ``\,`` and ``\ `` are removed,
+    * implicit multiplication such as ``2(3)`` or ``(1)(2)`` gets an
+      explicit ``*``.
+
+    Returns:
+        The normalised expression string, or ``None`` when *text* contains
+        no ``\boxed{``.
+    """
+    openings = list(re.finditer(r"\\boxed\{", text))
+    if not openings:
+        return None
+    start = openings[-1].end()
+
+    # Walk forward until the brace that closes the \boxed{ group.
+    depth = 1
+    end = start
+    while end < len(text) and depth > 0:
+        if text[end] == "{":
+            depth += 1
+        elif text[end] == "}":
+            depth -= 1
+        end += 1
+    # When the group was closed, text[end - 1] is the closing brace and is
+    # excluded. When the output stops before the brace, keep everything so
+    # the last character of the expression is not chopped off.
+    expr = (text[start:end - 1] if depth == 0 else text[start:end]).strip()
+
+    # Numerator and denominator are each parenthesised so that compound
+    # operands keep their grouping: \frac{8}{1-2} must be (8)/(1-2), not 8/1-2.
+    frac_pattern = r"\\frac\{([^{}]+)\}\{([^{}]+)\}"
+    while re.search(frac_pattern, expr):
+        expr = re.sub(frac_pattern, r"((\1)/(\2))", expr)
+
+    for latex, op in ((r"\times", "*"), (r"\cdot", "*"), (r"\div", "/")):
+        expr = expr.replace(latex, op)
+
+    # LaTeX spacing commands are ONE backslash followed by "," or " ".
+    expr = expr.replace(r"\,", "").replace(r"\ ", "")
+
+    # Make implicit multiplication explicit: ")(", ")3" and "3(".
+    expr = re.sub(r"\)\s*\(", ")*(", expr)
+    expr = re.sub(r"\)\s*(\d)", r")*\1", expr)
+    expr = re.sub(r"(\d)\s*\(", r"\1*(", expr)
+
+    return expr
+ 
+ 
+def extract_numbers_from_expr(expr):
+    """Return every numeric literal in *expr*, in order of appearance.
+
+    Integral values (including ones written like ``8.0``) are returned as
+    ``int``; all other values are returned as ``float``.
+    """
+    values = []
+    for token in re.findall(r"\d+\.?\d*", expr):
+        as_float = float(token)
+        values.append(int(as_float) if as_float.is_integer() else as_float)
+    return values
+ 
+ 
+def validate_numbers_used(expr, expected_nums):
+    """Check that the numbers appearing in *expr* are exactly *expected_nums*.
+
+    Both sides are compared as sorted lists, so order is ignored but
+    multiplicity matters.
+    """
+    found = extract_numbers_from_expr(expr)
+    found.sort()
+    return found == sorted(expected_nums)
+ 
+ 
+def evaluate_expression(expr, expected_nums=None):
+    """Return True when *expr* is a legal Game-of-24 expression equal to 24.
+
+    Args:
+        expr: Arithmetic expression string.
+        expected_nums: When given, the numbers in *expr* must match this
+            collection exactly (checked by ``validate_numbers_used``).
+
+    Returns:
+        ``True`` if the expression passes the checks and evaluates to 24
+        within 1e-6; ``False`` otherwise, including on any evaluation error.
+    """
+    try:
+        # Only digits, whitespace, decimal points, parentheses and the four
+        # game operators are accepted. "**" and "//" are built from allowed
+        # characters but are not legal game operations; "**" can also make
+        # evaluation effectively unbounded (e.g. 9**9**9**9).
+        if not re.fullmatch(r"[\d\s.()+\-*/]+", expr):
+            return False
+        if "**" in expr or "//" in expr:
+            return False
+        if expected_nums is not None and not validate_numbers_used(expr, expected_nums):
+            return False
+        # NOTE(security): eval() runs on model-generated text. The whitelist
+        # above is what keeps this restricted to plain arithmetic; do not
+        # relax it without replacing eval() with a real expression parser.
+        value = eval(expr, {"__builtins__": None}, {})
+        return abs(value - 24) < 1e-6
+    except Exception:
+        return False
+ 
+ 
+def evaluate_game24_answer(answer, nums):
+    """Grade a Game-of-24 model answer.
+
+    Returns:
+        ``(is_correct, expression, message)``. ``expression`` is ``None``
+        when nothing could be extracted from *answer*; otherwise it is the
+        extracted expression. ``message`` states whether the numbers used or
+        the value was wrong.
+    """
+    candidate = extract_solution_game24(answer)
+    if not candidate:
+        return False, None, "No expression found"
+
+    if evaluate_expression(candidate, expected_nums=nums):
+        verdict = (True, candidate, "Correct solution (evaluates to 24 using exactly the given numbers)")
+    else:
+        found = extract_numbers_from_expr(candidate)
+        if sorted(found) == sorted(nums):
+            verdict = (False, candidate, "Expression does not evaluate to 24")
+        else:
+            verdict = (False, candidate, f"Incorrect: Expression uses {found}, expected {nums}")
+    return verdict
+ 
+ 
+def extract_solution_mcq(text):
+    """Extract the chosen option letter (A-D) from model output.
+
+    A list of answer patterns is tried in priority order, case-insensitively.
+    For the first pattern whose last match contains a standalone A-D letter,
+    that letter is returned upper-cased. If none succeeds, the last
+    standalone upper-case A-D anywhere in *text* is used. Returns ``None``
+    when no letter is found.
+    """
+    ordered_patterns = (
+        r"\\boxed\{([^}]*)\}",
+        r"boxed\{([^}]*)\}",
+        r"\*\*([A-D])\*\*",
+        r"answer[:\s]*([A-D])",
+        r"(?:^|\n)([A-D])(?:\s|$|\.)",
+    )
+    for regex in ordered_patterns:
+        hits = re.findall(regex, text, re.IGNORECASE)
+        if not hits:
+            continue
+        # Only the last occurrence of a pattern is inspected.
+        letter = re.search(r"\b([ABCD])\b", hits[-1].strip(), flags=re.IGNORECASE)
+        if letter is not None:
+            return letter.group(1).upper()
+
+    # Fallback is case-sensitive: only upper-case standalone letters count.
+    bare_letters = re.findall(r"\b([ABCD])\b", text)
+    return bare_letters[-1].upper() if bare_letters else None
+ 
+ 
+def extract_options_from_prompt(prompt_text, target_options):
+    """Parse ``A. ...`` style options out of *prompt_text*.
+
+    Returns a dict mapping option letter to option text, with surrounding
+    whitespace and trailing periods removed. When *target_options* is
+    non-empty, only those letters are kept.
+    """
+    option_re = r"\b([A-D])\.\s*(.*?)(?=\s*[A-D]\.\s*|$)"
+    parsed = {}
+    for letter, body in re.findall(option_re, prompt_text, flags=re.DOTALL):
+        parsed[letter] = body.strip().rstrip(".")
+    if not target_options:
+        return parsed
+    return {letter: body for letter, body in parsed.items() if letter in target_options}
+ 
+ 
+def evaluate_mcq_answer(answer, options, ground_truth):
+    """Grade a multiple-choice answer against the ground-truth option text.
+
+    The extracted answer is first treated as an option letter and looked up
+    in *options*. Otherwise it is compared, case-insensitively, with the
+    ground truth itself and then with each option's text.
+
+    Returns:
+        ``(is_correct, extracted_answer, message)``; ``extracted_answer`` is
+        ``None`` when nothing could be extracted.
+    """
+    picked = extract_solution_mcq(answer)
+    expected = str(ground_truth).strip()
+    if not picked:
+        return False, None, "No expression found"
+    picked = picked.strip()
+
+    # Case 1: the answer is a known option letter.
+    if picked in options:
+        chosen_text = options[picked]
+        if chosen_text != expected:
+            return False, picked, f"Incorrect: expected '{expected}', got '{chosen_text}' (option {picked})"
+        return True, picked, f"Correct: option {picked} -> {chosen_text}"
+
+    # Case 2: the answer equals the ground-truth text.
+    picked_lower = picked.lower()
+    if picked_lower == expected.lower():
+        return True, picked, f"Correct: answer text matches ground truth: {picked}"
+
+    # Case 3: the answer equals some option's text; the first such option decides.
+    by_text = next(
+        ((letter, value) for letter, value in options.items() if value.lower() == picked_lower),
+        None,
+    )
+    if by_text is None:
+        return False, picked, f"Solution '{picked}' not found in options or ground truth"
+    letter, value = by_text
+    if value == expected:
+        return True, picked, f"Correct: answer text {picked} (option {letter})"
+    return False, picked, f"Incorrect: expected '{expected}', got '{value}' (option {letter})"
+ 
+ 
+def extract_solution_zebralogic(text):
+    """Extract JSON solution from ZebraLogic model output.
+
+    Extraction is attempted in stages, returning the first success:
+
+    1. the last match of each code-block / ``{"House N" ...}`` regex,
+    2. one greedy span from the first JSON-looking ``{`` to the last ``}``,
+    3. every outermost ``{...}`` span, scanned from last to first, accepted
+       only if it has ``House N`` keys (directly or under ``"answer"``).
+
+    Stages 1 and 2 accept any JSON object that parses; only stage 3 insists
+    on house keys.
+
+    Returns:
+        A dict, or ``None`` when *text* is empty or nothing could be parsed.
+    """
+    if not text:
+        return None
+ 
+    def _try_parse(candidate: str):
+        # Returns a dict on success and None for invalid JSON or for JSON
+        # whose top level is not an object.
+        try:
+            parsed = json.loads(candidate)
+            if isinstance(parsed, dict):
+                # Unwrap if it's a wrapper with "answer" key
+                if "answer" in parsed and isinstance(parsed["answer"], dict):
+                    inner = parsed["answer"]
+                    if any(re.match(r"^house\s*\d+$", str(k).strip(), flags=re.IGNORECASE) for k in inner.keys()):
+                        return inner
+                return parsed
+        except json.JSONDecodeError:
+            return None
+        return None
+ 
+    # Try to extract JSON from code blocks
+    patterns = [
+        r"```json\s*({.*?})\s*```",  # Markdown code block
+        r"```\s*({.*?})\s*```",  # Generic code block
+        r"({\s*['\"]House\s*\d+['\"].*?})",  # Direct JSON starting with House
+    ]
+   
+    for pattern in patterns:
+        matches = re.findall(pattern, text, re.DOTALL | re.IGNORECASE)
+        if matches:
+            # Only the last match of each pattern is tried; if it does not
+            # parse, the next pattern is attempted.
+            json_str = matches[-1].strip()
+            solution = _try_parse(json_str)
+            if solution is not None:
+                return solution
+   
+    # Try parsing entire last large JSON-like structure
+    try:
+        # Find potential JSON starting with {
+        # The trailing `[\s\S]*}` is greedy, so the candidate runs to the
+        # LAST closing brace in the text.
+        json_match = re.search(r"({\s*(?:['\"]House|['{\"\[])+[\s\S]*})", text)
+        if json_match:
+            json_str = json_match.group(1)
+            solution = _try_parse(json_str)
+            if solution is not None:
+                return solution
+    except (json.JSONDecodeError, AttributeError):
+        # Defensive only: _try_parse already swallows JSONDecodeError.
+        pass
+ 
+    # Last-chance extraction: parse top-level JSON object spans and keep the
+    # last one that parses and looks like a house assignment dictionary.
+    # `stack` holds the positions of currently open braces; a span is
+    # recorded only when the outermost brace closes. Braces are counted on
+    # raw characters, including any that appear inside JSON strings.
+    stack = []
+    spans = []
+    for idx, ch in enumerate(text):
+        if ch == "{":
+            stack.append(idx)
+        elif ch == "}" and stack:
+            start = stack.pop()
+            if not stack:
+                spans.append((start, idx + 1))
+    for start, end in reversed(spans):
+        candidate = text[start:end]
+        solution = _try_parse(candidate)
+        if solution is not None:
+            # Handle wrapped solution with "answer" key
+            if isinstance(solution, dict) and "answer" in solution:
+                answer = solution["answer"]
+                if isinstance(answer, dict) and any(
+                    re.match(r"^house\s*\d+$", str(key).strip(), flags=re.IGNORECASE)
+                    for key in answer.keys()
+                ):
+                    return answer
+            # Direct house keys
+            if any(
+                re.match(r"^house\s*\d+$", str(key).strip(), flags=re.IGNORECASE)
+                for key in solution.keys()
+            ):
+                return solution
+   
+    return None
+ 
+ 
+async def _request_zebralogic_json(prompt: str, llm_server: Dict[str, Any]) -> str:
+    """POST a strict-JSON chat request for ZebraLogic and return the reply text.
+
+    The request body is a copy of ``llm_server["payload"]`` with the
+    temperature forced to 0.0, a fixed JSON-only system message plus *prompt*
+    as the user message, and ``response_format`` set to ``json_object``.
+    Other payload fields, including ``max_tokens``, are sent unchanged.
+
+    Returns:
+        The stripped ``content`` of the first choice in the response.
+
+    Raises:
+        httpx.HTTPStatusError: via ``raise_for_status`` on an error response.
+    """
+    system_message = {
+        "role": "system",
+        "content": (
+            "You solve Zebra Logic puzzles and MUST return strictly valid JSON only. "
+            "No markdown fences. No explanation. No extra text."
+        ),
+    }
+    user_message = {"role": "user", "content": prompt}
+    # Build a fresh dict so the shared base payload is never mutated.
+    request_body = {
+        **llm_server["payload"],
+        "temperature": 0.0,
+        "messages": [system_message, user_message],
+        "response_format": {"type": "json_object"},
+    }
+    async with httpx.AsyncClient(timeout=120.0) as client:
+        reply = await client.post(
+            llm_server["url"],
+            headers=llm_server["headers"],
+            json=request_body,
+        )
+        reply.raise_for_status()
+        first_choice = reply.json()["choices"][0]
+        return first_choice["message"]["content"].strip()
+ 
+ 
+async def finalize_zebralogic_json(problem: str, trajectory: str, llm_server: Dict[str, Any]) -> str:
+    """Ask the model to turn an existing reasoning trajectory into final JSON.
+
+    The prompt contains conversion instructions, the puzzle text and the
+    trajectory; the raw model reply is returned unparsed.
+    """
+    instructions = (
+        "Convert the reasoning into the final Zebra Logic answer JSON.\n"
+        "Output ONLY valid JSON (no markdown, no explanation).\n"
+        "Use exact feature/value names from the puzzle.\n\n"
+    )
+    puzzle_section = "PUZZLE:\n" f"{problem}\n\n"
+    reasoning_section = "REASONING:\n" f"{trajectory}\n"
+    full_prompt = instructions + puzzle_section + reasoning_section
+    return await _request_zebralogic_json(full_prompt, llm_server)
+ 
+ 
+async def solve_zebralogic_json_direct(problem: str, llm_server: Dict[str, Any]) -> str:
+    """Ask the model to solve the puzzle from scratch and reply with JSON only.
+
+    No prior reasoning is supplied; the raw model reply is returned unparsed.
+    """
+    instructions = (
+        "Solve the Zebra Logic puzzle and provide the final house assignments.\n"
+        "Output ONLY valid JSON with keys like 'House 1', 'House 2', etc.\n"
+        "Use exact feature/value names from the puzzle text.\n\n"
+    )
+    puzzle_section = "PUZZLE:\n" f"{problem}\n"
+    return await _request_zebralogic_json(instructions + puzzle_section, llm_server)
+ 
+ 
+def _raw_example_to_zebra_problem(example):
+    """Reshape a raw ZebraLogic example into the dict ``zebra_correctness`` is given.
+
+    The first column of ``solution.header`` / ``solution.rows`` is skipped;
+    the remaining header names and cell values are lower-cased. ``size`` is
+    parsed as ``"<houses>*<features>"``.
+
+    Returns:
+        A dict with ``solution`` (``"House N"`` -> feature dict, N counted
+        from 1 in row order), ``n_houses``, ``n_features`` and ``features``
+        (feature name -> sorted list of the values seen for it).
+
+    Raises:
+        ValueError: if ``size`` is missing or not two ``*``-separated ints.
+    """
+    truth = example.get("solution", {})
+    column_names = truth.get("header", [])
+    table_rows = truth.get("rows", [])
+    n_houses, n_features = map(int, example.get("size", "").split("*"))
+
+    assignments = {}
+    value_sets = {}
+    for house_no, row in enumerate(table_rows, start=1):
+        attrs = {}
+        for col, cell in zip(column_names[1:], row[1:]):
+            key, val = col.lower(), cell.lower()
+            attrs[key] = val
+            if key not in value_sets:
+                value_sets[key] = set()
+            value_sets[key].add(val)
+        assignments[f"House {house_no}"] = attrs
+
+    return {
+        "solution": assignments,
+        "n_houses": n_houses,
+        "n_features": n_features,
+        "features": {key: sorted(vals) for key, vals in value_sets.items()},
+    }
+ 
+ 
+def evaluate_zebralogic_answer(answer, example):
+    """Grade a ZebraLogic answer against the example's ground truth.
+
+    The candidate comes from ``extract_last_json``, falling back to
+    ``extract_solution_zebralogic``. House keys are rewritten to
+    ``"House N"`` and attribute names / string values are lower-cased before
+    the candidate is passed to ``zebra_correctness``.
+
+    Returns:
+        ``(is_correct, normalised_candidate, message)``; the candidate is
+        ``None`` when no JSON could be extracted. ``is_correct`` is true
+        only when the reported correct count equals the total.
+    """
+    candidate = extract_last_json(answer)
+    if candidate is None:
+        # Fallback: try the older extraction for non-standard formats
+        candidate = extract_solution_zebralogic(answer)
+        if candidate is None:
+            return False, None, "Could not extract valid JSON solution"
+
+    def _canonical_house_key(raw_key):
+        found = re.search(r"House\s*(\d+)", raw_key, re.IGNORECASE)
+        return f"House {found.group(1)}" if found else raw_key
+
+    def _lowercased(attrs):
+        # Non-dict house entries are passed through untouched.
+        if not isinstance(attrs, dict):
+            return attrs
+        return {k.lower(): v.lower() if isinstance(v, str) else v
+                for k, v in attrs.items()}
+
+    normed = {}
+    for raw_key, attrs in candidate.items():
+        canonical = _canonical_house_key(raw_key)
+        normed[canonical] = _lowercased(attrs)
+
+    reference = _raw_example_to_zebra_problem(example)
+    correct, skipped, missing, total = zebra_correctness(reference, normed)
+    summary = f"Correct={correct}/{total}, skipped={skipped}, missing={missing}"
+    return correct == total, normed, summary
+ 
+ 
+def load_dataset_for_task(task):
+    """Load the dataset split that backs *task* via ``load_dataset``.
+
+    Supported tasks are ``game24``, ``maze``, ``spatialmap`` and
+    ``zebralogic``. The verina / verina_spec loaders are currently disabled.
+
+    Raises:
+        ValueError: for any other task name.
+    """
+    # (task, positional args, keyword args) for load_dataset
+    sources = (
+        ("game24", ("nlile/24-game",), {"split": "train"}),
+        ("maze", ("microsoft/VISION_LANGUAGE", "maze_text_only"), {"split": "val"}),
+        ("spatialmap", ("microsoft/VISION_LANGUAGE", "spatial_map_text_only"), {"split": "val"}),
+        ("zebralogic", ("WildEval/ZebraLogic",), {"name": "grid_mode", "split": "test"}),
+    )
+    for known_task, positional, keyword in sources:
+        if task == known_task:
+            return load_dataset(*positional, **keyword)
+    raise ValueError(f"Unsupported task: {task}")
+ 
+ 
+def resolve_indices(task, dataset_len, args):
+    """Choose which dataset indices to run from the CLI selection flags.
+
+    Precedence: ``--indices`` (comma-separated list), then ``--xrange``
+    (``start-end``, end exclusive), then ``--num_examples`` (that many
+    indices evenly spaced over the dataset), then ``--start`` / ``--end``
+    (defaulting to the whole dataset).
+
+    Args:
+        task: Task name; accepted for interface compatibility, not used.
+        dataset_len: Number of examples in the dataset.
+        args: Parsed CLI namespace providing the flags above.
+
+    Returns:
+        A list of plain Python ``int`` indices.
+
+    Raises:
+        ValueError: if ``--indices`` holds a non-integer, or ``--xrange`` is
+            not exactly two integers separated by a single ``-``.
+    """
+    if args.indices:
+        return [int(x.strip()) for x in args.indices.split(",")]
+    if args.xrange:
+        # A malformed range is always an error. It used to fall through
+        # silently to the other selection modes when the number of "-"
+        # separated parts was not two.
+        try:
+            start_text, end_text = args.xrange.split("-")
+            first, last = int(start_text.strip()), int(end_text.strip())
+        except ValueError:
+            raise ValueError(f"Invalid xrange format: {args.xrange}. Use 'start-end'") from None
+        return list(range(first, last))
+    if args.num_examples:
+        # tolist() yields Python ints rather than numpy scalars, so the
+        # indices serialise with the standard json module.
+        return np.linspace(0, dataset_len - 1, args.num_examples, dtype=int).tolist()
+    start = args.start if args.start is not None else 0
+    end = args.end if args.end is not None else dataset_len
+    return list(range(start, end))
+ 
+ 
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options of the Tree-of-Thought baseline runner.
+
+    Options cover example selection (``--num_examples``, ``--indices``,
+    ``--xrange``, ``--start``/``--end``), the model endpoint and sampling
+    parameters, the ToT search configuration, concurrency, and output /
+    logging locations. Only ``--task`` is required.
+    """
+    parser = argparse.ArgumentParser(
+        description="Run Tree-of-Thought search on a subset of the supported tasks",
+    )
+    parser.add_argument("--task", choices=["game24", "maze", "spatialmap", "zebralogic", "verina", "verina_spec"], required=True)
+    parser.add_argument("--k", type=int, default=1, help="Unused placeholder to mirror other baselines")
+    parser.add_argument("--num_examples", "-n", type=int, default=None)
+    parser.add_argument("--indices", type=str, default=None)
+    parser.add_argument("--xrange", type=str, default=None)
+    parser.add_argument("--start", type=int, default=None)
+    parser.add_argument("--end", type=int, default=None)
+    parser.add_argument("--model", default="Qwen/QwQ-32B")
+    # "{port}" is filled in per server by build_llm_server.
+    parser.add_argument("--llm_url", default="http://localhost:{port}/v1/chat/completions")
+    parser.add_argument(
+        "--ports",
+        default="8000,8001,8002,8003",
+        help="Comma-separated list of vLLM ports to round-robin across",
+    )
+    parser.add_argument("--temperature", type=float, default=0.6)
+    parser.add_argument("--top_p", type=float, default=0.95)
+    parser.add_argument("--top_k", type=int, default=20)
+    parser.add_argument("--max_tokens", type=int, default=8192)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--search_method", choices=["bfs", "dfs", "beam"], default="beam")
+    parser.add_argument("--branching_factor", type=int, default=4)
+    parser.add_argument("--max_depth", type=int, default=1)
+    parser.add_argument("--beam_width", type=int, default=2)
+    parser.add_argument("--sure_threshold", type=float, default=0.9)
+    parser.add_argument("--likely_threshold", type=float, default=0.5)
+    parser.add_argument("--impossible_threshold", type=float, default=0.2)
+    parser.add_argument("--max_candidates_per_level", type=int, default=3)
+    parser.add_argument("--early_termination", action="store_true")
+    parser.add_argument("--no_cache", action="store_true")
+    parser.add_argument(
+        "--concurrency",
+        type=int,
+        default=1,
+        help="Maximum number of ToT examples to run concurrently",
+    )
+    parser.add_argument(
+        "--output_dir",
+        # Relative default so the script runs on any machine; the previous
+        # default was an absolute path inside one developer's workspace.
+        default="outputs/tot_baseline",
+        help="Directory to store per-example JSON logs and summary",
+    )
+    parser.add_argument("--log_level", default="INFO")
+    parser.add_argument("--summary_file", default="summary.json")
+    parser.add_argument("--log_file", default="tot_baseline.log")
+    return parser.parse_args()
+ 
+ 
+def parse_port_list(port_str: str) -> List[int]:
+    """Turn a comma-separated port string into a list of ints.
+
+    Entries are stripped of surrounding whitespace and empty entries are
+    ignored. A non-numeric entry raises ``ValueError`` from ``int``.
+    """
+    ports: List[int] = []
+    for piece in port_str.split(","):
+        piece = piece.strip()
+        if piece:
+            ports.append(int(piece))
+    return ports
+ 
+ 
+def build_llm_server(args: argparse.Namespace, port: int) -> Dict[str, Any]:
+    """Assemble the request description for one LLM server port.
+
+    Returns:
+        A dict with ``url`` (``args.llm_url`` with ``{port}`` filled in),
+        JSON ``headers``, and the base ``payload`` holding the model name
+        and sampling parameters with streaming disabled.
+    """
+    sampling = dict(
+        model=args.model,
+        temperature=args.temperature,
+        top_p=args.top_p,
+        top_k=args.top_k,
+        max_tokens=args.max_tokens,
+        stream=False,
+        seed=args.seed,
+    )
+    endpoint = args.llm_url.format(port=port)
+    return {
+        "url": endpoint,
+        "headers": {"content-type": "application/json"},
+        "payload": sampling,
+    }
+ 
+ 
+def build_tot_config(args: argparse.Namespace) -> ToTSearchConfig:
+    """Translate parsed CLI options into a ``ToTSearchConfig``.
+
+    ``--search_method`` is upper-cased and looked up by name in
+    ``SearchMethod``; ``--no_cache`` is inverted into ``cache_evaluations``.
+    """
+    chosen_method = SearchMethod[args.search_method.upper()]
+    settings = {
+        "branching_factor": args.branching_factor,
+        "max_depth": args.max_depth,
+        "search_method": chosen_method,
+        "beam_width": args.beam_width,
+        "sure_threshold": args.sure_threshold,
+        "likely_threshold": args.likely_threshold,
+        "impossible_threshold": args.impossible_threshold,
+        "early_termination": args.early_termination,
+        "cache_evaluations": not args.no_cache,
+        "max_candidates_per_level": args.max_candidates_per_level,
+    }
+    return ToTSearchConfig(**settings)
+ 
+ 
+def ensure_output_dir(base_dir: str, task: str) -> Path:
+    """Create (if needed) and return the per-task output directory.
+
+    *base_dir* is user-expanded and resolved to an absolute path, and *task*
+    is appended as a sub-directory. Missing parents are created and an
+    existing directory is not an error.
+    """
+    root = Path(base_dir).expanduser().resolve()
+    task_dir = root.joinpath(task)
+    task_dir.mkdir(parents=True, exist_ok=True)
+    return task_dir
+ 
+ 
+def evaluate_verina_answer(output, example, idx):
+    """Evaluate Verina code generation output for ToT.
+
+    Returns ``(is_correct, generated_code, message)``; success requires that
+    the code compiles and all tests pass.
+
+    NOTE(review): ``extract_code_from_response`` and
+    ``evaluate_generated_code`` belong to the ``verina_utils`` import that is
+    commented out near the top of this file, so calling this function
+    presumably raises ``NameError`` as things stand. Confirm before
+    re-enabling the verina task.
+    """
+    code = extract_code_from_response(output)
+    if not code.strip():
+        return False, "", "No code extracted from response"
+
+    compiles, all_tests_pass, compile_output, test_results = evaluate_generated_code(
+        example, code, idx,
+    )
+    total = len(example.tests) if example.tests else 0
+    passed = sum(1 for outcome in test_results.values() if outcome == "pass")
+
+    if not compiles:
+        # Only the first 300 characters of the compiler output are reported.
+        preview = compile_output[:300] if compile_output else "Unknown error"
+        return False, code, f"Compilation failed: {preview}"
+    if all_tests_pass:
+        return True, code, f"Code compiles and all {total} tests pass"
+    return False, code, f"Compilation succeeded but {total - passed}/{total} tests failed"
+ 
+ 
+def evaluate_verina_spec_answer(output, example, idx):
+    """Evaluate Verina spec generation output for ToT.
+
+    Returns ``(is_correct, generated_spec, message)``. When the spec compiles
+    but is not fully correct, the message lists pass/total counts for each
+    soundness / completeness category that has at least one test.
+
+    NOTE(review): ``extract_spec_from_response`` and
+    ``evaluate_generated_spec`` belong to the ``verina_spec_utils`` import
+    that is commented out near the top of this file, so calling this function
+    presumably raises ``NameError`` as things stand. Confirm before
+    re-enabling the verina_spec task.
+    """
+    spec = extract_spec_from_response(output)
+    if not spec.get("precond") and not spec.get("postcond"):
+        return False, spec, "No spec extracted from response"
+
+    outcome = evaluate_generated_spec(example, spec, idx)
+    if outcome["full_spec_correct"]:
+        return True, spec, "Spec compiles and all soundness/completeness tests pass"
+    if not outcome["compiles"]:
+        preview = outcome.get("compile_error", "")[:300]
+        return False, spec, f"Compilation failed: {preview}"
+
+    breakdown = []
+    for category in ("precond_sound", "precond_complete", "postcond_sound", "postcond_complete"):
+        if outcome[f"{category}_total"] > 0:
+            breakdown.append(f"{category}={outcome[f'{category}_pass']}/{outcome[f'{category}_total']}")
+    return False, spec, f"Compilation succeeded but tests: {', '.join(breakdown)}"
+ 
+ 
+def prepare_eval(task: str, example: Dict[str, Any], idx: int = 0) -> Tuple:
+    """Build the grading function and metadata for one example.
+
+    Args:
+        task: One of ``game24``, ``zebralogic``, ``maze`` or ``spatialmap``.
+        example: The dataset example to grade against.
+        idx: Dataset index; accepted for interface compatibility, not used
+            by the currently enabled tasks.
+
+    Returns:
+        ``(eval_fn, meta)``. ``eval_fn(output)`` returns the task grader's
+        ``(is_correct, extracted, message)`` tuple; ``meta`` holds the
+        ground-truth details for that task.
+
+    Raises:
+        ValueError: for any other task. The verina / verina_spec graders are
+            disabled, and such tasks used to be graded silently as
+            spatialmap multiple-choice questions.
+    """
+    if task == "game24":
+        nums = list(example.get("numbers", []))
+        return (lambda output: evaluate_game24_answer(output, nums), {"numbers": nums})
+    if task == "zebralogic":
+        # Pass raw example so zebra_correctness can evaluate against processed solution
+        gt_problem = _raw_example_to_zebra_problem(example)
+        meta = {
+            "ground_truth_sample": str(example.get("solution", {}))[:100],
+            "ground_truth_solution": gt_problem["solution"],
+        }
+        return (lambda output: evaluate_zebralogic_answer(output, example), meta)
+    if task not in ("maze", "spatialmap"):
+        # Same message as load_dataset_for_task for consistency.
+        raise ValueError(f"Unsupported task: {task}")
+
+    gt = str(example.get("ground_truth", "")).strip()
+    # A ground truth of "Q4" restricts the candidate options to A and B.
+    target_options = ["A", "B"] if gt == "Q4" else ["A", "B", "C", "D"]
+    if task == "maze":
+        _, user_prompt = build_maze_prompt(example)
+    else:
+        _, user_prompt = build_spatialmap_prompt(example)
+    options = extract_options_from_prompt(user_prompt, target_options)
+    meta = {"options": options, "ground_truth": gt}
+    return (lambda output: evaluate_mcq_answer(output, options, gt), meta)
+ 
+ 
+async def run_single_example(
+    idx: int,
+    task: str,
+    example: Dict[str, Any],
+    tot_config: ToTSearchConfig,
+    llm_server: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Run Tree-of-Thought search on one example and score its best trajectory.
+
+    Args:
+        idx: Index of the example; passed to ``prepare_eval`` and used in the
+            returned record and in log messages.
+        task: Task name. ``"zebralogic"`` additionally enables the JSON
+            recovery passes described below.
+        example: The dataset example. If it exposes ``.get``, its
+            ``"numbers"`` entry is forwarded to ``build_tot_problem``.
+        tot_config: Configuration used to construct ``TreeOfThoughtSearch``.
+        llm_server: LLM server description forwarded to the search and to the
+            ZebraLogic recovery helpers.
+
+    Returns:
+        A record with the search outcome (``best_value``, ``best_trajectory``,
+        ``raw_best_trajectory``, ``search_stats``, ``decision_tree``), the
+        evaluation outcome (``correct``, ``extracted``, ``message``,
+        ``evaluation_meta``) and the optional recovery/synthesis outputs
+        (``finalized_answer``, ``direct_answer``, ``synthesized_code``,
+        ``synthesized_spec``).
+
+    Exceptions raised by the search or by the evaluation propagate to the
+    caller. Failures inside the ZebraLogic recovery passes are logged as
+    warnings and otherwise ignored.
+    """
+    eval_fn, eval_meta = prepare_eval(task, example, idx)
+    nums = example.get("numbers") if hasattr(example, "get") else None
+    problem = build_tot_problem(task, example, nums=nums)
+    tot = TreeOfThoughtSearch(tot_config)
+    search_result = await tot.search(task, problem, llm_server)
+    raw_best_traj = search_result.get("best_trajectory", "")
+    best_traj = raw_best_traj
+    best_value = search_result.get("best_value", 0.0)
+
+    # No synthesis LLM call is made here, so these stay None; the keys are
+    # kept in the record so its schema is the same for every task.
+    synthesized_code = None
+    synthesized_spec = None
+
+    # Every task (verina included) is scored on the trajectory itself, so
+    # is_correct / extracted / message are always bound before the record
+    # is built below.
+    is_correct, extracted, message = eval_fn(best_traj)
+
+    # ZebraLogic often ends with partial reasoning trajectories; add strict-JSON
+    # recovery passes before scoring.
+    finalized_answer = None
+    direct_answer = None
+    if task == "zebralogic" and (not is_correct):
+        try:
+            finalized_answer = await finalize_zebralogic_json(problem, best_traj, llm_server)
+            final_is_correct, final_extracted, final_message = eval_fn(finalized_answer)
+            # Adopt the finalized answer only if it produced something usable.
+            if final_extracted is not None or final_is_correct:
+                is_correct = final_is_correct
+                extracted = final_extracted
+                message = final_message
+                best_traj = finalized_answer
+        except Exception as exc:  # pragma: no cover
+            LOGGER.warning("ZebraLogic finalization failed for index %s: %s", idx, exc)
+
+    # Second pass: only reached when the (possibly finalized) answer is still
+    # not correct.
+    if task == "zebralogic" and (not is_correct):
+        try:
+            direct_answer = await solve_zebralogic_json_direct(problem, llm_server)
+            direct_is_correct, direct_extracted, direct_message = eval_fn(direct_answer)
+            if direct_extracted is not None or direct_is_correct:
+                is_correct = direct_is_correct
+                extracted = direct_extracted
+                message = direct_message
+                best_traj = direct_answer
+        except Exception as exc:  # pragma: no cover
+            LOGGER.warning("ZebraLogic direct solve failed for index %s: %s", idx, exc)
+
+    return {
+        "index": int(idx),
+        "best_value": best_value,
+        "best_trajectory": best_traj,
+        "raw_best_trajectory": raw_best_traj,
+        "synthesized_code": synthesized_code,
+        "synthesized_spec": synthesized_spec,
+        "finalized_answer": finalized_answer,
+        "direct_answer": direct_answer,
+        "search_stats": search_result.get("search_stats", {}),
+        "decision_tree": search_result.get("decision_tree", []),
+        "correct": bool(is_correct),
+        "extracted": extracted,
+        "message": message,
+        "evaluation_meta": eval_meta,
+    }
+
+
+ 
+ 
+async def run_tot_baseline(args: argparse.Namespace) -> None:
+    """Run the ToT baseline over the selected examples and write JSON results.
+
+    Logging is redirected to a file under ``args.output_dir`` and plain stream
+    handlers are detached from the root logger. Each example is processed
+    under a concurrency limit, with ports handed out round-robin, and its
+    record is written to ``example_<idx>.json``. A summary with accuracy and
+    the search configuration is written to ``args.summary_file``.
+
+    Raises:
+        ValueError: If ``args.ports`` yields no ports.
+    """
+    level = getattr(logging, args.log_level.upper(), logging.INFO)
+    root = logging.getLogger()
+    root.setLevel(level)
+
+    # Send all logs to a file instead of stdout/stderr (keeps tqdm clean)
+    log_path = Path(args.output_dir) / args.log_file
+    log_path.parent.mkdir(parents=True, exist_ok=True)
+    file_handler = logging.FileHandler(str(log_path), mode="a")
+    file_handler.setLevel(level)
+    file_handler.setFormatter(
+        logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s")
+    )
+    root.addHandler(file_handler)
+
+    # Remove default stderr handler so logs don't clobber the progress bar.
+    # FileHandler subclasses StreamHandler, hence the explicit exclusion.
+    console_handlers = [
+        handler
+        for handler in root.handlers
+        if isinstance(handler, logging.StreamHandler)
+        and not isinstance(handler, logging.FileHandler)
+    ]
+    for handler in console_handlers:
+        root.removeHandler(handler)
+
+    dataset = load_dataset_for_task(args.task)
+    indices = resolve_indices(args.task, len(dataset), args)
+    output_dir = ensure_output_dir(args.output_dir, args.task)
+    tot_config = build_tot_config(args)
+    ports = parse_port_list(args.ports)
+    if not ports:
+        raise ValueError("At least one port must be specified via --ports")
+    concurrency = max(1, args.concurrency)
+
+    port_lock = asyncio.Lock()
+    port_cursor = 0
+
+    async def next_port() -> int:
+        """Return the next port in round-robin order."""
+        nonlocal port_cursor
+        async with port_lock:
+            chosen = ports[port_cursor % len(ports)]
+            port_cursor += 1
+            return chosen
+
+    semaphore = asyncio.Semaphore(concurrency)
+
+    async def process_index(idx: int) -> Dict[str, Any]:
+        """Run one example, persist its record to disk and return it."""
+        async with semaphore:
+            example = dataset[int(idx)]
+            port = await next_port()
+            llm_server = build_llm_server(args, port)
+            LOGGER.info("Running ToT on example %s via port %s", idx, port)
+            try:
+                record = await run_single_example(idx, args.task, example, tot_config, llm_server)
+            except Exception as exc:  # pragma: no cover
+                LOGGER.exception("Failed example %s", idx)
+                # Keep a minimal record so the failure still counts in the summary.
+                record = {
+                    "index": int(idx),
+                    "error": str(exc),
+                    "best_trajectory": "",
+                    "correct": False,
+                }
+            with (output_dir / f"example_{idx}.json").open("w", encoding="utf-8") as out:
+                json.dump(record, out, indent=2)
+            return record
+
+    tasks = [process_index(idx) for idx in indices]
+    processed = await tqdm_asyncio.gather(*tasks, desc="ToT examples")
+
+    total = len(processed)
+    correct = len([rec for rec in processed if rec.get("correct")])
+    accuracy = (correct / total) if total else 0.0
+    search_config = {
+        "branching_factor": args.branching_factor,
+        "max_depth": args.max_depth,
+        "beam_width": args.beam_width,
+        "sure_threshold": args.sure_threshold,
+        "likely_threshold": args.likely_threshold,
+        "impossible_threshold": args.impossible_threshold,
+        "max_candidates_per_level": args.max_candidates_per_level,
+        "early_termination": args.early_termination,
+        "cache_evaluations": not args.no_cache,
+        "ports": ports,
+        "concurrency": concurrency,
+    }
+    summary = {
+        "task": args.task,
+        "model": args.model,
+        "total_examples": total,
+        "correct": correct,
+        "accuracy": accuracy,
+        "search_method": args.search_method,
+        "config": search_config,
+    }
+    with (output_dir / args.summary_file).open("w", encoding="utf-8") as out:
+        json.dump(summary, out, indent=2)
+    LOGGER.info("Accuracy %.2f (%d/%d)", summary["accuracy"], correct, total)
+
+
+ 
+ 
+if __name__ == "__main__":
+    import time
+
+    # perf_counter() is monotonic, so the reported duration cannot be skewed
+    # by system clock adjustments that happen during a long run.
+    start = time.perf_counter()
+    asyncio.run(run_tot_baseline(parse_args()))
+    elapsed = time.perf_counter() - start
+    print(f"Total execution time: {elapsed:.2f} seconds")
\ No newline at end of file