From dc91d81a690dffed34efe8e241f0bf759f761485 Mon Sep 17 00:00:00 2001
From: Advay Balakrishnan <advaybalak@gmail.com>
Date: Sat, 13 Sep 2025 21:04:53 -0400
Subject: [PATCH 01/12] check selector if valid, skip test if necessary

---
 core/execute.py | 27 ++++++++++++++-------------
 core/planner.py | 21 ++++++++++++---------
 main.py         | 15 ++++++++++++++-
 3 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/core/execute.py b/core/execute.py
index 0bcf6d4..3f7f4c5 100644
--- a/core/execute.py
+++ b/core/execute.py
@@ -1,6 +1,5 @@
 from collections import OrderedDict
 import logging
-import shutil
 from playwright.sync_api import sync_playwright, BrowserContext
 import os
 from urllib.parse import urljoin
@@ -18,10 +17,6 @@ def run_ui_tests(sync_context: BrowserContext, base_url: str, unit_tests: Ordere
     
     github_mode = os.environ.get('GITHUB_ACTION_MODE') == 'true'
     
-    if os.path.exists("results"):
-        shutil.rmtree("results")  # remove old directory and contents
-    os.makedirs("results", exist_ok=True)
-    
     test_results = {
         "tests": []
     }
@@ -34,6 +29,19 @@ def run_ui_tests(sync_context: BrowserContext, base_url: str, unit_tests: Ordere
         page_url = urljoin(base_url, page_name)
 
         for test in test_suite["tests"]:
+            test_result = {
+                "description": test['description'],
+                "status": "passed",
+                "error": None,
+                "screenshot": None
+            }
+
+            if test.get('disabled', False):
+                print(f"⚠️ Skipping disabled test: {test['description']}")
+                test_result["status"] = "skipped"
+                test_results["tests"].append(test_result)
+                continue
+            
             print(f"\n▶ Running: {test['description']}")
             page = sync_context.new_page()
             response = page.goto(page_url, wait_until="load")
@@ -62,14 +70,7 @@ def run_ui_tests(sync_context: BrowserContext, base_url: str, unit_tests: Ordere
                     })
                     continue 
 
-            test_result = {
-                "description": test['description'],
-                "status": "passed",
-                "error": None,
-                "screenshot": None
-            }
-
-            # attemptp to execute the test steps
+            # attempt to execute the test steps
             try:
                 validated_test = PlaywrightTest(**test)
                 print(f"✓ Test validation passed: {len(validated_test.steps)} steps")
diff --git a/core/planner.py b/core/planner.py
index d7aada4..001d14d 100644
--- a/core/planner.py
+++ b/core/planner.py
@@ -12,17 +12,9 @@
 import logging
 
 
-# Configure logging
-logging.basicConfig(
-    filename="app.log",           # Log file name
-    level=logging.INFO,           # Minimum log level
-    format="%(asctime)s - %(levelname)s - %(message)s",  # Log format
-    filemode='w'
-)
-
-# for testing specific actions via Pydantic
 
 
+# for testing specific actions via Pydantic
 class ActionType(str, Enum):
     """Valid Playwright actions"""
     CLICK = "click"
@@ -317,6 +309,17 @@ def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None)
         if 'tests' in result and len(result['tests']) > max_tests:
             result['tests'] = result['tests'][:max_tests]
 
+        # We check if the selector exists on the page - if not, we disable the test
+        for test in result.get('tests', []):
+            for step in test.get('steps', []):
+                actionType = step['action']  # convert enum back to string
+                selector = step.get('selector', '')
+                try:
+                    auth_page.wait_for_selector(selector, timeout=2000)
+                except Exception as e:
+                    logging.warning(f"Selector '{selector}' not found on page '{page_node.name}': {e}")
+                    test['disabled'] = True  # mark test as disabled if selector not found
+
         unit_tests[page_node.name] = result
 
 
diff --git a/main.py b/main.py
index 8de5352..32fc0ce 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,8 @@
 import argparse
 import json
+import logging
 import os
+import shutil
 from dotenv import load_dotenv
 import yaml
 from pathlib import Path
@@ -22,7 +24,6 @@ def create_github_summary(results):
         "total_tests": len(results.get("tests", [])),
         "tests_passed": sum(1 for test in results.get("tests", []) if test.get("status") == "passed"),
         "tests_failed": sum(1 for test in results.get("tests", []) if test.get("status") == "failed"),
-        "ui_coverage": 85,  # Placeholder - could be calculated based on elements tested
         "failed_tests": [
             {
                 "name": test.get("description", "Unknown test"),
@@ -74,6 +75,18 @@ def main():
         'headless': args.headless.lower() == 'true'
     }
 
+    if os.path.exists("results"):
+        shutil.rmtree("results")  # remove old directory and contents
+    os.makedirs("results", exist_ok=True)
+    
+    # Configure logging
+    logging.basicConfig(
+        filename="./results/app.log",           # Log file name
+        level=logging.INFO,           # Minimum log level
+        format="%(asctime)s - %(levelname)s - %(message)s",  # Log format
+        filemode='w'
+    )
+
     with sync_playwright() as p:
         headless = test_config['headless']
         if test_config['browser'].lower() == 'firefox':

From 883de8f74560b61ee225a04beca89da0784e43df Mon Sep 17 00:00:00 2001
From: Advay Balakrishnan <advaybalak@gmail.com>
Date: Fri, 19 Sep 2025 17:07:14 -0400
Subject: [PATCH 02/12] implement feedback mechanism for bad tests

---
 core/execute.py       |  9 ++--
 core/llm_constants.py |  8 ++--
 core/planner.py       | 95 ++++++++++++++++++++++++++++++++-----------
 tests/test_config.yml |  2 +-
 4 files changed, 79 insertions(+), 35 deletions(-)

diff --git a/core/execute.py b/core/execute.py
index 3f7f4c5..388af12 100644
--- a/core/execute.py
+++ b/core/execute.py
@@ -1,5 +1,6 @@
 from collections import OrderedDict
 import logging
+import re
 from playwright.sync_api import sync_playwright, BrowserContext
 import os
 from urllib.parse import urljoin
@@ -37,7 +38,7 @@ def run_ui_tests(sync_context: BrowserContext, base_url: str, unit_tests: Ordere
             }
 
             if test.get('disabled', False):
-                print(f"⚠️ Skipping disabled test: {test['description']}")
+                print(f"Skipping test, disabled due to bad selectors: {test['description']}")
                 test_result["status"] = "skipped"
                 test_results["tests"].append(test_result)
                 continue
@@ -87,10 +88,6 @@ def run_ui_tests(sync_context: BrowserContext, base_url: str, unit_tests: Ordere
                         page.click(selector, timeout=timeout)
                     elif action == ActionType.TYPE:
                         page.fill(selector, value, timeout=timeout)
-                    elif action == ActionType.ASSERT_VISIBLE:
-                        page.wait_for_selector(selector, state="visible", timeout=timeout)
-                    elif action == ActionType.ASSERT_URL:
-                        page.wait_for_url(value, timeout=timeout)
                     elif action == ActionType.NAVIGATE:
                         if value.startswith(('http://', 'https://')):
                             # Absolute URL
@@ -109,7 +106,7 @@ def run_ui_tests(sync_context: BrowserContext, base_url: str, unit_tests: Ordere
                         test_result["error"] = f"URL mismatch: expected {test['expect']['url']}, got {page.url}"
                 
                 elif "selectorVisible" in test["expect"]:
-                    if not page.locator(test["expect"]["selectorVisible"]).is_visible():
+                    if not page.locator(test["expect"]["selectorVisible"]).is_visible(timeout=timeout):
                         test_result["status"] = "failed"
                         test_result["error"] = f"Selector not visible: {test['expect']['selectorVisible']}"
                         raise Exception(test_result["error"])
diff --git a/core/llm_constants.py b/core/llm_constants.py
index 7479ec1..b710d46 100644
--- a/core/llm_constants.py
+++ b/core/llm_constants.py
@@ -70,21 +70,19 @@
 """
 
 system_prompt = f"""
-You are a UI QA engineer. Given HTML, you return JSON test cases for the page.
+You are a UI QA engineer. Given HTML, you return JSON test cases for the page. You might also receive feedback along with this original prompt with a testcase that needs to be fixed.
 
 Each test case includes:
 - name: a string describing the test
 - steps: a list of actions. Each action has:
-    - action: one of ["click", "type", "assertVisible", "assertURL", "navigate"]
+    - action: one of ["click", "type", "navigate"]
     - selector: a CSS selector
     - value: optional (only for "type" and "navigate" actions)
-- expect: the final expected outcome (either `url` or a `selectorVisible`)
+- expect: the final expected outcome (either `url` or a `selectorVisible`). The url can be a base path excluding parameters. 
 
 IMPORTANT: Use EXACTLY these action values:
 - "click" for clicking elements
 - "type" for typing into input fields
-- "assertVisible" for checking if elements are visible
-- "assertURL" for checking URL changes
 - "navigate" for direct navigation
 
 Here are two examples:
diff --git a/core/planner.py b/core/planner.py
index 001d14d..6a85cc5 100644
--- a/core/planner.py
+++ b/core/planner.py
@@ -1,10 +1,12 @@
 import json
 import os
 from urllib.parse import urljoin
+import playwright
 from pydantic import BaseModel, Field, model_validator
 from .llm_constants import system_prompt
 from langchain.chat_models import init_chat_model
 from langchain_core.prompts import ChatPromptTemplate
+from langchain.memory import ConversationBufferMemory
 from playwright.sync_api import BrowserContext
 from collections import OrderedDict
 from enum import Enum
@@ -19,21 +21,19 @@ class ActionType(str, Enum):
     """Valid Playwright actions"""
     CLICK = "click"
     TYPE = "type"
-    ASSERT_VISIBLE = "assertVisible"
-    ASSERT_URL = "assertURL"
     NAVIGATE = "navigate"
     # add new actions here, update properties below if needed...
 
     @property
     def requires_selector(self) -> bool:
         """Actions that need a CSS selector"""
-        return self in {self.CLICK, self.TYPE, self.ASSERT_VISIBLE}
+        return self in {self.CLICK, self.TYPE}
 
     @property
     def requires_value(self) -> bool:
         """Actions that need a value (text to type, URL to navigate to, etc.)"""
         return self in {self.TYPE, self.NAVIGATE}
-
+    
 
 class Expect(BaseModel):
     """Expected outcome of a test."""
@@ -46,7 +46,7 @@ class Expect(BaseModel):
 class PlaywrightCommand(BaseModel):
     """A single Playwright command to run in the test."""
     action: ActionType = Field(
-        ..., description="The Playwright action, e.g. 'click', 'fill', 'goto', 'assert', 'navigate'")
+        ..., description="The Playwright action, e.g. 'click', 'type', 'navigate'")
     selector: Optional[str] = Field(
         default=None, description="The CSS/XPath selector or test id to target (if applicable)")
     value: Optional[str] = Field(
@@ -123,7 +123,7 @@ def visit(node: PageNode):
         if page.name not in visited:
             visit(page)
 
-    logging.info(f"Sorted pages: {[p.name for p in sorted_pages]}")
+    logging.info(f"Page testing order: {[p.name for p in sorted_pages]}")
     return sorted_pages
 
 
@@ -234,6 +234,62 @@ def get_authenticated_browser_context(sync_context: BrowserContext, base_url: st
 
     return login_page
 
+def is_valid_typing_target(elem) -> bool:
+    tag = elem.evaluate("el => el.tagName.toLowerCase()")
+    return tag in ["input", "textarea"]
+
+
+def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_instance, original_prompt, input, page_context, max_retries=2) -> dict:
+    # Check each test and its steps, retrying generation if needed
+    for i, test in enumerate(tests):
+        curr_test = test
+        for retry in range(max_retries):
+            require_retry, errorList = False, []
+            for step in test.get('steps', []):
+                actionType = step['action']  # convert enum back to string
+                selector = step.get('selector', '')
+                if not selector:
+                    continue  # No selector to validate for this action
+                logging.info(f"Validating selector '{selector}' for action '{actionType}' on page '{page_name}'")
+                try:
+                    # if this times out, we assume the selector is not present
+                    elem_handle = auth_page.wait_for_selector(selector, timeout=2000)
+                    
+                    if actionType == ActionType.TYPE:
+                        # ensure the selector is an input or textarea for typing
+                        if not is_valid_typing_target(elem_handle):
+                            raise ValueError(f"Selector '{selector}' is not an input or textarea for typing.")
+                except Exception as e:
+                    require_retry = True
+                    errorList.append(f"Selector '{selector}' validation error: {str(e)}")
+                    test["disabled"] = True
+                    logging.warning(f"Selector '{selector}' validation error on page '{page_name}': {e}")
+                    break
+            if not require_retry:
+                test[i] = curr_test
+                test[i]["disabled"] = False
+                break
+            logging.info(f"Retrying test generation for page '{page_name}' (attempt {retry}/{max_retries})")
+            # Regenerate the test
+            feedback_prompt = original_prompt + ("FEEDBACK: {feedback}\n\nHere is the original generated test that needs to be fixed: {test}"
+                                "\n\nPlease fix the test to address the feedback and ensure all selectors are valid on the page.\n")
+            prompt = ChatPromptTemplate.from_messages(
+            [("system", feedback_prompt), ("human", "{input}")])
+
+            # this chat model instance is set up to return a PlaywrightTest structured output instead of UnitTests
+            few_shot_structured_llm = prompt | chat_model_instance
+            
+            response = few_shot_structured_llm.invoke(
+                {'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(test)})
+            
+            logging.info(f"Prompt for retry:\n{prompt.invoke({'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(test)})}")
+
+            result = response.model_dump(mode="python", exclude_none=True)
+            curr_test = result
+        if test.get("disabled", False):
+            logging.info(f"Unable to fix test for '{page_name}'")
+    return tests
+
 
 def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None):
     unit_tests = OrderedDict()
@@ -244,7 +300,9 @@ def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None)
 
     structured_llm = init_chat_model(
         "gemini-2.5-flash", model_provider="google_genai").with_structured_output(UnitTests)
-
+    retry_llm = init_chat_model(
+        "gemini-2.5-flash", model_provider="google_genai").with_structured_output(PlaywrightTest)
+    
     config = config or {}
     max_tests = config.get('max_tests', 2)
     pages_config = config.get('pages', {})
@@ -285,9 +343,6 @@ def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None)
 
         custom_prompt += f"\n\nPlease have a maximum of {max_tests} test cases"
 
-        config_prompt_text = "\n\n The page configuration is {config}"
-        custom_prompt += config_prompt_text
-
         prompt = ChatPromptTemplate.from_messages(
             [("system", custom_prompt), ("human", "{input}")])
         few_shot_structured_llm = prompt | structured_llm
@@ -297,28 +352,22 @@ def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None)
         html = auth_page.content()
 
         response = few_shot_structured_llm.invoke(
-            {'input': html, 'config': json.dumps(page_node.config), 'page_context': page_context})
+            {'input': html, 'page_context': page_context})
         result = response.model_dump(mode="python", exclude_none=True)
-
+        
+        # We check if the selector exists on the page - if not, we disable the test
+        validated_test = validate_test_and_retry(auth_page, page_node.name, result.get("tests", []), retry_llm, custom_prompt, html, page_context)
+        
         # Updated: convert "name" back to "description" for execute.py compatibility
         for test in result.get('tests', []):
             if 'name' in test:
                 test['description'] = test.pop('name')
-
+        logging.info(f"Generated test for page {page_node.name}: {json.dumps(result, indent=2)}")
         # Limit number of tests if configured
         if 'tests' in result and len(result['tests']) > max_tests:
             result['tests'] = result['tests'][:max_tests]
 
-        # We check if the selector exists on the page - if not, we disable the test
-        for test in result.get('tests', []):
-            for step in test.get('steps', []):
-                actionType = step['action']  # convert enum back to string
-                selector = step.get('selector', '')
-                try:
-                    auth_page.wait_for_selector(selector, timeout=2000)
-                except Exception as e:
-                    logging.warning(f"Selector '{selector}' not found on page '{page_node.name}': {e}")
-                    test['disabled'] = True  # mark test as disabled if selector not found
+
 
         unit_tests[page_node.name] = result
 
diff --git a/tests/test_config.yml b/tests/test_config.yml
index fe87eb1..fd857ee 100644
--- a/tests/test_config.yml
+++ b/tests/test_config.yml
@@ -1,4 +1,4 @@
-/page:
+/:
   Buttons: true
   Links: true
   Input_fields: true

From d5e75c86f7eee6396db43fd3fd111b639a3d5f25 Mon Sep 17 00:00:00 2001
From: Advay Balakrishnan <advaybalak@gmail.com>
Date: Fri, 19 Sep 2025 17:09:48 -0400
Subject: [PATCH 03/12] fix: test case for action type shouldn't include
 assertVisible

---
 tests/test_action_type.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test_action_type.py b/tests/test_action_type.py
index 4a4873d..7102193 100644
--- a/tests/test_action_type.py
+++ b/tests/test_action_type.py
@@ -7,7 +7,6 @@ def valid_commands():
     return [
         {"action": "click", "selector": "#button"},
         {"action": "type", "selector": "#input", "value": "hello"},
-        {"action": "assertVisible", "selector": ".msg"},
         {"action": "navigate", "value": "/page"},
     ]
 
@@ -18,7 +17,6 @@ def invalid_commands():
         {"action": "click"},                                  # missing selector
         {"action": "type", "selector": "#input"},             # missing value
         {"action": "navigate"},                               # missing value
-        {"action": "assertVisible"},                          # missing selector
     ]
 
 @pytest.fixture

From df6e22063e2ab6555e305405f8332949ef0400de Mon Sep 17 00:00:00 2001
From: Advay Balakrishnan <advaybalak@gmail.com>
Date: Fri, 19 Sep 2025 17:15:55 -0400
Subject: [PATCH 04/12] clean up models by moving to separate module

---
 core/models.py  | 86 ++++++++++++++++++++++++++++++++++++++++++++
 core/planner.py | 94 +++----------------------------------------------
 2 files changed, 90 insertions(+), 90 deletions(-)
 create mode 100644 core/models.py

diff --git a/core/models.py b/core/models.py
new file mode 100644
index 0000000..9e8db3b
--- /dev/null
+++ b/core/models.py
@@ -0,0 +1,86 @@
+from enum import Enum
+from typing import Optional, List
+from pydantic import BaseModel, Field, model_validator
+
+# for testing specific actions via Pydantic
+class ActionType(str, Enum):
+    """Valid Playwright actions"""
+    CLICK = "click"
+    TYPE = "type"
+    NAVIGATE = "navigate"
+    # add new actions here, update properties below if needed...
+
+    @property
+    def requires_selector(self) -> bool:
+        """Actions that need a CSS selector"""
+        return self in {self.CLICK, self.TYPE}
+
+    @property
+    def requires_value(self) -> bool:
+        """Actions that need a value (text to type, URL to navigate to, etc.)"""
+        return self in {self.TYPE, self.NAVIGATE}
+    
+
+class Expect(BaseModel):
+    """Expected outcome of a test."""
+    url: Optional[str] = Field(
+        default=None, description="Expected URL after the test steps")
+    selectorVisible: Optional[str] = Field(
+        default=None, description="CSS selector that should be visible after the test steps")
+
+
+class PlaywrightCommand(BaseModel):
+    """A single Playwright command to run in the test."""
+    action: ActionType = Field(
+        ..., description="The Playwright action, e.g. 'click', 'type', 'navigate'")
+    selector: Optional[str] = Field(
+        default=None, description="The CSS/XPath selector or test id to target (if applicable)")
+    value: Optional[str] = Field(
+        default=None, description="Value to input (if applicable), e.g. text to type into a field")
+
+    @model_validator(mode='after')  # need to use model_validator for pydantic v2
+    def validate_command_requirements(self):
+        """Validate command has required fields for its action type."""
+
+        # for debugging what the action type actually is
+        # print(f"self.action type: {type(self.action)}")
+        # print(f"self.action value: {self.action}")
+
+        if not isinstance(self.action, ActionType):
+            raise ValueError(f"Invalid action type: {type(self.action).__name__}")
+
+        # Check if action needs a selector but doesn't have one
+        if self.action.requires_selector and not self.selector:
+            raise ValueError(f"{self.action.value} needs a selector")
+
+        # Check if action needs a value but doesn't have one
+        if self.action.requires_value and not self.value:
+            raise ValueError(f"{self.action.value} needs a value")
+
+        return self
+
+
+class PlaywrightTest(BaseModel):
+    """A structured test plan for Playwright."""
+    description: str = Field(...,
+                             description="Brief description of what the test is verifying")
+    steps: List[PlaywrightCommand] = Field(
+        ..., description="Ordered list of Playwright commands to execute in sequence")
+    expect: Expect = Field(
+        ..., description="Expected outcome of the test, e.g. URL change, element visibility")
+
+
+class UnitTests(BaseModel):
+    """All generated tests for a page."""
+    description: str = Field(...,
+                             description="Brief description of what these tests are verifying")
+    tests: List[PlaywrightTest] = Field(..., description="List of tests to run")
+
+
+class PageNode():
+    """Represents a single page in the test plan."""
+
+    def __init__(self, name: str, config: dict):
+        self.name = name
+        self.config = config
+        self.tests = []
\ No newline at end of file
diff --git a/core/planner.py b/core/planner.py
index 6a85cc5..6a19140 100644
--- a/core/planner.py
+++ b/core/planner.py
@@ -1,104 +1,18 @@
 import json
 import os
+from typing import List
 from urllib.parse import urljoin
 import playwright
-from pydantic import BaseModel, Field, model_validator
 from .llm_constants import system_prompt
+from .models import (ActionType, PlaywrightTest,
+                     UnitTests, PageNode)
 from langchain.chat_models import init_chat_model
 from langchain_core.prompts import ChatPromptTemplate
 from langchain.memory import ConversationBufferMemory
 from playwright.sync_api import BrowserContext
 from collections import OrderedDict
-from enum import Enum
-from typing import Optional, List
-import logging
-
-
-
-
-# for testing specific actions via Pydantic
-class ActionType(str, Enum):
-    """Valid Playwright actions"""
-    CLICK = "click"
-    TYPE = "type"
-    NAVIGATE = "navigate"
-    # add new actions here, update properties below if needed...
-
-    @property
-    def requires_selector(self) -> bool:
-        """Actions that need a CSS selector"""
-        return self in {self.CLICK, self.TYPE}
-
-    @property
-    def requires_value(self) -> bool:
-        """Actions that need a value (text to type, URL to navigate to, etc.)"""
-        return self in {self.TYPE, self.NAVIGATE}
-    
-
-class Expect(BaseModel):
-    """Expected outcome of a test."""
-    url: Optional[str] = Field(
-        default=None, description="Expected URL after the test steps")
-    selectorVisible: Optional[str] = Field(
-        default=None, description="CSS selector that should be visible after the test steps")
-
-
-class PlaywrightCommand(BaseModel):
-    """A single Playwright command to run in the test."""
-    action: ActionType = Field(
-        ..., description="The Playwright action, e.g. 'click', 'type', 'navigate'")
-    selector: Optional[str] = Field(
-        default=None, description="The CSS/XPath selector or test id to target (if applicable)")
-    value: Optional[str] = Field(
-        default=None, description="Value to input (if applicable), e.g. text to type into a field")
-
-    @model_validator(mode='after')  # need to use model_validator for pydantic v2
-    def validate_command_requirements(self):
-        """Validate command has required fields for its action type."""
-
-        # for debugging what the action type actually is
-        # print(f"self.action type: {type(self.action)}")
-        # print(f"self.action value: {self.action}")
-
-        if not isinstance(self.action, ActionType):
-            raise ValueError(f"Invalid action type: {type(self.action).__name__}")
-
-        # Check if action needs a selector but doesn't have one
-        if self.action.requires_selector and not self.selector:
-            raise ValueError(f"{self.action.value} needs a selector")
-
-        # Check if action needs a value but doesn't have one
-        if self.action.requires_value and not self.value:
-            raise ValueError(f"{self.action.value} needs a value")
-
-        return self
-
-
-class PlaywrightTest(BaseModel):
-    """A structured test plan for Playwright."""
-    description: str = Field(...,
-                             description="Brief description of what the test is verifying")
-    steps: List[PlaywrightCommand] = Field(
-        ..., description="Ordered list of Playwright commands to execute in sequence")
-    expect: Expect = Field(
-        ..., description="Expected outcome of the test, e.g. URL change, element visibility")
-
-
-class UnitTests(BaseModel):
-    """All generated tests for a page."""
-    description: str = Field(...,
-                             description="Brief description of what these tests are verifying")
-    tests: List[PlaywrightTest] = Field(..., description="List of tests to run")
-
-
-class PageNode():
-    """Represents a single page in the test plan."""
-
-    def __init__(self, name: str, config: dict):
-        self.name = name
-        self.config = config
-        self.tests = []
 
+import logging
 
 def parse_config(config: dict) -> List[PageNode]:
     """Parse the configuration dictionary into a list of PageNode objects. We want to top sort the pages based on dependencies."""

From e28522659c9401473c5a863ade6a166665fca822 Mon Sep 17 00:00:00 2001
From: Advay Balakrishnan <advaybalak@gmail.com>
Date: Fri, 19 Sep 2025 17:17:23 -0400
Subject: [PATCH 05/12] fix imports in execute.py

---
 core/execute.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/core/execute.py b/core/execute.py
index 388af12..a795d68 100644
--- a/core/execute.py
+++ b/core/execute.py
@@ -5,8 +5,8 @@
 import os
 from urllib.parse import urljoin
 from pydantic import ValidationError
-from .planner import PlaywrightCommand, PlaywrightTest, ActionType, get_authenticated_browser_context
-
+from .planner import get_authenticated_browser_context
+from .models import PlaywrightTest, PlaywrightCommand, ActionType
 import json
 
 from .planner import get_authenticated_browser_context

From a5ad47113258630249c2c04acec373a5d45767af Mon Sep 17 00:00:00 2001
From: Advay Balakrishnan <advaybalak@gmail.com>
Date: Fri, 19 Sep 2025 17:42:07 -0400
Subject: [PATCH 06/12] fix circular reference error in json parsing

---
 core/planner.py           | 8 ++++----
 tests/test_action_type.py | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/core/planner.py b/core/planner.py
index 6a19140..5458117 100644
--- a/core/planner.py
+++ b/core/planner.py
@@ -180,8 +180,8 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i
                     logging.warning(f"Selector '{selector}' validation error on page '{page_name}': {e}")
                     break
             if not require_retry:
-                test[i] = curr_test
-                test[i]["disabled"] = False
+                tests[i] = curr_test
+                tests[i]["disabled"] = False
                 break
             logging.info(f"Retrying test generation for page '{page_name}' (attempt {retry}/{max_retries})")
             # Regenerate the test
@@ -270,13 +270,13 @@ def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None)
         result = response.model_dump(mode="python", exclude_none=True)
         
         # We check if the selector exists on the page - if not, we disable the test
-        validated_test = validate_test_and_retry(auth_page, page_node.name, result.get("tests", []), retry_llm, custom_prompt, html, page_context)
+        validated_tests = validate_test_and_retry(auth_page, page_node.name, result.get("tests", []), retry_llm, custom_prompt, html, page_context)
         
         # Updated: convert "name" back to "description" for execute.py compatibility
         for test in result.get('tests', []):
             if 'name' in test:
                 test['description'] = test.pop('name')
-        logging.info(f"Generated test for page {page_node.name}: {json.dumps(result, indent=2)}")
+        logging.info(f"Generated test for page {page_node.name}: {json.dumps(validated_tests, indent=2)}")
         # Limit number of tests if configured
         if 'tests' in result and len(result['tests']) > max_tests:
             result['tests'] = result['tests'][:max_tests]
diff --git a/tests/test_action_type.py b/tests/test_action_type.py
index 7102193..a5f29c3 100644
--- a/tests/test_action_type.py
+++ b/tests/test_action_type.py
@@ -1,6 +1,6 @@
 import pytest
 from pydantic import ValidationError
-from core.planner import PlaywrightCommand, PlaywrightTest
+from core.models import PlaywrightCommand, PlaywrightTest
 
 @pytest.fixture
 def valid_commands():

From 0ceba0112e751055b65a7c42954501455f6e2165 Mon Sep 17 00:00:00 2001
From: Advay Balakrishnan <advaybalak@gmail.com>
Date: Sat, 20 Sep 2025 16:58:34 -0400
Subject: [PATCH 07/12] create cleaner logs, fix logic with regenerating cases

---
 core/planner.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/core/planner.py b/core/planner.py
index 5458117..be9d858 100644
--- a/core/planner.py
+++ b/core/planner.py
@@ -157,7 +157,7 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i
     # Check each test and its steps, retrying generation if needed
     for i, test in enumerate(tests):
         curr_test = test
-        for retry in range(max_retries):
+        for retry in range(max_retries+1):
             require_retry, errorList = False, []
             for step in test.get('steps', []):
                 actionType = step['action']  # convert enum back to string
@@ -178,15 +178,17 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i
                     errorList.append(f"Selector '{selector}' validation error: {str(e)}")
                     test["disabled"] = True
                     logging.warning(f"Selector '{selector}' validation error on page '{page_name}': {e}")
-                    break
             if not require_retry:
                 tests[i] = curr_test
                 tests[i]["disabled"] = False
                 break
-            logging.info(f"Retrying test generation for page '{page_name}' (attempt {retry}/{max_retries})")
+            elif retry >= max_retries:
+                logging.error(f"Max retries reached for test on page '{page_name}'. Disabling test.")
+                break
+            logging.info(f"Retrying test generation for page '{page_name}' (attempt {retry+1}/{max_retries})")
             # Regenerate the test
-            feedback_prompt = original_prompt + ("FEEDBACK: {feedback}\n\nHere is the original generated test that needs to be fixed: {test}"
-                                "\n\nPlease fix the test to address the feedback and ensure all selectors are valid on the page.\n")
+            feedback_prompt = original_prompt + ("\nFEEDBACK: {feedback}\nHere is the original generated test that needs to be fixed: {test}"
+                                "\nPlease fix the test to address the feedback and ensure all selectors are valid on the page.\n")
             prompt = ChatPromptTemplate.from_messages(
             [("system", feedback_prompt), ("human", "{input}")])
 
@@ -195,8 +197,10 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i
             
             response = few_shot_structured_llm.invoke(
                 {'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(test)})
-            
-            logging.info(f"Prompt for retry:\n{prompt.invoke({'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(test)})}")
+            prompt_value = prompt.invoke({'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(test)})
+    
+            for m in prompt_value.messages:
+                logging.info("%s: %s", m.type, m.content)
 
             result = response.model_dump(mode="python", exclude_none=True)
             curr_test = result
@@ -269,7 +273,7 @@ def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None)
             {'input': html, 'page_context': page_context})
         result = response.model_dump(mode="python", exclude_none=True)
         
-        # We check if the selector exists on the page - if not, we disable the test
+        # We check if the selector exists on the page - if not, we retry test generation. If all fails, we disable the test.
         validated_tests = validate_test_and_retry(auth_page, page_node.name, result.get("tests", []), retry_llm, custom_prompt, html, page_context)
         
         # Updated: convert "name" back to "description" for execute.py compatibility

From 9dddc7c670670a12cac11135bff8dd8164fb3826 Mon Sep 17 00:00:00 2001
From: Advay Balakrishnan <advaybalak@gmail.com>
Date: Sat, 20 Sep 2025 17:03:51 -0400
Subject: [PATCH 08/12] change page to validate with attached state, not
 visible state

---
 core/planner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/planner.py b/core/planner.py
index be9d858..d88e7a1 100644
--- a/core/planner.py
+++ b/core/planner.py
@@ -167,7 +167,7 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i
                 logging.info(f"Validating selector '{selector}' for action '{actionType}' on page '{page_name}'")
                 try:
                     # if this times out, we assume the selector is not present
-                    elem_handle = auth_page.wait_for_selector(selector, timeout=2000)
+                    elem_handle = auth_page.wait_for_selector(selector, state='attached',timeout=2000)
                     
                     if actionType == ActionType.TYPE:
                         # ensure the selector is an input or textarea for typing

From ef72cac79e811fb3051b8c74fe89e132df1e63d1 Mon Sep 17 00:00:00 2001
From: Advay Balakrishnan <advaybalak@gmail.com>
Date: Thu, 25 Sep 2025 16:01:06 -0400
Subject: [PATCH 09/12] fix retry loop: validate regenerated test, return validated list

---
 core/execute.py       |  2 +-
 core/llm_constants.py |  4 ++--
 core/models.py        |  2 +-
 core/planner.py       | 37 ++++++++++++++++++-------------------
 4 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/core/execute.py b/core/execute.py
index a795d68..540b9e2 100644
--- a/core/execute.py
+++ b/core/execute.py
@@ -3,7 +3,7 @@
 import re
 from playwright.sync_api import sync_playwright, BrowserContext
 import os
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse
 from pydantic import ValidationError
 from .planner import get_authenticated_browser_context
 from .models import PlaywrightTest, PlaywrightCommand, ActionType
diff --git a/core/llm_constants.py b/core/llm_constants.py
index b710d46..be13b63 100644
--- a/core/llm_constants.py
+++ b/core/llm_constants.py
@@ -77,7 +77,7 @@
 - steps: a list of actions. Each action has:
     - action: one of ["click", "type", "navigate"]
     - selector: a CSS selector
-    - value: optional (only for "type" and "navigate" actions)
+    - value: optional (only for "type" and "navigate" actions). You MUST provide a value for "type" and "navigate" actions. 
 - expect: the final expected outcome (either `url` or a `selectorVisible`). The url can be a base path excluding parameters. 
 
 IMPORTANT: Use EXACTLY these action values:
@@ -87,4 +87,4 @@
 
 Here are two examples:
 {examples} \n
-"""
+"""
\ No newline at end of file
diff --git a/core/models.py b/core/models.py
index 9e8db3b..3c90479 100644
--- a/core/models.py
+++ b/core/models.py
@@ -54,7 +54,7 @@ def validate_command_requirements(self):
             raise ValueError(f"{self.action.value} needs a selector")
 
         # Check if action needs a value but doesn't have one
-        if self.action.requires_value and not self.value:
+        if self.action.requires_value and self.value is None:
             raise ValueError(f"{self.action.value} needs a value")
 
         return self
diff --git a/core/planner.py b/core/planner.py
index d88e7a1..835c1e4 100644
--- a/core/planner.py
+++ b/core/planner.py
@@ -155,11 +155,14 @@ def is_valid_typing_target(elem) -> bool:
 
 def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_instance, original_prompt, input, page_context, max_retries=2) -> dict:
     # Check each test and its steps, retrying generation if needed
+    validated_test = []
     for i, test in enumerate(tests):
         curr_test = test
         for retry in range(max_retries+1):
+            logging.info(f"Current test for page '{page_name}': {json.dumps(curr_test, indent=2)}")
+
             require_retry, errorList = False, []
-            for step in test.get('steps', []):
+            for step in curr_test.get('steps', []):
                 actionType = step['action']  # convert enum back to string
                 selector = step.get('selector', '')
                 if not selector:
@@ -176,13 +179,13 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i
                 except Exception as e:
                     require_retry = True
                     errorList.append(f"Selector '{selector}' validation error: {str(e)}")
-                    test["disabled"] = True
                     logging.warning(f"Selector '{selector}' validation error on page '{page_name}': {e}")
-            if not require_retry:
-                tests[i] = curr_test
-                tests[i]["disabled"] = False
-                break
-            elif retry >= max_retries:
+            if require_retry:
+                curr_test['disabled'] = True  # disable the test if we need to retry
+            else:
+                curr_test['disabled'] = False
+                break  # all selectors valid, no need to retry
+            if retry >= max_retries:
                 logging.error(f"Max retries reached for test on page '{page_name}'. Disabling test.")
                 break
             logging.info(f"Retrying test generation for page '{page_name}' (attempt {retry+1}/{max_retries})")
@@ -196,17 +199,16 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i
             few_shot_structured_llm = prompt | chat_model_instance
             
             response = few_shot_structured_llm.invoke(
-                {'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(test)})
-            prompt_value = prompt.invoke({'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(test)})
-    
-            for m in prompt_value.messages:
-                logging.info("%s: %s", m.type, m.content)
+                {'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(curr_test, indent=2)})
 
             result = response.model_dump(mode="python", exclude_none=True)
-            curr_test = result
-        if test.get("disabled", False):
+            # new test that we will retry in the next loop iteration
+            curr_test = result            
+        if curr_test.get("disabled", False):
             logging.info(f"Unable to fix test for '{page_name}'")
-    return tests
+        validated_test.append(curr_test)
+
+    return validated_test
 
 
 def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None):
@@ -276,10 +278,7 @@ def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None)
         # We check if the selector exists on the page - if not, we retry test generation. If all fails, we disable the test.
         validated_tests = validate_test_and_retry(auth_page, page_node.name, result.get("tests", []), retry_llm, custom_prompt, html, page_context)
         
-        # Updated: convert "name" back to "description" for execute.py compatibility
-        for test in result.get('tests', []):
-            if 'name' in test:
-                test['description'] = test.pop('name')
+        result['tests'] = validated_tests
         logging.info(f"Generated test for page {page_node.name}: {json.dumps(validated_tests, indent=2)}")
         # Limit number of tests if configured
         if 'tests' in result and len(result['tests']) > max_tests:

From d98a4758df8f6b3ef2384fb44935cfab365f3d5d Mon Sep 17 00:00:00 2001
From: Advay Balakrishnan <advaybalak@gmail.com>
Date: Thu, 25 Sep 2025 16:06:21 -0400
Subject: [PATCH 10/12] fix planner test: patch validate_test_and_retry

---
 tests/test_planner.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/test_planner.py b/tests/test_planner.py
index a54e0a2..4ec625d 100644
--- a/tests/test_planner.py
+++ b/tests/test_planner.py
@@ -33,10 +33,7 @@ def test_generate_test_plan_structure(mock_browser_context, sample_config):
     page_node_mock = MagicMock()
     page_node_mock.name = "login.html"
     page_node_mock.config = sample_config['pages']['login.html']
-
-    mock_llm_response.model_dump = MagicMock(return_value={
-        "description": "Test page analysis",
-        "tests": [
+    tests = [
             {
                 "description": "Test login form",
                 "steps": [
@@ -47,6 +44,9 @@ def test_generate_test_plan_structure(mock_browser_context, sample_config):
                 "expect": {"selectorVisible": "#content"}
             }
         ]
+    mock_llm_response.model_dump = MagicMock(return_value={
+        "description": "Test page analysis",
+        "tests": tests.copy()
     })
 
     chat_mock = MagicMock(name="chat")
@@ -57,7 +57,8 @@ def test_generate_test_plan_structure(mock_browser_context, sample_config):
 
     with patch("testagent.core.planner.get_authenticated_browser_context", return_value="login.html"), \
          patch("testagent.core.planner.parse_config", return_value=[page_node_mock]), \
-         patch("testagent.core.planner.ChatPromptTemplate", chat_mock):        
+         patch("testagent.core.planner.ChatPromptTemplate", chat_mock), \
+         patch("testagent.core.planner.validate_test_and_retry", return_value=tests.copy()):        
         
         result = generate_test_plan(mock_browser_context, "http://example.com", config=sample_config)
 

From 2748293430bb21d9192ef975b2319ff9c4a245d7 Mon Sep 17 00:00:00 2001
From: Advay Balakrishnan <advaybalak@gmail.com>
Date: Thu, 25 Sep 2025 16:10:11 -0400
Subject: [PATCH 11/12] remove unnecessary logs

---
 core/planner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/core/planner.py b/core/planner.py
index 835c1e4..b3a62f8 100644
--- a/core/planner.py
+++ b/core/planner.py
@@ -158,9 +158,9 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i
     validated_test = []
     for i, test in enumerate(tests):
         curr_test = test
-        for retry in range(max_retries+1):
-            logging.info(f"Current test for page '{page_name}': {json.dumps(curr_test, indent=2)}")
+        logging.info(f"validate_test_and_retry(): Initial test for page '{page_name}': {json.dumps(curr_test, indent=2)}")
 
+        for retry in range(max_retries+1):
             require_retry, errorList = False, []
             for step in curr_test.get('steps', []):
                 actionType = step['action']  # convert enum back to string

From 150d0f8e636bcd3fe2f536cac55a629aab64895a Mon Sep 17 00:00:00 2001
From: Advay Balakrishnan <advaybalak@gmail.com>
Date: Thu, 25 Sep 2025 16:14:59 -0400
Subject: [PATCH 12/12] execute: raise on URL mismatch

---
 core/execute.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/execute.py b/core/execute.py
index 540b9e2..03a135b 100644
--- a/core/execute.py
+++ b/core/execute.py
@@ -104,7 +104,7 @@ def run_ui_tests(sync_context: BrowserContext, base_url: str, unit_tests: Ordere
                     if test["expect"]["url"] not in page.url:
                         test_result["status"] = "failed"
                         test_result["error"] = f"URL mismatch: expected {test['expect']['url']}, got {page.url}"
-                
+                        raise Exception(test_result["error"])
                 elif "selectorVisible" in test["expect"]:
                     if not page.locator(test["expect"]["selectorVisible"]).is_visible(timeout=timeout):
                         test_result["status"] = "failed"