From dc91d81a690dffed34efe8e241f0bf759f761485 Mon Sep 17 00:00:00 2001 From: Advay Balakrishnan Date: Sat, 13 Sep 2025 21:04:53 -0400 Subject: [PATCH 01/12] check selector if valid, skip test if necessary --- core/execute.py | 27 ++++++++++++++------------- core/planner.py | 21 ++++++++++++--------- main.py | 15 ++++++++++++++- 3 files changed, 40 insertions(+), 23 deletions(-) diff --git a/core/execute.py b/core/execute.py index 0bcf6d4..3f7f4c5 100644 --- a/core/execute.py +++ b/core/execute.py @@ -1,6 +1,5 @@ from collections import OrderedDict import logging -import shutil from playwright.sync_api import sync_playwright, BrowserContext import os from urllib.parse import urljoin @@ -18,10 +17,6 @@ def run_ui_tests(sync_context: BrowserContext, base_url: str, unit_tests: Ordere github_mode = os.environ.get('GITHUB_ACTION_MODE') == 'true' - if os.path.exists("results"): - shutil.rmtree("results") # remove old directory and contents - os.makedirs("results", exist_ok=True) - test_results = { "tests": [] } @@ -34,6 +29,19 @@ def run_ui_tests(sync_context: BrowserContext, base_url: str, unit_tests: Ordere page_url = urljoin(base_url, page_name) for test in test_suite["tests"]: + test_result = { + "description": test['description'], + "status": "passed", + "error": None, + "screenshot": None + } + + if test.get('disabled', False): + print(f"⚠️ Skipping disabled test: {test['description']}") + test_result["status"] = "skipped" + test_results["tests"].append(test_result) + continue + print(f"\n▶ Running: {test['description']}") page = sync_context.new_page() response = page.goto(page_url, wait_until="load") @@ -62,14 +70,7 @@ def run_ui_tests(sync_context: BrowserContext, base_url: str, unit_tests: Ordere }) continue - test_result = { - "description": test['description'], - "status": "passed", - "error": None, - "screenshot": None - } - - # attemptp to execute the test steps + # attempt to execute the test steps try: validated_test = PlaywrightTest(**test) print(f"✓ Test validation passed: {len(validated_test.steps)} steps") diff --git a/core/planner.py b/core/planner.py index d7aada4..001d14d 100644 --- a/core/planner.py +++ b/core/planner.py @@ -12,17 +12,9 @@ import logging -# Configure logging -logging.basicConfig( - filename="app.log", # Log file name - level=logging.INFO, # Minimum log level - format="%(asctime)s - %(levelname)s - %(message)s", # Log format - filemode='w' -) - -# for testing specific actions via Pydantic +# for testing specific actions via Pydantic class ActionType(str, Enum): """Valid Playwright actions""" CLICK = "click" @@ -317,6 +309,17 @@ def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None) if 'tests' in result and len(result['tests']) > max_tests: result['tests'] = result['tests'][:max_tests] + # We check if the selector exists on the page - if not, we disable the test + for test in result.get('tests', []): + for step in test.get('steps', []): + actionType = step['action'] # convert enum back to string + selector = step.get('selector', '') + try: + auth_page.wait_for_selector(selector, timeout=2000) + except Exception as e: + logging.warning(f"Selector '{selector}' not found on page '{page_node.name}': {e}") + test['disabled'] = True # mark test as disabled if selector not found + unit_tests[page_node.name] = result diff --git a/main.py b/main.py index 8de5352..32fc0ce 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,8 @@ import argparse import json +import logging import os +import shutil from dotenv import load_dotenv import yaml from pathlib import Path @@ -22,7 +24,6 @@ def create_github_summary(results): "total_tests": len(results.get("tests", [])), "tests_passed": sum(1 for test in results.get("tests", []) if test.get("status") == "passed"), "tests_failed": sum(1 for test in results.get("tests", []) if test.get("status") == "failed"), - "ui_coverage": 85, # Placeholder - could be calculated based on elements tested "failed_tests": [ { "name": test.get("description", "Unknown test"), @@ -74,6 +75,18 @@ def main(): 'headless': args.headless.lower() == 'true' } + if os.path.exists("results"): + shutil.rmtree("results") # remove old directory and contents + os.makedirs("results", exist_ok=True) + + # Configure logging + logging.basicConfig( + filename="./results/app.log", # Log file name + level=logging.INFO, # Minimum log level + format="%(asctime)s - %(levelname)s - %(message)s", # Log format + filemode='w' + ) + with sync_playwright() as p: headless = test_config['headless'] if test_config['browser'].lower() == 'firefox': From 883de8f74560b61ee225a04beca89da0784e43df Mon Sep 17 00:00:00 2001 From: Advay Balakrishnan Date: Fri, 19 Sep 2025 17:07:14 -0400 Subject: [PATCH 02/12] implemenent feedback mechanism for bad tests --- core/execute.py | 9 ++-- core/llm_constants.py | 8 ++-- core/planner.py | 95 ++++++++++++++++++++++++++++++++----------- tests/test_config.yml | 2 +- 4 files changed, 79 insertions(+), 35 deletions(-) diff --git a/core/execute.py b/core/execute.py index 3f7f4c5..388af12 100644 --- a/core/execute.py +++ b/core/execute.py @@ -1,5 +1,6 @@ from collections import OrderedDict import logging +import re from playwright.sync_api import sync_playwright, BrowserContext import os from urllib.parse import urljoin @@ -37,7 +38,7 @@ def run_ui_tests(sync_context: BrowserContext, base_url: str, unit_tests: Ordere } if test.get('disabled', False): - print(f"⚠️ Skipping disabled test: {test['description']}") + print(f"Skipping test, disabled due to bad selectors: {test['description']}") test_result["status"] = "skipped" test_results["tests"].append(test_result) continue @@ -87,10 +88,6 @@ def run_ui_tests(sync_context: BrowserContext, base_url: str, unit_tests: Ordere page.click(selector, timeout=timeout) elif action == ActionType.TYPE: page.fill(selector, value, timeout=timeout) - elif action == ActionType.ASSERT_VISIBLE: - page.wait_for_selector(selector, state="visible", timeout=timeout) - elif action == ActionType.ASSERT_URL: - page.wait_for_url(value, timeout=timeout) elif action == ActionType.NAVIGATE: if value.startswith(('http://', 'https://')): # Absolute URL @@ -109,7 +106,7 @@ def run_ui_tests(sync_context: BrowserContext, base_url: str, unit_tests: Ordere test_result["error"] = f"URL mismatch: expected {test['expect']['url']}, got {page.url}" elif "selectorVisible" in test["expect"]: - if not page.locator(test["expect"]["selectorVisible"]).is_visible(): + if not page.locator(test["expect"]["selectorVisible"]).is_visible(timeout=timeout): test_result["status"] = "failed" test_result["error"] = f"Selector not visible: {test['expect']['selectorVisible']}" raise Exception(test_result["error"]) diff --git a/core/llm_constants.py b/core/llm_constants.py index 7479ec1..b710d46 100644 --- a/core/llm_constants.py +++ b/core/llm_constants.py @@ -70,21 +70,19 @@ """ system_prompt = f""" -You are a UI QA engineer. Given HTML, you return JSON test cases for the page. +You are a UI QA engineer. Given HTML, you return JSON test cases for the page. You might also receive feedback along with this original prompt with a testcase that needs to be fixed. Each test case includes: - name: a string describing the test - steps: a list of actions. Each action has: - - action: one of ["click", "type", "assertVisible", "assertURL", "navigate"] + - action: one of ["click", "type", "navigate"] - selector: a CSS selector - value: optional (only for "type" and "navigate" actions) -- expect: the final expected outcome (either `url` or a `selectorVisible`) +- expect: the final expected outcome (either `url` or a `selectorVisible`). The url can be a base path excluding parameters. IMPORTANT: Use EXACTLY these action values: - "click" for clicking elements - "type" for typing into input fields -- "assertVisible" for checking if elements are visible -- "assertURL" for checking URL changes - "navigate" for direct navigation Here are two examples: diff --git a/core/planner.py b/core/planner.py index 001d14d..6a85cc5 100644 --- a/core/planner.py +++ b/core/planner.py @@ -1,10 +1,12 @@ import json import os from urllib.parse import urljoin +import playwright from pydantic import BaseModel, Field, model_validator from .llm_constants import system_prompt from langchain.chat_models import init_chat_model from langchain_core.prompts import ChatPromptTemplate +from langchain.memory import ConversationBufferMemory from playwright.sync_api import BrowserContext from collections import OrderedDict from enum import Enum @@ -19,21 +21,19 @@ class ActionType(str, Enum): """Valid Playwright actions""" CLICK = "click" TYPE = "type" - ASSERT_VISIBLE = "assertVisible" - ASSERT_URL = "assertURL" NAVIGATE = "navigate" # add new actions here, update properties below if needed... @property def requires_selector(self) -> bool: """Actions that need a CSS selector""" - return self in {self.CLICK, self.TYPE, self.ASSERT_VISIBLE} + return self in {self.CLICK, self.TYPE} @property def requires_value(self) -> bool: """Actions that need a value (text to type, URL to navigate to, etc.)""" return self in {self.TYPE, self.NAVIGATE} - + class Expect(BaseModel): """Expected outcome of a test.""" @@ -46,7 +46,7 @@ class Expect(BaseModel): class PlaywrightCommand(BaseModel): """A single Playwright command to run in the test.""" action: ActionType = Field( - ..., description="The Playwright action, e.g. 'click', 'fill', 'goto', 'assert', 'navigate'") + ..., description="The Playwright action, e.g. 'click', 'type', 'navigate'") selector: Optional[str] = Field( default=None, description="The CSS/XPath selector or test id to target (if applicable)") value: Optional[str] = Field( @@ -123,7 +123,7 @@ def visit(node: PageNode): if page.name not in visited: visit(page) - logging.info(f"Sorted pages: {[p.name for p in sorted_pages]}") + logging.info(f"Page testing order: {[p.name for p in sorted_pages]}") return sorted_pages @@ -234,6 +234,62 @@ def get_authenticated_browser_context(sync_context: BrowserContext, base_url: st return login_page +def is_valid_typing_target(elem) -> bool: + tag = elem.evaluate("el => el.tagName.toLowerCase()") + return tag in ["input", "textarea"] + + +def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_instance, original_prompt, input, page_context, max_retries=2) -> dict: + # Check each test and its steps, retrying generation if needed + for i, test in enumerate(tests): + curr_test = test + for retry in range(max_retries): + require_retry, errorList = False, [] + for step in test.get('steps', []): + actionType = step['action'] # convert enum back to string + selector = step.get('selector', '') + if not selector: + continue # No selector to validate for this action + logging.info(f"Validating selector '{selector}' for action '{actionType}' on page '{page_name}'") + try: + # if this times out, we assume the selector is not present + elem_handle = auth_page.wait_for_selector(selector, timeout=2000) + + if actionType == ActionType.TYPE: + # ensure the selector is an input or textarea for typing + if not is_valid_typing_target(elem_handle): + raise ValueError(f"Selector '{selector}' is not an input or textarea for typing.") + except Exception as e: + require_retry = True + errorList.append(f"Selector '{selector}' validation error: {str(e)}") + test["disabled"] = True + logging.warning(f"Selector '{selector}' validation error on page '{page_name}': {e}") + break + if not require_retry: + test[i] = curr_test + test[i]["disabled"] = False + break + logging.info(f"Retrying test generation for page '{page_name}' (attempt {retry}/{max_retries})") + # Regenerate the test + feedback_prompt = original_prompt + ("FEEDBACK: {feedback}\n\nHere is the original generated test that needs to be fixed: {test}" + "\n\nPlease fix the test to address the feedback and ensure all selectors are valid on the page.\n") + prompt = ChatPromptTemplate.from_messages( + [("system", feedback_prompt), ("human", "{input}")]) + + # this chat model instance is set up to return a PlaywrightTest structured output instead of UnitTests + few_shot_structured_llm = prompt | chat_model_instance + + response = few_shot_structured_llm.invoke( + {'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(test)}) + + logging.info(f"Prompt for retry:\n{prompt.invoke({'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(test)})}") + + result = response.model_dump(mode="python", exclude_none=True) + curr_test = result + if test.get("disabled", False): + logging.info(f"Unable to fix test for '{page_name}'") + return tests + def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None): unit_tests = OrderedDict() @@ -244,7 +300,9 @@ def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None) structured_llm = init_chat_model( "gemini-2.5-flash", model_provider="google_genai").with_structured_output(UnitTests) - + retry_llm = init_chat_model( + "gemini-2.5-flash", model_provider="google_genai").with_structured_output(PlaywrightTest) + config = config or {} max_tests = config.get('max_tests', 2) pages_config = config.get('pages', {}) @@ -285,9 +343,6 @@ def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None) custom_prompt += f"\n\nPlease have a maximum of {max_tests} test cases" - config_prompt_text = "\n\n The page configuration is {config}" - custom_prompt += config_prompt_text - prompt = ChatPromptTemplate.from_messages( [("system", custom_prompt), ("human", "{input}")]) few_shot_structured_llm = prompt | structured_llm @@ -297,28 +352,22 @@ def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None) html = auth_page.content() response = few_shot_structured_llm.invoke( - {'input': html, 'config': json.dumps(page_node.config), 'page_context': page_context}) + {'input': html, 'page_context': page_context}) result = response.model_dump(mode="python", exclude_none=True) - + + # We check if the selector exists on the page - if not, we disable the test + validated_test = validate_test_and_retry(auth_page, page_node.name, result.get("tests", []), retry_llm, custom_prompt, html, page_context) + # Updated: convert "name" back to "description" for execute.py compatibility for test in result.get('tests', []): if 'name' in test: test['description'] = test.pop('name') - + logging.info(f"Generated test for page {page_node.name}: {json.dumps(result, indent=2)}") # Limit number of tests if configured if 'tests' in result and len(result['tests']) > max_tests: result['tests'] = result['tests'][:max_tests] - # We check if the selector exists on the page - if not, we disable the test - for test in result.get('tests', []): - for step in test.get('steps', []): - actionType = step['action'] # convert enum back to string - selector = step.get('selector', '') - try: - auth_page.wait_for_selector(selector, timeout=2000) - except Exception as e: - logging.warning(f"Selector '{selector}' not found on page '{page_node.name}': {e}") - test['disabled'] = True # mark test as disabled if selector not found + unit_tests[page_node.name] = result diff --git a/tests/test_config.yml b/tests/test_config.yml index fe87eb1..fd857ee 100644 --- a/tests/test_config.yml +++ b/tests/test_config.yml @@ -1,4 +1,4 @@ -/page: +/: Buttons: true Links: true Input_fields: true From d5e75c86f7eee6396db43fd3fd111b639a3d5f25 Mon Sep 17 00:00:00 2001 From: Advay Balakrishnan Date: Fri, 19 Sep 2025 17:09:48 -0400 Subject: [PATCH 03/12] fix: test case for action type shouldn't include assertVisible --- tests/test_action_type.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_action_type.py b/tests/test_action_type.py index 4a4873d..7102193 100644 --- a/tests/test_action_type.py +++ b/tests/test_action_type.py @@ -7,7 +7,6 @@ def valid_commands(): return [ {"action": "click", "selector": "#button"}, {"action": "type", "selector": "#input", "value": "hello"}, - {"action": "assertVisible", "selector": ".msg"}, {"action": "navigate", "value": "/page"}, ] @@ -18,7 +17,6 @@ def invalid_commands(): {"action": "click"}, # missing selector {"action": "type", "selector": "#input"}, # missing value {"action": "navigate"}, # missing value - {"action": "assertVisible"}, # missing selector ] @pytest.fixture From df6e22063e2ab6555e305405f8332949ef0400de Mon Sep 17 00:00:00 2001 From: Advay Balakrishnan Date: Fri, 19 Sep 2025 17:15:55 -0400 Subject: [PATCH 04/12] clean up models by moving to separate module --- core/models.py | 86 ++++++++++++++++++++++++++++++++++++++++++++ core/planner.py | 94 +++---------------------------------------------- 2 files changed, 90 insertions(+), 90 deletions(-) create mode 100644 core/models.py diff --git a/core/models.py b/core/models.py new file mode 100644 index 0000000..9e8db3b --- /dev/null +++ b/core/models.py @@ -0,0 +1,86 @@ +from enum import Enum +from typing import Optional, List +from pydantic import BaseModel, Field, model_validator + +# for testing specific actions via Pydantic +class ActionType(str, Enum): + """Valid Playwright actions""" + CLICK = "click" + TYPE = "type" + NAVIGATE = "navigate" + # add new actions here, update properties below if needed... + + @property + def requires_selector(self) -> bool: + """Actions that need a CSS selector""" + return self in {self.CLICK, self.TYPE} + + @property + def requires_value(self) -> bool: + """Actions that need a value (text to type, URL to navigate to, etc.)""" + return self in {self.TYPE, self.NAVIGATE} + + +class Expect(BaseModel): + """Expected outcome of a test.""" + url: Optional[str] = Field( + default=None, description="Expected URL after the test steps") + selectorVisible: Optional[str] = Field( + default=None, description="CSS selector that should be visible after the test steps") + + +class PlaywrightCommand(BaseModel): + """A single Playwright command to run in the test.""" + action: ActionType = Field( + ..., description="The Playwright action, e.g. 'click', 'type', 'navigate'") + selector: Optional[str] = Field( + default=None, description="The CSS/XPath selector or test id to target (if applicable)") + value: Optional[str] = Field( + default=None, description="Value to input (if applicable), e.g. text to type into a field") + + @model_validator(mode='after') # need to use model_validator for pydantic v2 + def validate_command_requirements(self): + """Validate command has required fields for its action type.""" + + # for debugging what the action type actually is + # print(f"self.action type: {type(self.action)}") + # print(f"self.action value: {self.action}") + + if not isinstance(self.action, ActionType): + raise ValueError(f"Invalid action type: {type(self.action).__name__}") + + # Check if action needs a selector but doesn't have one + if self.action.requires_selector and not self.selector: + raise ValueError(f"{self.action.value} needs a selector") + + # Check if action needs a value but doesn't have one + if self.action.requires_value and not self.value: + raise ValueError(f"{self.action.value} needs a value") + + return self + + +class PlaywrightTest(BaseModel): + """A structured test plan for Playwright.""" + description: str = Field(..., + description="Brief description of what the test is verifying") + steps: List[PlaywrightCommand] = Field( + ..., description="Ordered list of Playwright commands to execute in sequence") + expect: Expect = Field( + ..., description="Expected outcome of the test, e.g. URL change, element visibility") + + +class UnitTests(BaseModel): + """All generated tests for a page.""" + description: str = Field(..., + description="Brief description of what these tests are verifying") + tests: List[PlaywrightTest] = Field(..., description="List of tests to run") + + +class PageNode(): + """Represents a single page in the test plan.""" + + def __init__(self, name: str, config: dict): + self.name = name + self.config = config + self.tests = [] \ No newline at end of file diff --git a/core/planner.py b/core/planner.py index 6a85cc5..6a19140 100644 --- a/core/planner.py +++ b/core/planner.py @@ -1,104 +1,18 @@ import json import os +from typing import List from urllib.parse import urljoin import playwright -from pydantic import BaseModel, Field, model_validator from .llm_constants import system_prompt +from .models import (ActionType, PlaywrightTest, + UnitTests, PageNode) from langchain.chat_models import init_chat_model from langchain_core.prompts import ChatPromptTemplate from langchain.memory import ConversationBufferMemory from playwright.sync_api import BrowserContext from collections import OrderedDict -from enum import Enum -from typing import Optional, List -import logging - - - - -# for testing specific actions via Pydantic -class ActionType(str, Enum): - """Valid Playwright actions""" - CLICK = "click" - TYPE = "type" - NAVIGATE = "navigate" - # add new actions here, update properties below if needed... - - @property - def requires_selector(self) -> bool: - """Actions that need a CSS selector""" - return self in {self.CLICK, self.TYPE} - - @property - def requires_value(self) -> bool: - """Actions that need a value (text to type, URL to navigate to, etc.)""" - return self in {self.TYPE, self.NAVIGATE} - - -class Expect(BaseModel): - """Expected outcome of a test.""" - url: Optional[str] = Field( - default=None, description="Expected URL after the test steps") - selectorVisible: Optional[str] = Field( - default=None, description="CSS selector that should be visible after the test steps") - - -class PlaywrightCommand(BaseModel): - """A single Playwright command to run in the test.""" - action: ActionType = Field( - ..., description="The Playwright action, e.g. 'click', 'type', 'navigate'") - selector: Optional[str] = Field( - default=None, description="The CSS/XPath selector or test id to target (if applicable)") - value: Optional[str] = Field( - default=None, description="Value to input (if applicable), e.g. text to type into a field") - - @model_validator(mode='after') # need to use model_validator for pydantic v2 - def validate_command_requirements(self): - """Validate command has required fields for its action type.""" - - # for debugging what the action type actually is - # print(f"self.action type: {type(self.action)}") - # print(f"self.action value: {self.action}") - - if not isinstance(self.action, ActionType): - raise ValueError(f"Invalid action type: {type(self.action).__name__}") - - # Check if action needs a selector but doesn't have one - if self.action.requires_selector and not self.selector: - raise ValueError(f"{self.action.value} needs a selector") - - # Check if action needs a value but doesn't have one - if self.action.requires_value and not self.value: - raise ValueError(f"{self.action.value} needs a value") - - return self - - -class PlaywrightTest(BaseModel): - """A structured test plan for Playwright.""" - description: str = Field(..., - description="Brief description of what the test is verifying") - steps: List[PlaywrightCommand] = Field( - ..., description="Ordered list of Playwright commands to execute in sequence") - expect: Expect = Field( - ..., description="Expected outcome of the test, e.g. URL change, element visibility") - - -class UnitTests(BaseModel): - """All generated tests for a page.""" - description: str = Field(..., - description="Brief description of what these tests are verifying") - tests: List[PlaywrightTest] = Field(..., description="List of tests to run") - - -class PageNode(): - """Represents a single page in the test plan.""" - - def __init__(self, name: str, config: dict): - self.name = name - self.config = config - self.tests = [] +import logging def parse_config(config: dict) -> List[PageNode]: """Parse the configuration dictionary into a list of PageNode objects. We want to top sort the pages based on dependencies.""" From e28522659c9401473c5a863ade6a166665fca822 Mon Sep 17 00:00:00 2001 From: Advay Balakrishnan Date: Fri, 19 Sep 2025 17:17:23 -0400 Subject: [PATCH 05/12] fix imports in execute.py --- core/execute.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/execute.py b/core/execute.py index 388af12..a795d68 100644 --- a/core/execute.py +++ b/core/execute.py @@ -5,8 +5,8 @@ import os from urllib.parse import urljoin from pydantic import ValidationError -from .planner import PlaywrightCommand, PlaywrightTest, ActionType, get_authenticated_browser_context - +from .planner import get_authenticated_browser_context +from .models import PlaywrightTest, PlaywrightCommand, ActionType import json from .planner import get_authenticated_browser_context From a5ad47113258630249c2c04acec373a5d45767af Mon Sep 17 00:00:00 2001 From: Advay Balakrishnan Date: Fri, 19 Sep 2025 17:42:07 -0400 Subject: [PATCH 06/12] fix circular reference error in json parsing --- core/planner.py | 8 ++++---- tests/test_action_type.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/core/planner.py b/core/planner.py index 6a19140..5458117 100644 --- a/core/planner.py +++ b/core/planner.py @@ -180,8 +180,8 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i logging.warning(f"Selector '{selector}' validation error on page '{page_name}': {e}") break if not require_retry: - test[i] = curr_test - test[i]["disabled"] = False + tests[i] = curr_test + tests[i]["disabled"] = False break logging.info(f"Retrying test generation for page '{page_name}' (attempt {retry}/{max_retries})") # Regenerate the test @@ -270,13 +270,13 @@ def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None) result = response.model_dump(mode="python", exclude_none=True) # We check if the selector exists on the page - if not, we disable the test - validated_test = validate_test_and_retry(auth_page, page_node.name, result.get("tests", []), retry_llm, custom_prompt, html, page_context) + validated_tests = validate_test_and_retry(auth_page, page_node.name, result.get("tests", []), retry_llm, custom_prompt, html, page_context) # Updated: convert "name" back to "description" for execute.py compatibility for test in result.get('tests', []): if 'name' in test: test['description'] = test.pop('name') - logging.info(f"Generated test for page {page_node.name}: {json.dumps(result, indent=2)}") + logging.info(f"Generated test for page {page_node.name}: {json.dumps(validated_tests, indent=2)}") # Limit number of tests if configured if 'tests' in result and len(result['tests']) > max_tests: result['tests'] = result['tests'][:max_tests] diff --git a/tests/test_action_type.py b/tests/test_action_type.py index 7102193..a5f29c3 100644 --- a/tests/test_action_type.py +++ b/tests/test_action_type.py @@ -1,6 +1,6 @@ import pytest from pydantic import ValidationError -from core.planner import PlaywrightCommand, PlaywrightTest +from core.models import PlaywrightCommand, PlaywrightTest @pytest.fixture def valid_commands(): From 0ceba0112e751055b65a7c42954501455f6e2165 Mon Sep 17 00:00:00 2001 From: Advay Balakrishnan Date: Sat, 20 Sep 2025 16:58:34 -0400 Subject: [PATCH 07/12] create cleaner logs, fix logic with regenerating cases --- core/planner.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/core/planner.py b/core/planner.py index 5458117..be9d858 100644 --- a/core/planner.py +++ b/core/planner.py @@ -157,7 +157,7 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i # Check each test and its steps, retrying generation if needed for i, test in enumerate(tests): curr_test = test - for retry in range(max_retries): + for retry in range(max_retries+1): require_retry, errorList = False, [] for step in test.get('steps', []): actionType = step['action'] # convert enum back to string @@ -178,15 +178,17 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i errorList.append(f"Selector '{selector}' validation error: {str(e)}") test["disabled"] = True logging.warning(f"Selector '{selector}' validation error on page '{page_name}': {e}") - break if not require_retry: tests[i] = curr_test tests[i]["disabled"] = False break - logging.info(f"Retrying test generation for page '{page_name}' (attempt {retry}/{max_retries})") + elif retry >= max_retries: + logging.error(f"Max retries reached for test on page '{page_name}'. Disabling test.") + break + logging.info(f"Retrying test generation for page '{page_name}' (attempt {retry+1}/{max_retries})") # Regenerate the test - feedback_prompt = original_prompt + ("FEEDBACK: {feedback}\n\nHere is the original generated test that needs to be fixed: {test}" - "\n\nPlease fix the test to address the feedback and ensure all selectors are valid on the page.\n") + feedback_prompt = original_prompt + ("\nFEEDBACK: {feedback}\nHere is the original generated test that needs to be fixed: {test}" + "\nPlease fix the test to address the feedback and ensure all selectors are valid on the page.\n") prompt = ChatPromptTemplate.from_messages( [("system", feedback_prompt), ("human", "{input}")]) @@ -195,8 +197,10 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i response = few_shot_structured_llm.invoke( {'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(test)}) - - logging.info(f"Prompt for retry:\n{prompt.invoke({'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(test)})}") + prompt_value = prompt.invoke({'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(test)}) + + for m in prompt_value.messages: + logging.info("%s: %s", m.type, m.content) result = response.model_dump(mode="python", exclude_none=True) curr_test = result @@ -269,7 +273,7 @@ def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None) {'input': html, 'page_context': page_context}) result = response.model_dump(mode="python", exclude_none=True) - # We check if the selector exists on the page - if not, we disable the test + # We check if the selector exists on the page - if not, we retry test generation. If all fails, we disable the test. validated_tests = validate_test_and_retry(auth_page, page_node.name, result.get("tests", []), retry_llm, custom_prompt, html, page_context) # Updated: convert "name" back to "description" for execute.py compatibility From 9dddc7c670670a12cac11135bff8dd8164fb3826 Mon Sep 17 00:00:00 2001 From: Advay Balakrishnan Date: Sat, 20 Sep 2025 17:03:51 -0400 Subject: [PATCH 08/12] change page to validate with attached state, not visible state --- core/planner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/planner.py b/core/planner.py index be9d858..d88e7a1 100644 --- a/core/planner.py +++ b/core/planner.py @@ -167,7 +167,7 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i logging.info(f"Validating selector '{selector}' for action '{actionType}' on page '{page_name}'") try: # if this times out, we assume the selector is not present - elem_handle = auth_page.wait_for_selector(selector, timeout=2000) + elem_handle = auth_page.wait_for_selector(selector, state='attached',timeout=2000) if actionType == ActionType.TYPE: # ensure the selector is an input or textarea for typing From ef72cac79e811fb3051b8c74fe89e132df1e63d1 Mon Sep 17 00:00:00 2001 From: Advay Balakrishnan Date: Thu, 25 Sep 2025 16:01:06 -0400 Subject: [PATCH 09/12] fix final bugs of loop --- core/execute.py | 2 +- core/llm_constants.py | 4 ++-- core/models.py | 2 +- core/planner.py | 37 ++++++++++++++++++------------------- 4 files changed, 22 insertions(+), 23 deletions(-) diff --git a/core/execute.py b/core/execute.py index a795d68..540b9e2 100644 --- a/core/execute.py +++ b/core/execute.py @@ -3,7 +3,7 @@ import re from playwright.sync_api import sync_playwright, BrowserContext import os -from urllib.parse import urljoin +from urllib.parse import urljoin, urlparse from pydantic import ValidationError from .planner import get_authenticated_browser_context from .models import PlaywrightTest, PlaywrightCommand, ActionType diff --git a/core/llm_constants.py b/core/llm_constants.py index b710d46..be13b63 100644 --- a/core/llm_constants.py +++ b/core/llm_constants.py @@ -77,7 +77,7 @@ - steps: a list of actions. Each action has: - action: one of ["click", "type", "navigate"] - selector: a CSS selector - - value: optional (only for "type" and "navigate" actions) + - value: optional (only for "type" and "navigate" actions). You MUST provide a value for "type" and "navigate" actions. - expect: the final expected outcome (either `url` or a `selectorVisible`). The url can be a base path excluding parameters. IMPORTANT: Use EXACTLY these action values: @@ -87,4 +87,4 @@ Here are two examples: {examples} \n -""" +""" \ No newline at end of file diff --git a/core/models.py b/core/models.py index 9e8db3b..3c90479 100644 --- a/core/models.py +++ b/core/models.py @@ -54,7 +54,7 @@ def validate_command_requirements(self): raise ValueError(f"{self.action.value} needs a selector") # Check if action needs a value but doesn't have one - if self.action.requires_value and not self.value: + if self.action.requires_value and self.value is None: raise ValueError(f"{self.action.value} needs a value") return self diff --git a/core/planner.py b/core/planner.py index d88e7a1..835c1e4 100644 --- a/core/planner.py +++ b/core/planner.py @@ -155,11 +155,14 @@ def is_valid_typing_target(elem) -> bool: def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_instance, original_prompt, input, page_context, max_retries=2) -> dict: # Check each test and its steps, retrying generation if needed + validated_test = [] for i, test in enumerate(tests): curr_test = test for retry in range(max_retries+1): + logging.info(f"Current test for page '{page_name}': {json.dumps(curr_test, indent=2)}") + require_retry, errorList = False, [] - for step in test.get('steps', []): + for step in curr_test.get('steps', []): actionType = step['action'] # convert enum back to string selector = step.get('selector', '') if not selector: @@ -176,13 +179,13 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i except Exception as e: require_retry = True errorList.append(f"Selector '{selector}' validation error: {str(e)}") - test["disabled"] = True logging.warning(f"Selector '{selector}' validation error on page '{page_name}': {e}") - if not require_retry: - tests[i] = curr_test - tests[i]["disabled"] = False - break - elif retry >= max_retries: + if require_retry: + curr_test['disabled'] = True # disable the test if we need to retry + else: + curr_test['disabled'] = False + break # all selectors valid, no need to retry + if retry >= max_retries: logging.error(f"Max retries reached for test on page '{page_name}'. Disabling test.") break logging.info(f"Retrying test generation for page '{page_name}' (attempt {retry+1}/{max_retries})") @@ -196,17 +199,16 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i few_shot_structured_llm = prompt | chat_model_instance response = few_shot_structured_llm.invoke( - {'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(test)}) - prompt_value = prompt.invoke({'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(test)}) - - for m in prompt_value.messages: - logging.info("%s: %s", m.type, m.content) + {'input': input, 'page_context': page_context, 'feedback': errorList, 'test': json.dumps(curr_test, indent=2)}) result = response.model_dump(mode="python", exclude_none=True) - curr_test = result - if test.get("disabled", False): + # new test that we will retry in the next loop iteration + curr_test = result + if curr_test.get("disabled", False): logging.info(f"Unable to fix test for '{page_name}'") - return tests + validated_test.append(curr_test) + + return validated_test def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None): @@ -276,10 +278,7 @@ def generate_test_plan(sync_context: BrowserContext, base_url: str, config=None) # We check if the selector exists on the page - if not, we retry test generation. If all fails, we disable the test. validated_tests = validate_test_and_retry(auth_page, page_node.name, result.get("tests", []), retry_llm, custom_prompt, html, page_context) - # Updated: convert "name" back to "description" for execute.py compatibility - for test in result.get('tests', []): - if 'name' in test: - test['description'] = test.pop('name') + result['tests'] = validated_tests logging.info(f"Generated test for page {page_node.name}: {json.dumps(validated_tests, indent=2)}") # Limit number of tests if configured if 'tests' in result and len(result['tests']) > max_tests: From d98a4758df8f6b3ef2384fb44935cfab365f3d5d Mon Sep 17 00:00:00 2001 From: Advay Balakrishnan Date: Thu, 25 Sep 2025 16:06:21 -0400 Subject: [PATCH 10/12] fix test --- tests/test_planner.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/test_planner.py b/tests/test_planner.py index a54e0a2..4ec625d 100644 --- a/tests/test_planner.py +++ b/tests/test_planner.py @@ -33,10 +33,7 @@ def test_generate_test_plan_structure(mock_browser_context, sample_config): page_node_mock = MagicMock() page_node_mock.name = "login.html" page_node_mock.config = sample_config['pages']['login.html'] - - mock_llm_response.model_dump = MagicMock(return_value={ - "description": "Test page analysis", - "tests": [ + tests = [ { "description": "Test login form", "steps": [ @@ -47,6 +44,9 @@ def test_generate_test_plan_structure(mock_browser_context, sample_config): "expect": {"selectorVisible": "#content"} } ] + mock_llm_response.model_dump = MagicMock(return_value={ + "description": "Test page analysis", + "tests": tests.copy() }) chat_mock = MagicMock(name="chat") @@ -57,7 +57,8 @@ def test_generate_test_plan_structure(mock_browser_context, sample_config): with patch("testagent.core.planner.get_authenticated_browser_context", return_value="login.html"), \ patch("testagent.core.planner.parse_config", return_value=[page_node_mock]), \ - patch("testagent.core.planner.ChatPromptTemplate", chat_mock): + patch("testagent.core.planner.ChatPromptTemplate", chat_mock), \ + patch("testagent.core.planner.validate_test_and_retry", return_value=tests.copy()): result = generate_test_plan(mock_browser_context, "http://example.com", config=sample_config) From 2748293430bb21d9192ef975b2319ff9c4a245d7 Mon Sep 17 00:00:00 2001 From: Advay Balakrishnan Date: Thu, 25 Sep 2025 16:10:11 -0400 Subject: [PATCH 11/12] remove unnecessary logs --- core/planner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/planner.py b/core/planner.py index 835c1e4..b3a62f8 100644 --- a/core/planner.py +++ b/core/planner.py @@ -158,9 +158,9 @@ def validate_test_and_retry(auth_page, page_name: str, tests: dict, chat_model_i validated_test = [] for i, test in enumerate(tests): curr_test = test - for retry in range(max_retries+1): - logging.info(f"Current test for page '{page_name}': {json.dumps(curr_test, indent=2)}") + logging.info(f"validate_test_and_retry(): Initial test for page '{page_name}': {json.dumps(curr_test, indent=2)}") + for retry in range(max_retries+1): require_retry, errorList = False, [] for step in curr_test.get('steps', []): actionType = step['action'] # convert enum back to string From 150d0f8e636bcd3fe2f536cac55a629aab64895a Mon Sep 17 00:00:00 2001 From: Advay Balakrishnan Date: Thu, 25 Sep 2025 16:14:59 -0400 Subject: [PATCH 12/12] bug in execute --- core/execute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/execute.py b/core/execute.py index 540b9e2..03a135b 100644 --- a/core/execute.py +++ b/core/execute.py @@ -104,7 +104,7 @@ def run_ui_tests(sync_context: BrowserContext, base_url: str, unit_tests: Ordere if test["expect"]["url"] not in page.url: test_result["status"] = "failed" test_result["error"] = f"URL mismatch: expected {test['expect']['url']}, got {page.url}" - + raise Exception(test_result["error"]) elif "selectorVisible" in test["expect"]: if not page.locator(test["expect"]["selectorVisible"]).is_visible(timeout=timeout): test_result["status"] = "failed"