-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtools.py
More file actions
46 lines (38 loc) · 1.62 KB
/
tools.py
File metadata and controls
46 lines (38 loc) · 1.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import asyncio
from playwright_search_scrape import sync_search_and_scrape # ✅ no circular import
# Lower-case substrings used to filter out boilerplate lines (ads, cookie
# banners, signup prompts) from scraped page text; matched case-insensitively
# against each extracted line in scrape_with_playwright().
blocked_keywords = [
    "advertisement", "subscribe", "terms and conditions", "cookie consent",
    "newsletter", "sign up", "buy now", "promo", "deal", "login"
]
def execute_action(action: str) -> str:
    """Dispatch a ReAct-style action string to the matching tool.

    Supported forms:
      - ``Search[<query>]`` -> sync_search_and_scrape(query)
      - ``Scrape[<url>]``   -> scrape_url(url)

    Returns:
        The tool's output string, or an error marker string for
        unsupported or malformed actions.
    """
    # Require the closing "]" explicitly: the previous unconditional
    # [:-1] slice silently truncated the last character of the payload
    # when the bracket was missing (e.g. "Scrape[http://x").
    if action.startswith("Scrape[") and action.endswith("]"):
        url = action[len("Scrape["):-1].strip()
        return scrape_url(url)
    if action.startswith("Search[") and action.endswith("]"):
        query = action[len("Search["):-1].strip()
        return sync_search_and_scrape(query)
    return "[Unsupported action: Only Search[...] and Scrape[...] are supported]"
async def scrape_with_playwright(url: str) -> str:
    """Fetch *url* with headless Chromium and return filtered paragraph text.

    Extracts the first 10 ``<p>`` elements, drops every line containing a
    blocked keyword (ads, cookie banners, signup prompts), and returns the
    remaining text joined by newlines.

    Returns:
        The filtered text, or a ``"[Scrape Failed: ...]"`` marker string on
        any failure (missing dependency, navigation timeout, network error)
        instead of raising — deliberate best-effort behavior.
    """
    try:
        # Local imports so the module still loads when the optional
        # scraping dependencies are not installed.
        from bs4 import BeautifulSoup
        from playwright.async_api import async_playwright

        async with async_playwright() as pw:
            browser = await pw.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.goto(url, timeout=15000)  # 15s navigation budget
                content = await page.content()
            finally:
                # Close even when goto()/content() raises, so a failed
                # request never leaks a headless Chromium process.
                await browser.close()

        soup = BeautifulSoup(content, "html.parser")
        # Cap at 10 paragraphs to keep the tool output compact.
        text = "\n".join(para.get_text() for para in soup.find_all("p")[:10])
        filtered = [
            line for line in text.splitlines()
            if not any(bad_word in line.lower() for bad_word in blocked_keywords)
        ]
        return "\n".join(filtered).strip()
    except Exception as e:
        return f"[Scrape Failed: {e}]"
def scrape_url(url: str) -> str:
    """Synchronous entry point: scrape *url* via the async Playwright helper.

    Note: the is_allowed check is intentionally skipped here to avoid a
    circular import.
    """
    coroutine = scrape_with_playwright(url)
    return asyncio.run(coroutine)