aturret · aturret · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026
diff --git a/.idea/FastFetchBot.iml b/.idea/FastFetchBot.iml
diff --git a/...a/runConfigurations/fullstack_polling.xml → ...nConfigurations/fullstack_polling_api.xml b/...a/runConfigurations/fullstack_polling.xml → ...nConfigurations/fullstack_polling_api.xml
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -2,7 +2,7 @@
 
 ## Project Overview
 
-FastFetchBot is a social media content fetching service built as a **UV workspace monorepo** with three microservices: a FastAPI server (API), a Telegram Bot client, and a Celery worker for file operations. It scrapes and archives content from various social media platforms including Twitter, Weibo, Xiaohongshu, Reddit, Bluesky, Instagram, Zhihu, Douban, YouTube, and Bilibili.
+FastFetchBot is a social media content fetching service built as a **UV workspace monorepo** with four microservices: a FastAPI server (API), a Telegram Bot client, a Celery worker for file operations, and an ARQ-based async worker for off-path scraping. It scrapes and archives content from various social media platforms including Twitter, Weibo, Xiaohongshu, Reddit, Bluesky, Instagram, Zhihu, Douban, YouTube, and Bilibili.
 
 ## Architecture
 
@@ -23,11 +23,13 @@ FastFetchBot/
 │           │   ├── twitter/  bluesky/  weibo/  xiaohongshu/  reddit/
 │           │   ├── instagram/  zhihu/  douban/  threads/  wechat/
 │           │   └── general/            # Firecrawl + Zyte generic scraping
+│           ├── file_export/  # Async Celery task wrappers (PDF, video, audio transcription)
 │           └── telegraph/    # Telegraph content publishing
-├── packages/file-export/     # fastfetchbot-file-export: video download, PDF export, transcription
+├── packages/file-export/     # fastfetchbot-file-export: synchronous Celery worker jobs (yt-dlp, WeasyPrint, OpenAI)
 ├── apps/api/                 # FastAPI server: enriched service, routing, storage
 ├── apps/telegram-bot/        # Telegram Bot: webhook/polling, message handling
-├── apps/worker/              # Celery worker: async file operations (video, PDF, audio)
+├── apps/worker/              # Celery worker: sync file operations (video, PDF, audio)
+├── apps/async-worker/        # ARQ async worker: off-path scraping + enrichment
 ├── pyproject.toml            # Root workspace configuration
 └── uv.lock                   # Lockfile for the entire workspace
 ```
@@ -37,6 +39,7 @@ FastFetchBot/
 | **API Server** (`apps/api/src/`) | `fastfetchbot-api` | 10450 | `gunicorn -k uvicorn.workers.UvicornWorker src.main:app --preload` |
 | **Telegram Bot** (`apps/telegram-bot/core/`) | `fastfetchbot-telegram-bot` | 10451 | `python -m core.main` |
 | **Worker** (`apps/worker/worker_core/`) | `fastfetchbot-worker` | — | `celery -A worker_core.main:app worker --loglevel=info --concurrency=2` |
+| **Async Worker** (`apps/async-worker/async_worker/`) | `fastfetchbot-async-worker` | — | `arq async_worker.main.WorkerSettings` |
 | **Shared Library** (`packages/shared/fastfetchbot_shared/`) | `fastfetchbot-shared` | — | — |
 | **File Export Library** (`packages/file-export/fastfetchbot_file_export/`) | `fastfetchbot-file-export` | — | — |
 
@@ -74,6 +77,7 @@ The Telegram Bot communicates with the API server over HTTP (`API_SERVER_URL`).
   - **`templates/`** — 13 Jinja2 templates for platform-specific output formatting (bundled via `__file__`-relative paths)
   - **Platform modules**: `twitter/`, `bluesky/`, `weibo/`, `xiaohongshu/`, `reddit/`, `instagram/`, `zhihu/`, `douban/`, `threads/`, `wechat/`, `general/` (Firecrawl + Zyte)
 - **`services/telegraph/`** — Telegraph content publishing (creates telegra.ph pages from scraped content)
+- **`services/file_export/`** — Async Celery task wrappers for PDF export, video download, and audio transcription. These accept `celery_app` and `timeout` as constructor parameters (dependency injection) so any app can use them with its own Celery client
 
 The shared scrapers library can be used standalone without the API server:
 ```python
@@ -179,6 +183,8 @@ GitHub Actions (`.github/workflows/ci.yml`) builds and pushes all three images o
 7. Add any new pip dependencies to `packages/shared/pyproject.toml` under `[project.optional-dependencies] scrapers`
 
 ### Key Conventions
+- **`packages/shared/` (`fastfetchbot-shared`)** is for shared async logic — scrapers, templates, Telegraph, and async Celery task wrappers (file_export). Most code here is async and reusable across apps
+- **`packages/file-export/` (`fastfetchbot-file-export`)** is exclusively for synchronous Celery worker jobs — the heavy blocking operations that run inside the Celery worker process (yt-dlp video download, WeasyPrint PDF generation, OpenAI audio transcription). Apps never import this package directly; they use the async wrappers in `fastfetchbot_shared.services.file_export`, which submit tasks to the Celery worker
 - **Scrapers, templates, and Telegraph live in `packages/shared/`** — they are framework-agnostic and reusable
 - Scraper config (platform credentials, Firecrawl/Zyte settings) lives in `fastfetchbot_shared.services.scrapers.config`, **not** in `apps/api/src/config.py`
 - API-only config (BASE_URL, MongoDB, Celery, AWS, Inoreader) stays in `apps/api/src/config.py`

diff --git a/apps/api/src/services/file_export/audio_transcribe/__init__.py b/apps/api/src/services/file_export/audio_transcribe/__init__.py
@@ -1,29 +1,16 @@
-import asyncio
+"""API-layer audio transcription — wraps the shared AudioTranscribe with API config."""
 
-from src.config import DOWNLOAD_VIDEO_TIMEOUT
+from fastfetchbot_shared.services.file_export.audio_transcribe import AudioTranscribe as BaseAudioTranscribe
 from src.services.celery_client import celery_app
-from fastfetchbot_shared.utils.logger import logger
+from src.config import DOWNLOAD_VIDEO_TIMEOUT
 
 
-class AudioTranscribe:
-    def __init__(self, audio_file: str):
-        self.audio_file = audio_file
+class AudioTranscribe(BaseAudioTranscribe):
+    """API AudioTranscribe that injects the API's Celery app and timeout."""
 
-    async def transcribe(self):
-        return await self._get_audio_text(self.audio_file)
-
-    @staticmethod
-    async def _get_audio_text(audio_file: str):
-        logger.info(f"submitting transcribe task: {audio_file}")
-        result = celery_app.send_task("file_export.transcribe", kwargs={
-            "audio_file": audio_file,
-        })
-        try:
-            response = await asyncio.to_thread(result.get, timeout=int(DOWNLOAD_VIDEO_TIMEOUT))
-            return response["transcript"]
-        except Exception:
-            logger.exception(
-                f"file_export.transcribe task failed: audio_file={audio_file}, "
-                f"timeout={DOWNLOAD_VIDEO_TIMEOUT}"
-            )
-            raise
+    def __init__(self, audio_file: str):
+        super().__init__(
+            audio_file=audio_file,
+            celery_app=celery_app,
+            timeout=DOWNLOAD_VIDEO_TIMEOUT,
+        )
diff --git a/apps/api/src/services/file_export/document_export/pdf_export.py b/apps/api/src/services/file_export/document_export/pdf_export.py
@@ -1,10 +1,10 @@
-import asyncio
-import uuid
+"""API-layer PDF export — extends the shared PdfExport with S3 upload support."""
+
 from pathlib import Path
 
 import aiofiles.os
-from bs4 import BeautifulSoup
 
+from fastfetchbot_shared.services.file_export.pdf_export import PdfExport as BasePdfExport, wrap_html_string
 from src.config import DOWNLOAD_VIDEO_TIMEOUT, AWS_STORAGE_ON
 from src.services.celery_client import celery_app
 from src.services.amazon.s3 import upload as upload_to_s3
@@ -19,48 +19,23 @@ async def upload_file_to_s3(output_filename):
     )
 
 
-class PdfExport:
+class PdfExport(BasePdfExport):
+    """API PDF export that adds optional S3 upload after Celery PDF generation."""
+
-    def __init__(self, title: str, html_string: str = None):
+    def __init__(self, title: str, html_string: str | None = None):
-        self.title = title
-        self.html_string = html_string
+        super().__init__(
+            title=title,
+            html_string=html_string,
+            celery_app=celery_app,
+            timeout=DOWNLOAD_VIDEO_TIMEOUT,
+        )
 
     async def export(self) -> str:
-        html_string = self.wrap_html_string(self.html_string)
-        output_filename = f"{self.title}-{uuid.uuid4()}.pdf"
-
-        logger.info(f"submitting pdf export task: {output_filename}")
-        result = celery_app.send_task("file_export.pdf_export", kwargs={
-            "html_string": html_string,
-            "output_filename": output_filename,
-        })
-        try:
-            response = await asyncio.to_thread(result.get, timeout=int(DOWNLOAD_VIDEO_TIMEOUT))
-            output_filename = response["output_filename"]
-        except Exception:
-            logger.exception(
-                f"file_export.pdf_export task failed: output_filename={output_filename}, "
-                f"timeout={DOWNLOAD_VIDEO_TIMEOUT}"
-            )
-            raise
-        logger.info(f"pdf export success: {output_filename}")
+        output_filename = await super().export()
 
         if AWS_STORAGE_ON:
             local_filename = output_filename
             output_filename = await upload_file_to_s3(Path(output_filename))
             await aiofiles.os.remove(local_filename)
-        return output_filename
 
-    @staticmethod
-    def wrap_html_string(html_string: str) -> str:
-        soup = BeautifulSoup(
-            '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
-            '<meta charset="UTF-8"></head><body></body></html>',
-            "html.parser",
-        )
-        soup.body.append(BeautifulSoup(html_string, "html.parser"))
-        for tag in soup.find_all(True):
-            if "style" in tag.attrs:
-                del tag["style"]
-        for style_tag in soup.find_all("style"):
-            style_tag.decompose()
-        return soup.prettify()
+        return output_filename