diff --git a/apps/api/src/auth.py b/apps/api/src/auth.py
index c9600d8..30dac7b 100644
--- a/apps/api/src/auth.py
+++ b/apps/api/src/auth.py
@@ -3,9 +3,9 @@
 from fastapi import HTTPException, Security, status
 from fastapi.security.api_key import APIKeyQuery
 
-from src.config import API_KEY_NAME, API_KEY
+from src.config import settings
 
-api_key_query = APIKeyQuery(name=API_KEY_NAME, auto_error=False)
+api_key_query = APIKeyQuery(name=settings.API_KEY_NAME, auto_error=False)
 
 
 def verify_key(input_key: str, true_key: str):
@@ -16,4 +16,4 @@ def verify_key(input_key: str, true_key: str):
 
 
 def verify_api_key(api_key_query: str = Security(api_key_query)):
-    verify_key(api_key_query, API_KEY)
+    verify_key(api_key_query, settings.API_KEY)
diff --git a/apps/api/src/config.py b/apps/api/src/config.py
index ea04347..1ce4adc 100644
--- a/apps/api/src/config.py
+++ b/apps/api/src/config.py
@@ -2,65 +2,82 @@
 import tempfile
 import gettext
 import secrets
+from typing import Optional
 
-from fastfetchbot_shared.utils.parse import get_env_bool
-
-env = os.environ
-current_directory = os.path.dirname(os.path.abspath(__file__))
-conf_dir = os.path.join(current_directory, "..", "conf")
-
-# FastAPI environment variables
-BASE_URL = env.get("BASE_URL", "localhost")
-API_KEY_NAME = env.get("API_KEY_NAME", "pwd")
-API_KEY = env.get("API_KEY", secrets.token_urlsafe(32))
-
-# Filesystem environment variables
-TEMP_DIR = env.get("TEMP_DIR", tempfile.gettempdir())
-WORK_DIR = env.get("WORK_DIR", os.getcwd())
-DOWNLOAD_DIR = env.get("DOWNLOAD_DIR", os.path.join(WORK_DIR, "download"))
-DEBUG_MODE = get_env_bool(env, "DEBUG_MODE", False)
-
-# Logging environment variables
-LOG_FILE_PATH = env.get("LOG_FILE_PATH", TEMP_DIR)
-LOG_LEVEL = env.get("LOG_LEVEL", "DEBUG")
-
-# MongoDB environment variables
-DATABASE_ON = get_env_bool(env, "DATABASE_ON", False)
-MONGODB_PORT = int(env.get("MONGODB_PORT", 27017)) or 27017
-MONGODB_HOST = env.get("MONGODB_HOST", "localhost")
-MONGODB_URL = env.get("MONGODB_URL", f"mongodb://{MONGODB_HOST}:{MONGODB_PORT}")
-
-# File exporter toggle (used by telegram bot to show/hide buttons)
-FILE_EXPORTER_ON = get_env_bool(env, "FILE_EXPORTER_ON", True)
-DOWNLOAD_VIDEO_TIMEOUT = env.get("DOWNLOAD_VIDEO_TIMEOUT", 600)
-
-# Celery configuration
-CELERY_BROKER_URL = env.get("CELERY_BROKER_URL", "redis://localhost:6379/0")
-CELERY_RESULT_BACKEND = env.get("CELERY_RESULT_BACKEND", "redis://localhost:6379/1")
-
-# AWS storage
-AWS_STORAGE_ON = get_env_bool(env, "AWS_STORAGE_ON", False)
-AWS_ACCESS_KEY_ID = env.get("AWS_ACCESS_KEY_ID", None)
-AWS_SECRET_ACCESS_KEY = env.get("AWS_SECRET_ACCESS_KEY", None)
-AWS_S3_BUCKET_NAME = env.get("AWS_S3_BUCKET_NAME", "")
-AWS_REGION_NAME = env.get("AWS_REGION_NAME", "")
-AWS_DOMAIN_HOST = env.get("AWS_DOMAIN_HOST", None)
-if not (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_S3_BUCKET_NAME):
-    AWS_STORAGE_ON = False
-
-# Inoreader
-INOREADER_APP_ID = env.get("INOREADER_APP_ID", None)
-INOREADER_APP_KEY = env.get("INOREADER_APP_KEY", None)
-INOREADER_EMAIL = env.get("INOREADER_EMAIL", None)
-INOREADER_PASSWORD = env.get("INOREADER_PASSWORD", None)
-
-# Locale directories environment variables
+from pydantic import Field, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class ApiSettings(BaseSettings):
+    model_config = SettingsConfigDict(extra="ignore")
+
+    # FastAPI
+    BASE_URL: str = "localhost"
+    API_KEY_NAME: str = "pwd"
+    API_KEY: str = Field(default_factory=lambda: secrets.token_urlsafe(32))
+
+    # Filesystem
+    TEMP_DIR: str = tempfile.gettempdir()
+    WORK_DIR: str = os.getcwd()
+    DOWNLOAD_DIR: str = ""
+    DEBUG_MODE: bool = False
+
+    # Logging
+    LOG_FILE_PATH: str = ""
+    LOG_LEVEL: str = "DEBUG"
+
+    # MongoDB
+    DATABASE_ON: bool = False
+    MONGODB_PORT: int = 27017
+    MONGODB_HOST: str = "localhost"
+    MONGODB_URL: str = ""
+
+    # File exporter
+    FILE_EXPORTER_ON: bool = True
+    DOWNLOAD_VIDEO_TIMEOUT: int = 600
+
+    # Celery
+    CELERY_BROKER_URL: str = "redis://localhost:6379/0"
+    CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"
+
+    # AWS storage
+    AWS_STORAGE_ON: bool = False
+    AWS_ACCESS_KEY_ID: Optional[str] = None
+    AWS_SECRET_ACCESS_KEY: Optional[str] = None
+    AWS_S3_BUCKET_NAME: str = ""
+    AWS_REGION_NAME: str = ""
+    AWS_DOMAIN_HOST: Optional[str] = None
+
+    # Inoreader
+    INOREADER_APP_ID: Optional[str] = None
+    INOREADER_APP_KEY: Optional[str] = None
+    INOREADER_EMAIL: Optional[str] = None
+    INOREADER_PASSWORD: Optional[str] = None
+
+    # Utils
+    HTTP_REQUEST_TIMEOUT: int = 30
+
+    # Telegram Bot callback URL
+    TELEGRAM_BOT_CALLBACK_URL: str = "http://telegram-bot:10451"
+
+    @model_validator(mode="after")
+    def _resolve_derived(self) -> "ApiSettings":
+        if not self.DOWNLOAD_DIR:
+            self.DOWNLOAD_DIR = os.path.join(self.WORK_DIR, "download")
+        if not self.LOG_FILE_PATH:
+            self.LOG_FILE_PATH = self.TEMP_DIR
+        if not self.MONGODB_URL:
+            self.MONGODB_URL = f"mongodb://{self.MONGODB_HOST}:{self.MONGODB_PORT}"
+        if not (self.AWS_ACCESS_KEY_ID and self.AWS_SECRET_ACCESS_KEY and self.AWS_S3_BUCKET_NAME):
+            self.AWS_STORAGE_ON = False
+        return self
+
+
+settings = ApiSettings()
+
+# --- Non-settings module-level objects ---
+
+# Locale / i18n
 localedir = os.path.join(os.path.dirname(__file__), "locale")
 translation = gettext.translation("messages", localedir=localedir, fallback=True)
 _ = translation.gettext
-
-# Utils environment variables
-HTTP_REQUEST_TIMEOUT = env.get("HTTP_REQUEST_TIMEOUT", 30)
-
-# Telegram Bot callback URL (for inter-service communication)
-TELEGRAM_BOT_CALLBACK_URL = env.get("TELEGRAM_BOT_CALLBACK_URL", "http://telegram-bot:10451")
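Note on the config migration above: the pydantic-settings class keeps the old environment-variable contract. Field names match the variable names the `env.get(...)` calls used, and the `model_validator(mode="after")` hook reproduces the values that were previously derived inline (download dir, log path, Mongo URL, and the AWS credential check). A minimal sanity sketch of the intended behavior; the override values are illustrative, not from the diff:

```python
import os

# Env must be set before the module is imported, since `settings` is built at import time.
os.environ["MONGODB_HOST"] = "db.internal"  # illustrative value
os.environ["DEBUG_MODE"] = "true"           # pydantic coerces "true"/"1"/"yes" to bool

from src.config import ApiSettings

s = ApiSettings()
assert s.DEBUG_MODE is True
# MONGODB_URL stayed at its "" sentinel, so the validator derives it:
assert s.MONGODB_URL == "mongodb://db.internal:27017"
# No AWS credentials are set, so the validator keeps the storage flag off:
assert s.AWS_STORAGE_ON is False
```

A side benefit of the typed fields: `DOWNLOAD_VIDEO_TIMEOUT` and `HTTP_REQUEST_TIMEOUT` were strings whenever their env vars were set (`env.get` returns `str`); they now coerce to `int` before any downstream use.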
diff --git a/apps/api/src/routers/inoreader.py b/apps/api/src/routers/inoreader.py
index 521adaa..0779e6b 100644
--- a/apps/api/src/routers/inoreader.py
+++ b/apps/api/src/routers/inoreader.py
@@ -1,7 +1,7 @@
 from fastapi import APIRouter
 from fastapi.requests import Request
 
-from src.config import INOREADER_APP_ID, INOREADER_APP_KEY
+from src.config import settings
 from src.services.inoreader import Inoreader
 from src.services.inoreader.process import (
     get_inoreader_item_async,
@@ -21,7 +21,7 @@ async def get_inoreader_webhook_data(data: dict):
 
 @router.post("/triggerAsync", dependencies=[Security(verify_api_key)])
 async def inoreader_trigger_webhook(request: Request):
-    if not INOREADER_APP_ID or not INOREADER_APP_KEY:
+    if not settings.INOREADER_APP_ID or not settings.INOREADER_APP_KEY:
         return "inoreader app id or key not set"
     params = request.query_params
     await get_inoreader_item_async(trigger=True, params=params)
diff --git a/apps/api/src/routers/scraper.py b/apps/api/src/routers/scraper.py
index b02be9c..9504db8 100644
--- a/apps/api/src/routers/scraper.py
+++ b/apps/api/src/routers/scraper.py
@@ -3,7 +3,7 @@
 from fastapi import APIRouter
 from fastapi.requests import Request
 
-from src.config import API_KEY_NAME
+from src.config import settings
 from src.services.scrapers.common import InfoExtractService
 from fastapi import Security
 from src.auth import verify_api_key
@@ -20,8 +20,8 @@ async def get_item_route(request: Request):
     url = query_params.pop("url")
     ban_list = query_params.pop("ban_list", None)
     logger.debug(f"get_item_route: url: {url}, query_params: {query_params}")
-    if API_KEY_NAME in query_params:
-        query_params.pop(API_KEY_NAME)
+    if settings.API_KEY_NAME in query_params:
+        query_params.pop(settings.API_KEY_NAME)
     url_metadata = await get_url_metadata(url, ban_list)
     item = InfoExtractService(url_metadata, **query_params)
     result = await item.get_item()
diff --git a/apps/api/src/services/amazon/s3.py b/apps/api/src/services/amazon/s3.py
index e0e13aa..e034ec5 100644
--- a/apps/api/src/services/amazon/s3.py
+++ b/apps/api/src/services/amazon/s3.py
@@ -11,13 +11,13 @@
 from fastfetchbot_shared.utils.logger import logger
 from fastfetchbot_shared.utils.network import download_file_to_local
 
-from src.config import AWS_S3_BUCKET_NAME, AWS_REGION_NAME, AWS_DOMAIN_HOST
+from src.config import settings
 
 session = aioboto3.Session()
 image_url_host = (
-    AWS_DOMAIN_HOST
-    if AWS_DOMAIN_HOST
-    else f"{AWS_S3_BUCKET_NAME}.s3.{AWS_REGION_NAME}.amazonaws.com"
+    settings.AWS_DOMAIN_HOST
+    if settings.AWS_DOMAIN_HOST
+    else f"{settings.AWS_S3_BUCKET_NAME}.s3.{settings.AWS_REGION_NAME}.amazonaws.com"
 )
 
 
@@ -40,11 +40,13 @@ async def download_and_upload(url: str, referer: str = None, suite: str = "test"
 
 async def upload(
     staging_path: Path,
-    bucket: str = AWS_S3_BUCKET_NAME,
+    bucket: str = None,
     suite: str = "test",
     release: str = datetime.now().strftime("%Y-%m-%d"),
     file_name: str = None,
 ) -> str:
+    if bucket is None:
+        bucket = settings.AWS_S3_BUCKET_NAME
     if not file_name:
         file_name = uuid.uuid4().hex
     blob_s3_key = f"{suite}/{release}/{file_name}"
diff --git a/apps/api/src/services/celery_client.py b/apps/api/src/services/celery_client.py
index 49a94ec..d303447 100644
--- a/apps/api/src/services/celery_client.py
+++ b/apps/api/src/services/celery_client.py
@@ -1,8 +1,8 @@
 from celery import Celery
 
-from src.config import CELERY_BROKER_URL, CELERY_RESULT_BACKEND
+from src.config import settings
 
 celery_app = Celery(
     "fastfetchbot_worker",
-    broker=CELERY_BROKER_URL,
-    backend=CELERY_RESULT_BACKEND,
+    broker=settings.CELERY_BROKER_URL,
+    backend=settings.CELERY_RESULT_BACKEND,
 )
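The `upload()` signature change above is more than cosmetics. With `bucket: str = AWS_S3_BUCKET_NAME`, the default was evaluated once at import time and frozen; resolving a `None` default inside the function defers the lookup to call time (the same idiom the PR applies to `store_database` in `scrapers/common.py`). A self-contained illustration of the difference, using hypothetical names:

```python
CONFIG_VALUE = "old-bucket"

def bound_at_import(bucket: str = CONFIG_VALUE) -> str:
    # Default captured when `def` executes; later config changes are invisible.
    return bucket

def bound_at_call(bucket: str | None = None) -> str:
    if bucket is None:
        bucket = CONFIG_VALUE  # looked up on every call
    return bucket

CONFIG_VALUE = "new-bucket"
print(bound_at_import())  # "old-bucket": stale, frozen at definition time
print(bound_at_call())    # "new-bucket": reflects the current value
```

By the same logic, the untouched `release: str = datetime.now().strftime("%Y-%m-%d")` default still binds at import, so a long-running API process keeps stamping uploads with its start date; that pre-existing wrinkle is out of this PR's scope but worth a follow-up.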
diff --git a/apps/api/src/services/file_export/audio_transcribe/__init__.py b/apps/api/src/services/file_export/audio_transcribe/__init__.py
index 810d52f..3cc282c 100644
--- a/apps/api/src/services/file_export/audio_transcribe/__init__.py
+++ b/apps/api/src/services/file_export/audio_transcribe/__init__.py
@@ -2,7 +2,7 @@
 from fastfetchbot_shared.services.file_export.audio_transcribe import AudioTranscribe as BaseAudioTranscribe
 from src.services.celery_client import celery_app
-from src.config import DOWNLOAD_VIDEO_TIMEOUT
+from src.config import settings
 
 
 class AudioTranscribe(BaseAudioTranscribe):
@@ -12,5 +12,5 @@ def __init__(self, audio_file: str):
         super().__init__(
             audio_file=audio_file,
             celery_app=celery_app,
-            timeout=DOWNLOAD_VIDEO_TIMEOUT,
+            timeout=settings.DOWNLOAD_VIDEO_TIMEOUT,
         )
diff --git a/apps/api/src/services/file_export/document_export/pdf_export.py b/apps/api/src/services/file_export/document_export/pdf_export.py
index 4a0193c..4ad1a8c 100644
--- a/apps/api/src/services/file_export/document_export/pdf_export.py
+++ b/apps/api/src/services/file_export/document_export/pdf_export.py
@@ -5,7 +5,7 @@
 import aiofiles.os
 
 from fastfetchbot_shared.services.file_export.pdf_export import PdfExport as BasePdfExport, wrap_html_string
-from src.config import DOWNLOAD_VIDEO_TIMEOUT, AWS_STORAGE_ON
+from src.config import settings
 from src.services.celery_client import celery_app
 from src.services.amazon.s3 import upload as upload_to_s3
 from fastfetchbot_shared.utils.logger import logger
@@ -27,13 +27,13 @@ def __init__(self, title: str, html_string: str = None):
             title=title,
             html_string=html_string,
             celery_app=celery_app,
-            timeout=DOWNLOAD_VIDEO_TIMEOUT,
+            timeout=settings.DOWNLOAD_VIDEO_TIMEOUT,
         )
 
     async def export(self) -> str:
         output_filename = await super().export()
-        if AWS_STORAGE_ON:
+        if settings.AWS_STORAGE_ON:
             local_filename = output_filename
             output_filename = await upload_file_to_s3(Path(output_filename))
             await aiofiles.os.remove(local_filename)
diff --git a/apps/api/src/services/file_export/video_download/__init__.py b/apps/api/src/services/file_export/video_download/__init__.py
index a8a2e48..8ec9bcf 100644
--- a/apps/api/src/services/file_export/video_download/__init__.py
+++ b/apps/api/src/services/file_export/video_download/__init__.py
@@ -4,7 +4,7 @@
 from fastfetchbot_shared.services.file_export.video_download import VideoDownloader as BaseVideoDownloader
 from src.services.celery_client import celery_app
-from src.config import DOWNLOAD_VIDEO_TIMEOUT
+from src.config import settings
 
 
 class VideoDownloader(BaseVideoDownloader):
@@ -25,7 +25,7 @@ def __init__(
             url=url,
             category=category,
             celery_app=celery_app,
-            timeout=DOWNLOAD_VIDEO_TIMEOUT,
+            timeout=settings.DOWNLOAD_VIDEO_TIMEOUT,
             data=data,
             download=download,
             audio_only=audio_only,
diff --git a/apps/api/src/services/inoreader/__init__.py b/apps/api/src/services/inoreader/__init__.py
index 1343079..2bbaa15 100644
--- a/apps/api/src/services/inoreader/__init__.py
+++ b/apps/api/src/services/inoreader/__init__.py
@@ -10,12 +10,7 @@
 from fastfetchbot_shared.utils.network import HEADERS
 from fastfetchbot_shared.utils.logger import logger
 from fastfetchbot_shared.utils.parse import get_html_text_length
-from src.config import (
-    INOREADER_APP_ID,
-    INOREADER_APP_KEY,
-    INOREADER_EMAIL,
-    INOREADER_PASSWORD,
-)
+from src.config import settings
 
 INOREADER_CONTENT_URL = "https://www.inoreader.com/reader/api/0/stream/contents/"
 TAG_PATH = "user/-/label/"
@@ -144,8 +139,8 @@ async def get_api_info(
         resp = await client.post(
             INOREADER_LOGIN_URL,
             params={
-                "Email": INOREADER_EMAIL,
-                "Passwd": 
INOREADER_PASSWORD, + "Email": settings.INOREADER_EMAIL, + "Passwd": settings.INOREADER_PASSWORD, }, ) authorization = resp.text.split("\n")[2].split("=")[1] @@ -156,8 +151,8 @@ async def get_api_info( params = params or {} params.update( { - "AppId": INOREADER_APP_ID, - "AppKey": INOREADER_APP_KEY, + "AppId": settings.INOREADER_APP_ID, + "AppKey": settings.INOREADER_APP_KEY, } ) resp = await client.get( diff --git a/apps/api/src/services/inoreader/process.py b/apps/api/src/services/inoreader/process.py index 7fc16e3..4e7afd4 100644 --- a/apps/api/src/services/inoreader/process.py +++ b/apps/api/src/services/inoreader/process.py @@ -2,7 +2,7 @@ import httpx -from src.config import TELEGRAM_BOT_CALLBACK_URL +from src.config import settings from fastfetchbot_shared.models.url_metadata import UrlMetadata from src.services.inoreader import Inoreader from src.services.scrapers.common import InfoExtractService @@ -19,7 +19,7 @@ async def _default_message_callback(metadata_item: dict, chat_id: Union[int, str """Default callback that sends via HTTP to the Telegram bot service.""" async with httpx.AsyncClient() as client: await client.post( - f"{TELEGRAM_BOT_CALLBACK_URL}/send_message", + f"{settings.TELEGRAM_BOT_CALLBACK_URL}/send_message", json={"data": metadata_item, "chat_id": str(chat_id)}, timeout=120, ) diff --git a/apps/api/src/services/scrapers/common.py b/apps/api/src/services/scrapers/common.py index ec78f54..2782dec 100644 --- a/apps/api/src/services/scrapers/common.py +++ b/apps/api/src/services/scrapers/common.py @@ -8,7 +8,7 @@ from src.services.file_export import video_download, document_export from src.database import save_instances from fastfetchbot_shared.utils.logger import logger -from src.config import DATABASE_ON +from src.config import settings class InfoExtractService(CoreInfoExtractService): @@ -24,11 +24,13 @@ def __init__( self, url_metadata: UrlMetadata, data: Any = None, - store_database: Optional[bool] = DATABASE_ON, + store_database: Optional[bool] = None, store_telegraph: Optional[bool] = True, store_document: Optional[bool] = False, **kwargs, ): + if store_database is None: + store_database = settings.DATABASE_ON super().__init__( url_metadata, data=data, diff --git a/apps/async-worker/async_worker/celery_client.py b/apps/async-worker/async_worker/celery_client.py index 3f0d665..80e968f 100644 --- a/apps/async-worker/async_worker/celery_client.py +++ b/apps/async-worker/async_worker/celery_client.py @@ -1,9 +1,9 @@ from celery import Celery -from async_worker.config import CELERY_BROKER_URL, CELERY_RESULT_BACKEND +from async_worker.config import settings celery_app = Celery( "fastfetchbot_worker", - broker=CELERY_BROKER_URL, - backend=CELERY_RESULT_BACKEND, + broker=settings.CELERY_BROKER_URL, + backend=settings.CELERY_RESULT_BACKEND, ) diff --git a/apps/async-worker/async_worker/config.py b/apps/async-worker/async_worker/config.py index 40cf333..d5cb61b 100644 --- a/apps/async-worker/async_worker/config.py +++ b/apps/async-worker/async_worker/config.py @@ -1,29 +1,41 @@ import os -from fastfetchbot_shared.utils.parse import get_env_bool +from pydantic import model_validator +from pydantic_settings import BaseSettings, SettingsConfigDict -env = os.environ -# ARQ Redis (task queue) -ARQ_REDIS_URL = env.get("ARQ_REDIS_URL", "redis://localhost:6379/2") +class AsyncWorkerSettings(BaseSettings): + model_config = SettingsConfigDict(extra="ignore") -# Outbox Redis (result delivery) -OUTBOX_REDIS_URL = env.get("OUTBOX_REDIS_URL", "redis://localhost:6379/3") 
-OUTBOX_QUEUE_KEY = env.get("OUTBOX_QUEUE_KEY", "scrape:outbox") + # ARQ Redis + ARQ_REDIS_URL: str = "redis://localhost:6379/2" -# Celery (for PDF export tasks on existing worker) -CELERY_BROKER_URL = env.get("CELERY_BROKER_URL", "redis://localhost:6379/0") -CELERY_RESULT_BACKEND = env.get("CELERY_RESULT_BACKEND", "redis://localhost:6379/1") + # Outbox Redis + OUTBOX_REDIS_URL: str = "redis://localhost:6379/3" + OUTBOX_QUEUE_KEY: str = "scrape:outbox" -# Feature flags -STORE_TELEGRAPH = get_env_bool(env, "STORE_TELEGRAPH", True) -STORE_DOCUMENT = get_env_bool(env, "STORE_DOCUMENT", False) -DATABASE_ON = get_env_bool(env, "DATABASE_ON", False) + # Celery + CELERY_BROKER_URL: str = "redis://localhost:6379/0" + CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1" -# MongoDB (optional, for DB storage) -MONGODB_HOST = env.get("MONGODB_HOST", "localhost") -MONGODB_PORT = int(env.get("MONGODB_PORT", 27017)) -MONGODB_URL = env.get("MONGODB_URL", f"mongodb://{MONGODB_HOST}:{MONGODB_PORT}") + # Feature flags + STORE_TELEGRAPH: bool = True + STORE_DOCUMENT: bool = False + DATABASE_ON: bool = False -# Download timeout for Celery PDF tasks -DOWNLOAD_VIDEO_TIMEOUT = int(env.get("DOWNLOAD_VIDEO_TIMEOUT", 600)) + # MongoDB + MONGODB_HOST: str = "localhost" + MONGODB_PORT: int = 27017 + MONGODB_URL: str = "" + + # Timeout + DOWNLOAD_VIDEO_TIMEOUT: int = 600 + + @model_validator(mode="after") + def _resolve_derived(self) -> "AsyncWorkerSettings": + if not self.MONGODB_URL: + self.MONGODB_URL = f"mongodb://{self.MONGODB_HOST}:{self.MONGODB_PORT}" + return self + + +settings = AsyncWorkerSettings() diff --git a/apps/async-worker/async_worker/main.py b/apps/async-worker/async_worker/main.py index 88a76ad..be4f3bd 100644 --- a/apps/async-worker/async_worker/main.py +++ b/apps/async-worker/async_worker/main.py @@ -2,7 +2,7 @@ from arq.connections import RedisSettings -from async_worker.config import ARQ_REDIS_URL +from async_worker.config import settings from async_worker.tasks.scrape import scrape_and_enrich # The twitter-api-client-v2 library installs uvloop's EventLoopPolicy at @@ -39,7 +39,7 @@ class WorkerSettings: """ARQ worker configuration.""" functions = [scrape_and_enrich] - redis_settings = parse_redis_url(ARQ_REDIS_URL) + redis_settings = parse_redis_url(settings.ARQ_REDIS_URL) # Job timeout: 10 minutes (matches existing Celery soft limit) job_timeout = 600 diff --git a/apps/async-worker/async_worker/services/enrichment.py b/apps/async-worker/async_worker/services/enrichment.py index 5b796c6..2254bcb 100644 --- a/apps/async-worker/async_worker/services/enrichment.py +++ b/apps/async-worker/async_worker/services/enrichment.py @@ -3,7 +3,7 @@ from fastfetchbot_shared.models.metadata_item import MessageType from fastfetchbot_shared.services.telegraph import Telegraph from fastfetchbot_shared.utils.logger import logger -from async_worker.config import STORE_TELEGRAPH, STORE_DOCUMENT, DOWNLOAD_VIDEO_TIMEOUT +from async_worker.config import settings async def enrich( @@ -17,9 +17,9 @@ async def enrich( - PDF export (via shared PdfExport → Celery worker) """ if store_telegraph is None: - store_telegraph = STORE_TELEGRAPH + store_telegraph = settings.STORE_TELEGRAPH if store_document is None: - store_document = STORE_DOCUMENT + store_document = settings.STORE_DOCUMENT # Force Telegraph for long messages if metadata_item.get("message_type") == MessageType.LONG: @@ -49,7 +49,7 @@ async def enrich( title=metadata_item["title"], html_string=metadata_item["content"], celery_app=celery_app, - 
timeout=DOWNLOAD_VIDEO_TIMEOUT, + timeout=settings.DOWNLOAD_VIDEO_TIMEOUT, ) output_filename = await pdf_export.export() metadata_item["media_files"].append( diff --git a/apps/async-worker/async_worker/services/outbox.py b/apps/async-worker/async_worker/services/outbox.py index 40ec9a8..b45e6ed 100644 --- a/apps/async-worker/async_worker/services/outbox.py +++ b/apps/async-worker/async_worker/services/outbox.py @@ -2,7 +2,7 @@ import redis.asyncio as aioredis -from async_worker.config import OUTBOX_REDIS_URL, OUTBOX_QUEUE_KEY +from async_worker.config import settings from fastfetchbot_shared.utils.logger import logger _redis: aioredis.Redis | None = None @@ -12,7 +12,7 @@ async def get_outbox_redis() -> aioredis.Redis: """Get or create the outbox Redis connection.""" global _redis if _redis is None: - _redis = aioredis.from_url(OUTBOX_REDIS_URL, decode_responses=True) + _redis = aioredis.from_url(settings.OUTBOX_REDIS_URL, decode_responses=True) return _redis @@ -30,7 +30,7 @@ async def push( falling back to the plain ``OUTBOX_QUEUE_KEY`` for backward compatibility. """ r = await get_outbox_redis() - queue_key = f"{OUTBOX_QUEUE_KEY}:{bot_id}" if bot_id is not None else OUTBOX_QUEUE_KEY + queue_key = f"{settings.OUTBOX_QUEUE_KEY}:{bot_id}" if bot_id is not None else settings.OUTBOX_QUEUE_KEY payload = { "job_id": job_id, "chat_id": chat_id, diff --git a/apps/async-worker/async_worker/tasks/scrape.py b/apps/async-worker/async_worker/tasks/scrape.py index 8f62c36..e362608 100644 --- a/apps/async-worker/async_worker/tasks/scrape.py +++ b/apps/async-worker/async_worker/tasks/scrape.py @@ -6,7 +6,7 @@ from fastfetchbot_shared.utils.logger import logger from async_worker.services import outbox, enrichment from async_worker.celery_client import celery_app -from async_worker.config import DOWNLOAD_VIDEO_TIMEOUT +from async_worker.config import settings async def scrape_and_enrich( @@ -53,7 +53,7 @@ async def scrape_and_enrich( store_telegraph=False, # We handle enrichment separately store_document=False, celery_app=celery_app, - timeout=DOWNLOAD_VIDEO_TIMEOUT, + timeout=settings.DOWNLOAD_VIDEO_TIMEOUT, **kwargs, ) metadata_item = await service.get_item() diff --git a/apps/telegram-bot/core/api_client.py b/apps/telegram-bot/core/api_client.py index 90364d1..b806364 100644 --- a/apps/telegram-bot/core/api_client.py +++ b/apps/telegram-bot/core/api_client.py @@ -1,17 +1,17 @@ import httpx -from core.config import API_SERVER_URL, API_KEY, API_KEY_NAME +from core.config import settings from fastfetchbot_shared.utils.logger import logger async def get_item(url: str, ban_list: list = None, **kwargs) -> dict: """Call API server's /scraper/getItem endpoint.""" - params = {"url": url, API_KEY_NAME: API_KEY} + params = {"url": url, settings.API_KEY_NAME: settings.API_KEY} params.update(kwargs) if ban_list: params["ban_list"] = ",".join(ban_list) async with httpx.AsyncClient() as client: resp = await client.post( - f"{API_SERVER_URL}/scraper/getItem", + f"{settings.API_SERVER_URL}/scraper/getItem", params=params, timeout=120, ) @@ -21,12 +21,12 @@ async def get_item(url: str, ban_list: list = None, **kwargs) -> dict: async def get_url_metadata(url: str, ban_list: list = None) -> dict: """Call API server's /scraper/getUrlMetadata endpoint.""" - params = {"url": url, API_KEY_NAME: API_KEY} + params = {"url": url, settings.API_KEY_NAME: settings.API_KEY} if ban_list: params["ban_list"] = ",".join(ban_list) async with httpx.AsyncClient() as client: resp = await client.post( - 
f"{API_SERVER_URL}/scraper/getUrlMetadata", + f"{settings.API_SERVER_URL}/scraper/getUrlMetadata", params=params, timeout=30, ) diff --git a/apps/telegram-bot/core/config.py b/apps/telegram-bot/core/config.py index c637c7f..35bf676 100644 --- a/apps/telegram-bot/core/config.py +++ b/apps/telegram-bot/core/config.py @@ -1,152 +1,166 @@ import os import secrets +from typing import Optional, Union from jinja2 import Environment, FileSystemLoader +from pydantic import Field, computed_field, model_validator +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class TelegramBotSettings(BaseSettings): + model_config = SettingsConfigDict(extra="ignore") + + # API Server connection + API_SERVER_URL: str = "http://localhost:10450" + API_KEY_NAME: str = "pwd" + API_KEY: str = Field(default_factory=lambda: secrets.token_urlsafe(32)) + + # Bot's own BASE_URL + BASE_URL: str = "localhost" + + # Telegram bot + TELEGRAM_BOT_ON: bool = True + TELEGRAM_BOT_MODE: str = "polling" + TELEGRAM_BOT_TOKEN: Optional[str] = None + TELEGRAM_BOT_SECRET_TOKEN: str = Field( + default_factory=lambda: secrets.token_urlsafe(32) + ) + + # Channel IDs (raw comma-separated string, parsed after instantiation) + TELEGRAM_CHANNEL_ID: str = "" + TELEGRAM_CHANNEL_ADMIN_LIST: str = "" + TELEBOT_DEBUG_CHANNEL: str = "" + + # Telegram Bot API server + TELEBOT_API_SERVER_HOST: Optional[str] = None + TELEBOT_API_SERVER_PORT: Optional[str] = None + + # Telegram Bot server port + TELEGRAM_BOT_PORT: int = 10451 + + # Telegram Bot timeouts (env var names differ from field names) + TELEBOT_CONNECT_TIMEOUT: int = Field(default=15, validation_alias="TELEGRAM_CONNECT_TIMEOUT") + TELEBOT_READ_TIMEOUT: int = Field(default=60, validation_alias="TELEGRAM_READ_TIMEOUT") + TELEBOT_WRITE_TIMEOUT: int = Field(default=60, validation_alias="TELEGRAM_WRITE_TIMEOUT") + TELEBOT_MAX_RETRY: int = Field(default=5, validation_alias="TELEGRAM_MAX_RETRY") + + # Telegram image limits (fix bug: use separate env var names) + TELEGRAM_IMAGE_DIMENSION_LIMIT: int = 1600 + TELEGRAM_IMAGE_SIZE_LIMIT: int = 5242880 + + # Ban lists (raw comma-separated, parsed after instantiation) + TELEGRAM_GROUP_MESSAGE_BAN_LIST: str = "" + TELEGRAM_BOT_MESSAGE_BAN_LIST: str = "" + + # Feature flags + FILE_EXPORTER_ON: bool = True + OPENAI_API_KEY: Optional[str] = None + GENERAL_SCRAPING_ON: bool = False + + # Scrape mode + SCRAPE_MODE: str = "queue" + + # Redis URLs + ARQ_REDIS_URL: str = "redis://localhost:6379/2" + OUTBOX_REDIS_URL: str = "redis://localhost:6379/3" + OUTBOX_QUEUE_KEY: str = "scrape:outbox" + + # Database + ITEM_DATABASE_ON: bool = False + MONGODB_PORT: int = 27017 + MONGODB_HOST: str = "localhost" + MONGODB_URL: str = "" + SETTINGS_DATABASE_URL: str = "sqlite+aiosqlite:///data/fastfetchbot.db" + + # Template language + TEMPLATE_LANGUAGE: str = "zh_CN" + + @model_validator(mode="after") + def _resolve_derived(self) -> "TelegramBotSettings": + if not self.MONGODB_URL: + self.MONGODB_URL = f"mongodb://{self.MONGODB_HOST}:{self.MONGODB_PORT}" + return self + + @computed_field + @property + def TELEGRAM_WEBHOOK_URL(self) -> str: + return f"https://{self.BASE_URL}/webhook" + + @computed_field + @property + def TELEBOT_API_SERVER(self) -> str: + if self.TELEBOT_API_SERVER_HOST and self.TELEBOT_API_SERVER_PORT: + return f"http://{self.TELEBOT_API_SERVER_HOST}:{self.TELEBOT_API_SERVER_PORT}/bot" + return "https://api.telegram.org/bot" + + @computed_field + @property + def TELEBOT_API_SERVER_FILE(self) -> str: + if self.TELEBOT_API_SERVER_HOST and 
self.TELEBOT_API_SERVER_PORT: + return f"http://{self.TELEBOT_API_SERVER_HOST}:{self.TELEBOT_API_SERVER_PORT}/file/bot" + return "https://api.telegram.org/file/bot" + + @computed_field + @property + def TELEBOT_LOCAL_FILE_MODE(self) -> bool: + return self.TELEBOT_API_SERVER != "https://api.telegram.org/bot" + + +settings = TelegramBotSettings() + +# --- Non-settings module-level objects --- -from fastfetchbot_shared.utils.parse import get_env_bool - -env = os.environ +# Jinja2 template configuration current_directory = os.path.dirname(os.path.abspath(__file__)) +templates_directory = os.path.join(current_directory, "templates") +JINJA2_ENV = Environment( + loader=FileSystemLoader(templates_directory), lstrip_blocks=True, trim_blocks=True +) -# API Server connection (for calling the FastFetchBot API server) -API_SERVER_URL = env.get("API_SERVER_URL", "http://localhost:10450") -API_KEY_NAME = env.get("API_KEY_NAME", "pwd") -API_KEY = env.get("API_KEY", secrets.token_urlsafe(32)) -# Bot's own BASE_URL (for webhook registration) -BASE_URL = env.get("BASE_URL", "localhost") +# --- Parsed channel/ban list values --- -# Telegram bot environment variables -TELEGRAM_BOT_ON = get_env_bool(env, "TELEGRAM_BOT_ON", True) -TELEGRAM_BOT_MODE = env.get("TELEGRAM_BOT_MODE", "polling") -TELEGRAM_BOT_TOKEN = env.get("TELEGRAM_BOT_TOKEN", None) -TELEGRAM_BOT_SECRET_TOKEN = env.get( - "TELEGRAM_BOT_SECRET_TOKEN", secrets.token_urlsafe(32) -) +def _parse_channel_ids(raw: str) -> Optional[list[Union[str, int]]]: + result: list[Union[str, int]] = [] + for cid in raw.split(","): + cid = cid.strip() + if cid.startswith("@"): + result.append(cid) + elif cid.startswith("-1"): + result.append(int(cid)) + return result or None -# Telegram channel configuration -TELEGRAM_CHANNEL_ID = [] -telegram_channel_id = env.get("TELEGRAM_CHANNEL_ID", "").split(",") -for single_telegram_channel_id in telegram_channel_id: - if single_telegram_channel_id.startswith("@"): - TELEGRAM_CHANNEL_ID.append(single_telegram_channel_id) - elif single_telegram_channel_id.startswith("-1"): - TELEGRAM_CHANNEL_ID.append(int(single_telegram_channel_id)) -if len(TELEGRAM_CHANNEL_ID) == 0: - TELEGRAM_CHANNEL_ID = None - -# Debug channel -telebot_debug_channel = env.get("TELEBOT_DEBUG_CHANNEL", "") -if telebot_debug_channel.startswith("@"): - TELEBOT_DEBUG_CHANNEL = telebot_debug_channel -elif telebot_debug_channel.startswith("-1"): - TELEBOT_DEBUG_CHANNEL = int(telebot_debug_channel) -else: - TELEBOT_DEBUG_CHANNEL = None - -# Channel admin list -telegram_channel_admin_list = env.get("TELEGRAM_CHANNEL_ADMIN_LIST", "") -TELEGRAM_CHANNEL_ADMIN_LIST = [ - admin_id for admin_id in telegram_channel_admin_list.split(",") -] -if not TELEGRAM_CHANNEL_ADMIN_LIST: - TELEGRAM_CHANNEL_ADMIN_LIST = None - -# Webhook URL (constructed from bot's own BASE_URL) -TELEGRAM_WEBHOOK_URL = f"https://{BASE_URL}/webhook" - -# Telegram Bot API server configuration -TELEBOT_API_SERVER_HOST = env.get("TELEBOT_API_SERVER_HOST", None) -TELEBOT_API_SERVER_PORT = env.get("TELEBOT_API_SERVER_PORT", None) -TELEBOT_API_SERVER = ( - f"http://{TELEBOT_API_SERVER_HOST}:{TELEBOT_API_SERVER_PORT}" + "/bot" - if (TELEBOT_API_SERVER_HOST and TELEBOT_API_SERVER_PORT) - else "https://api.telegram.org/bot" -) -TELEBOT_API_SERVER_FILE = ( - f"http://{TELEBOT_API_SERVER_HOST}:{TELEBOT_API_SERVER_PORT}" + "/file/bot" - if (TELEBOT_API_SERVER_HOST and TELEBOT_API_SERVER_PORT) - else "https://api.telegram.org/file/bot" -) -TELEBOT_LOCAL_FILE_MODE = ( - False if TELEBOT_API_SERVER == 
"https://api.telegram.org/bot" else True -) -# Telegram Bot server port -TELEGRAM_BOT_PORT = int(env.get("TELEGRAM_BOT_PORT", 10451)) or 10451 +def _parse_debug_channel(raw: str) -> Optional[Union[str, int]]: + raw = raw.strip() + if raw.startswith("@"): + return raw + elif raw.startswith("-1"): + return int(raw) + return None -# Telegram Bot timeouts -TELEBOT_CONNECT_TIMEOUT = int(env.get("TELEGRAM_CONNECT_TIMEOUT", 15)) or 15 -TELEBOT_READ_TIMEOUT = int(env.get("TELEGRAM_READ_TIMEOUT", 60)) or 60 -TELEBOT_WRITE_TIMEOUT = int(env.get("TELEGRAM_WRITE_TIMEOUT", 60)) or 60 -TELEBOT_MAX_RETRY = int(env.get("TELEGRAM_MAX_RETRY", 5)) or 5 -# Telegram image limits -TELEGRAM_IMAGE_DIMENSION_LIMIT = int(env.get("TELEGRAM_IMAGE_SIZE_LIMIT", 1600)) or 1600 -TELEGRAM_IMAGE_SIZE_LIMIT = ( - int(env.get("TELEGRAM_IMAGE_SIZE_LIMIT", 5242880)) or 5242880 -) - -# Ban lists -telegram_group_message_ban_list = env.get("TELEGRAM_GROUP_MESSAGE_BAN_LIST", "") -telegram_bot_message_ban_list = env.get("TELEGRAM_BOT_MESSAGE_BAN_LIST", "") +def _parse_admin_list(raw: str) -> Optional[list[str]]: + result = [admin_id.strip() for admin_id in raw.split(",") if admin_id.strip()] + return result or None def ban_list_resolver(ban_list_string: str) -> list: - ban_list = ban_list_string.split(",") + ban_list = [item.strip() for item in ban_list_string.split(",") if item.strip()] + expanded = list(ban_list) for item in ban_list: if item == "social_media": - ban_list.extend( - [ - "weibo", - "twitter", - "instagram", - "zhihu", - "douban", - "wechat", - "xiaohongshu", - "reddit", - ] - ) + expanded.extend([ + "weibo", "twitter", "instagram", "zhihu", + "douban", "wechat", "xiaohongshu", "reddit", + ]) elif item == "video": - ban_list.extend(["youtube", "bilibili"]) - return ban_list - + expanded.extend(["youtube", "bilibili"]) + return expanded -TELEGRAM_GROUP_MESSAGE_BAN_LIST = ban_list_resolver(telegram_group_message_ban_list) -TELEGRAM_BOT_MESSAGE_BAN_LIST = ban_list_resolver(telegram_bot_message_ban_list) - -# Feature flags (needed for handler logic) -FILE_EXPORTER_ON = get_env_bool(env, "FILE_EXPORTER_ON", True) -OPENAI_API_KEY = env.get("OPENAI_API_KEY", None) -GENERAL_SCRAPING_ON = get_env_bool(env, "GENERAL_SCRAPING_ON", False) - -# Scrape mode: "api" (sync via API server) or "queue" (async via ARQ worker) -SCRAPE_MODE = env.get("SCRAPE_MODE", "queue") - -# Redis URLs for queue mode -ARQ_REDIS_URL = env.get("ARQ_REDIS_URL", "redis://localhost:6379/2") -OUTBOX_REDIS_URL = env.get("OUTBOX_REDIS_URL", "redis://localhost:6379/3") -OUTBOX_QUEUE_KEY = env.get("OUTBOX_QUEUE_KEY", "scrape:outbox") - -# Database configuration -ITEM_DATABASE_ON = get_env_bool(env, "ITEM_DATABASE_ON", False) -MONGODB_PORT = int(env.get("MONGODB_PORT", 27017)) or 27017 -MONGODB_HOST = env.get("MONGODB_HOST", "localhost") -MONGODB_URL = env.get("MONGODB_URL", f"mongodb://{MONGODB_HOST}:{MONGODB_PORT}") - -# User settings database (SQLAlchemy async) -SETTINGS_DATABASE_URL = env.get( - "SETTINGS_DATABASE_URL", "sqlite+aiosqlite:///data/fastfetchbot.db" -) - -# Jinja2 template configuration -templates_directory = os.path.join(current_directory, "templates") -JINJA2_ENV = Environment( - loader=FileSystemLoader(templates_directory), lstrip_blocks=True, trim_blocks=True -) -# Template language -TEMPLATE_LANGUAGE = env.get( - "TEMPLATE_LANGUAGE", "zh_CN" -) # It is a workaround for translation system +TELEGRAM_CHANNEL_ID = _parse_channel_ids(settings.TELEGRAM_CHANNEL_ID) +TELEBOT_DEBUG_CHANNEL = _parse_debug_channel(settings.TELEBOT_DEBUG_CHANNEL) 
+TELEGRAM_CHANNEL_ADMIN_LIST = _parse_admin_list(settings.TELEGRAM_CHANNEL_ADMIN_LIST) +TELEGRAM_GROUP_MESSAGE_BAN_LIST = ban_list_resolver(settings.TELEGRAM_GROUP_MESSAGE_BAN_LIST) +TELEGRAM_BOT_MESSAGE_BAN_LIST = ban_list_resolver(settings.TELEGRAM_BOT_MESSAGE_BAN_LIST) diff --git a/apps/telegram-bot/core/database.py b/apps/telegram-bot/core/database.py index 7942ee3..f2dd79d 100644 --- a/apps/telegram-bot/core/database.py +++ b/apps/telegram-bot/core/database.py @@ -3,13 +3,13 @@ from motor.motor_asyncio import AsyncIOMotorClient from beanie import init_beanie, Document -from core.config import MONGODB_URL +from core.config import settings from core.models.database_model import document_list from fastfetchbot_shared.utils.logger import logger async def startup() -> None: - client = AsyncIOMotorClient(MONGODB_URL) + client = AsyncIOMotorClient(settings.MONGODB_URL) await init_beanie(database=client["telegram_bot"], document_models=document_list) diff --git a/apps/telegram-bot/core/handlers/buttons.py b/apps/telegram-bot/core/handlers/buttons.py index c550d1e..5d85149 100644 --- a/apps/telegram-bot/core/handlers/buttons.py +++ b/apps/telegram-bot/core/handlers/buttons.py @@ -11,10 +11,7 @@ from core import api_client from core.services.message_sender import send_item_message from fastfetchbot_shared.utils.logger import logger -from core.config import ( - TELEGRAM_CHANNEL_ID, - SCRAPE_MODE, -) +from core.config import settings, TELEGRAM_CHANNEL_ID async def buttons_process(update: Update, context: CallbackContext) -> None: @@ -60,7 +57,7 @@ async def buttons_process(update: Update, context: CallbackContext) -> None: await query.answer("Video processing...") extra_args = data["extra_args"] if "extra_args" in data else {} - if SCRAPE_MODE == "queue": + if settings.SCRAPE_MODE == "queue": from core import queue_client replying_message = await query.message.reply_text( diff --git a/apps/telegram-bot/core/handlers/messages.py b/apps/telegram-bot/core/handlers/messages.py index b828ef2..ee4d4d4 100644 --- a/apps/telegram-bot/core/handlers/messages.py +++ b/apps/telegram-bot/core/handlers/messages.py @@ -15,10 +15,7 @@ from core.models.telegram_chat import TelegramMessage, TelegramUser, TelegramChat from core.services.user_settings import ensure_user_settings from fastfetchbot_shared.utils.logger import logger -from core.config import ( - TELEBOT_DEBUG_CHANNEL, - ITEM_DATABASE_ON, -) +from core.config import settings, TELEBOT_DEBUG_CHANNEL async def all_messages_process(update: Update, context: CallbackContext) -> None: @@ -34,7 +31,7 @@ async def all_messages_process(update: Update, context: CallbackContext) -> None "Failed to ensure user settings for user {}", message.from_user.id ) - if message and ITEM_DATABASE_ON: + if message and settings.ITEM_DATABASE_ON: telegram_chat = TelegramChat.construct(**message.chat.to_dict()) telegram_user = TelegramUser.construct(**message.from_user.to_dict()) telegram_message = TelegramMessage( diff --git a/apps/telegram-bot/core/handlers/url_process.py b/apps/telegram-bot/core/handlers/url_process.py index 74113ea..a3a962a 100644 --- a/apps/telegram-bot/core/handlers/url_process.py +++ b/apps/telegram-bot/core/handlers/url_process.py @@ -12,14 +12,11 @@ from fastfetchbot_shared.utils.config import SOCIAL_MEDIA_WEBSITE_PATTERNS, VIDEO_WEBSITE_PATTERNS from fastfetchbot_shared.utils.logger import logger from core.config import ( + settings, TELEGRAM_CHANNEL_ID, TELEGRAM_CHANNEL_ADMIN_LIST, TELEGRAM_GROUP_MESSAGE_BAN_LIST, TELEGRAM_BOT_MESSAGE_BAN_LIST, - 
FILE_EXPORTER_ON, - OPENAI_API_KEY, - GENERAL_SCRAPING_ON, - SCRAPE_MODE, ) @@ -30,7 +27,7 @@ async def _get_url_metadata(url: str, ban_list: list | None = None) -> dict: In queue mode: calls the shared library's get_url_metadata directly (pure URL parsing, no network call needed). """ - if SCRAPE_MODE == "queue": + if settings.SCRAPE_MODE == "queue": from fastfetchbot_shared.utils.parse import get_url_metadata as shared_get_url_metadata url_metadata = await shared_get_url_metadata(url, ban_list=ban_list) @@ -61,7 +58,7 @@ async def _fetch_and_send( message: Optional telegram Message for reply context. **kwargs: Extra arguments passed to the scraper. """ - if SCRAPE_MODE == "queue": + if settings.SCRAPE_MODE == "queue": from core import queue_client await queue_client.enqueue_scrape( @@ -104,7 +101,7 @@ async def https_url_process(update: Update, context: CallbackContext) -> None: ) return if url_metadata["source"] == "unknown": - if GENERAL_SCRAPING_ON: + if settings.GENERAL_SCRAPING_ON: await process_message.edit_text( text=f"Uncategorized url found. General webpage parser is on, Processing..." ) @@ -166,7 +163,7 @@ async def https_url_process(update: Update, context: CallbackContext) -> None: ), ] ) - if FILE_EXPORTER_ON: + if settings.FILE_EXPORTER_ON: special_function_keyboard.extend( [ InlineKeyboardButton( @@ -193,7 +190,7 @@ async def https_url_process(update: Update, context: CallbackContext) -> None: ), ] ) - if OPENAI_API_KEY: + if settings.OPENAI_API_KEY: special_function_keyboard.append( InlineKeyboardButton( "Transcribe Text", @@ -233,7 +230,7 @@ async def https_url_process(update: Update, context: CallbackContext) -> None: ), ] ) - if FILE_EXPORTER_ON: + if settings.FILE_EXPORTER_ON: special_function_keyboard.append( InlineKeyboardButton( "Send with PDF", @@ -271,7 +268,7 @@ async def _auto_fetch_urls(message) -> None: url_metadata = await _get_url_metadata( url, ban_list=TELEGRAM_BOT_MESSAGE_BAN_LIST ) - if url_metadata["source"] == "unknown" and GENERAL_SCRAPING_ON: + if url_metadata["source"] == "unknown" and settings.GENERAL_SCRAPING_ON: await _fetch_and_send( url=url_metadata["url"], chat_id=message.chat_id, @@ -304,7 +301,7 @@ async def https_url_auto_process(update: Update, context: CallbackContext) -> No url_metadata = await _get_url_metadata( url, ban_list=TELEGRAM_GROUP_MESSAGE_BAN_LIST ) - if url_metadata["source"] == "unknown" and GENERAL_SCRAPING_ON: + if url_metadata["source"] == "unknown" and settings.GENERAL_SCRAPING_ON: await _fetch_and_send( url=url_metadata["url"], chat_id=message.chat_id, diff --git a/apps/telegram-bot/core/main.py b/apps/telegram-bot/core/main.py index c68b0ad..d1ee59c 100644 --- a/apps/telegram-bot/core/main.py +++ b/apps/telegram-bot/core/main.py @@ -1,13 +1,13 @@ import uvicorn from core.webhook.server import webhook_app, callback_app -from core.config import TELEGRAM_BOT_MODE, TELEGRAM_BOT_PORT +from core.config import settings from fastfetchbot_shared.utils.logger import logger if __name__ == "__main__": - if TELEGRAM_BOT_MODE == "webhook": - logger.info(f"Running in webhook mode on port {TELEGRAM_BOT_PORT}") - uvicorn.run(webhook_app, host="0.0.0.0", port=TELEGRAM_BOT_PORT) + if settings.TELEGRAM_BOT_MODE == "webhook": + logger.info(f"Running in webhook mode on port {settings.TELEGRAM_BOT_PORT}") + uvicorn.run(webhook_app, host="0.0.0.0", port=settings.TELEGRAM_BOT_PORT) else: - logger.info(f"Running in polling mode (HTTP server on port {TELEGRAM_BOT_PORT} for callbacks)") - uvicorn.run(callback_app, host="0.0.0.0", 
port=TELEGRAM_BOT_PORT) + logger.info(f"Running in polling mode (HTTP server on port {settings.TELEGRAM_BOT_PORT} for callbacks)") + uvicorn.run(callback_app, host="0.0.0.0", port=settings.TELEGRAM_BOT_PORT) diff --git a/apps/telegram-bot/core/queue_client.py b/apps/telegram-bot/core/queue_client.py index 8a31039..2c1f596 100644 --- a/apps/telegram-bot/core/queue_client.py +++ b/apps/telegram-bot/core/queue_client.py @@ -2,7 +2,7 @@ from arq.connections import ArqRedis, create_pool, RedisSettings -from core.config import ARQ_REDIS_URL +from core.config import settings from fastfetchbot_shared.utils.logger import logger _arq_redis: ArqRedis | None = None @@ -31,7 +31,7 @@ async def init(bot_id: int) -> None: global _arq_redis, _bot_id if _arq_redis is None: _bot_id = bot_id - _arq_redis = await create_pool(_parse_redis_url(ARQ_REDIS_URL)) + _arq_redis = await create_pool(_parse_redis_url(settings.ARQ_REDIS_URL)) logger.info(f"ARQ queue client initialized for bot_id={bot_id}") diff --git a/apps/telegram-bot/core/services/bot_app.py b/apps/telegram-bot/core/services/bot_app.py index 5de6ea4..449a4dd 100644 --- a/apps/telegram-bot/core/services/bot_app.py +++ b/apps/telegram-bot/core/services/bot_app.py @@ -18,20 +18,7 @@ ) from fastfetchbot_shared.utils.logger import logger -from core.config import ( - TELEGRAM_BOT_TOKEN, - TELEGRAM_BOT_MODE, - TELEGRAM_WEBHOOK_URL, - TELEGRAM_BOT_SECRET_TOKEN, - TELEBOT_API_SERVER, - TELEBOT_API_SERVER_FILE, - TELEBOT_LOCAL_FILE_MODE, - TELEBOT_CONNECT_TIMEOUT, - TELEBOT_READ_TIMEOUT, - TELEBOT_WRITE_TIMEOUT, - TELEBOT_MAX_RETRY, - SCRAPE_MODE, -) +from core.config import settings from core.handlers.url_process import https_url_process, https_url_auto_process from core.handlers.buttons import buttons_process, invalid_buttons @@ -47,26 +34,26 @@ async def set_webhook() -> bool: - logger.debug(f"set_webhook: {TELEGRAM_WEBHOOK_URL}, secret_token: {TELEGRAM_BOT_SECRET_TOKEN}") + logger.debug(f"set_webhook: {settings.TELEGRAM_WEBHOOK_URL}, secret_token: {settings.TELEGRAM_BOT_SECRET_TOKEN}") return await application.bot.set_webhook( - url=TELEGRAM_WEBHOOK_URL, secret_token=TELEGRAM_BOT_SECRET_TOKEN + url=settings.TELEGRAM_WEBHOOK_URL, secret_token=settings.TELEGRAM_BOT_SECRET_TOKEN ) -if TELEGRAM_BOT_TOKEN is not None: +if settings.TELEGRAM_BOT_TOKEN is not None: builder = ( Application.builder() - .token(TELEGRAM_BOT_TOKEN) + .token(settings.TELEGRAM_BOT_TOKEN) .arbitrary_callback_data(True) - .connect_timeout(TELEBOT_CONNECT_TIMEOUT) - .read_timeout(TELEBOT_READ_TIMEOUT) - .write_timeout(TELEBOT_WRITE_TIMEOUT) - .base_url(TELEBOT_API_SERVER) - .base_file_url(TELEBOT_API_SERVER_FILE) - .local_mode(TELEBOT_LOCAL_FILE_MODE) - .rate_limiter(AIORateLimiter(max_retries=TELEBOT_MAX_RETRY)) + .connect_timeout(settings.TELEBOT_CONNECT_TIMEOUT) + .read_timeout(settings.TELEBOT_READ_TIMEOUT) + .write_timeout(settings.TELEBOT_WRITE_TIMEOUT) + .base_url(settings.TELEBOT_API_SERVER) + .base_file_url(settings.TELEBOT_API_SERVER_FILE) + .local_mode(settings.TELEBOT_LOCAL_FILE_MODE) + .rate_limiter(AIORateLimiter(max_retries=settings.TELEBOT_MAX_RETRY)) ) - if TELEGRAM_BOT_MODE == "webhook": + if settings.TELEGRAM_BOT_MODE == "webhook": builder = builder.updater(None) application = builder.build() else: @@ -132,7 +119,7 @@ async def startup() -> None: ] ) # Initialize queue mode if enabled - if SCRAPE_MODE == "queue": + if settings.SCRAPE_MODE == "queue": from core import queue_client from core.services import outbox_consumer @@ -172,9 +159,9 @@ async def show_bot_info() -> 
None:
     logger.info(f"Can Join Groups: {bot_info.can_join_groups}")
     logger.info(f"Can Read All Group Messages: {bot_info.can_read_all_group_messages}")
     logger.info(f"Supports Inline Queries: {bot_info.supports_inline_queries}")
-    logger.info(f"Mode: {TELEGRAM_BOT_MODE}")
+    logger.info(f"Mode: {settings.TELEGRAM_BOT_MODE}")
 
-    if TELEGRAM_BOT_MODE == "webhook":
+    if settings.TELEGRAM_BOT_MODE == "webhook":
         webhook_info = await bot.get_webhook_info()
         logger.info(f"Webhook URL: {webhook_info.url}")
         logger.info(f"Webhook Has Custom Certificate: {webhook_info.has_custom_certificate}")
@@ -190,7 +177,7 @@ async def show_bot_info() -> None:
 
 async def shutdown() -> None:
     # Shut down queue mode resources
-    if SCRAPE_MODE == "queue":
+    if settings.SCRAPE_MODE == "queue":
         from core import queue_client
         from core.services import outbox_consumer
diff --git a/apps/telegram-bot/core/services/message_sender.py b/apps/telegram-bot/core/services/message_sender.py
index 05c33b9..666b568 100644
--- a/apps/telegram-bot/core/services/message_sender.py
+++ b/apps/telegram-bot/core/services/message_sender.py
@@ -23,14 +23,7 @@
 from fastfetchbot_shared.utils.network import download_file_by_metadata_item
 from fastfetchbot_shared.utils.image import Image, image_compressing, check_image_type
 from fastfetchbot_shared.utils.logger import logger
-from core.config import (
-    TELEBOT_API_SERVER,
-    TELEBOT_WRITE_TIMEOUT,
-    TELEGRAM_IMAGE_DIMENSION_LIMIT,
-    TELEGRAM_IMAGE_SIZE_LIMIT,
-    JINJA2_ENV,
-    TEMPLATE_LANGUAGE,
-)
+from core.config import settings, JINJA2_ENV
 from core.services.constants import (
     TELEGRAM_SINGLE_MESSAGE_MEDIA_LIMIT,
     TELEGRAM_FILE_UPLOAD_LIMIT,
@@ -41,7 +34,7 @@
 
 environment = JINJA2_ENV
 template = environment.get_template("social_media_message.jinja2")
 template_text = TEMPLATE_TRANSLATION.get(
-    TEMPLATE_LANGUAGE, TEMPLATE_TRANSLATION["zh_CN"]
+    settings.TEMPLATE_LANGUAGE, TEMPLATE_TRANSLATION["zh_CN"]
 )
 
 
@@ -102,7 +95,7 @@ async def send_item_message(
                 media=media_group,
                 parse_mode=ParseMode.HTML,
                 caption=caption_text,
-                write_timeout=TELEBOT_WRITE_TIMEOUT,
+                write_timeout=settings.TELEBOT_WRITE_TIMEOUT,
                 reply_to_message_id=_reply_to,
             )
             if sent_media_files_message is tuple:
@@ -259,7 +252,7 @@ async def media_files_packaging(media_files: list, data: dict) -> tuple:
                 continue
             # check the file size
             if (
-                not TELEBOT_API_SERVER
+                not settings.TELEBOT_LOCAL_FILE_MODE
             ):  # the official telegram bot api server only supports 50MB file
                 if file_size > TELEGRAM_FILE_UPLOAD_LIMIT:
                     # if the size is over 50MB, skip this file
@@ -284,9 +277,9 @@ async def media_files_packaging(media_files: list, data: dict) -> tuple:
                 # don't try to resize image if the ratio is too large
                 if (
                     ratio < 5
-                    or max(img_height, img_width) < TELEGRAM_IMAGE_DIMENSION_LIMIT
+                    or max(img_height, img_width) < settings.TELEGRAM_IMAGE_DIMENSION_LIMIT
                 ):
-                    image = image_compressing(image, TELEGRAM_IMAGE_DIMENSION_LIMIT)
+                    image = image_compressing(image, settings.TELEGRAM_IMAGE_DIMENSION_LIMIT)
                     with BytesIO() as buffer:
                         # mime_type file format
                         image.save(buffer, format=ext)
@@ -303,9 +296,9 @@ async def media_files_packaging(media_files: list, data: dict) -> tuple:
                     f"image size: {file_size}, ratio: {ratio}, width: {img_width}, height: {img_height}"
                 )
                 if (
-                    file_size > TELEGRAM_IMAGE_SIZE_LIMIT
-                    or img_width > TELEGRAM_IMAGE_DIMENSION_LIMIT
-                    or img_height > TELEGRAM_IMAGE_DIMENSION_LIMIT
+                    file_size > settings.TELEGRAM_IMAGE_SIZE_LIMIT
+                    or img_width > settings.TELEGRAM_IMAGE_DIMENSION_LIMIT
+                    or img_height > settings.TELEGRAM_IMAGE_DIMENSION_LIMIT
                 )
and data["category"] not in ["xiaohongshu"]: io_object = await download_file_by_metadata_item( url=image_url, data=data diff --git a/apps/telegram-bot/core/services/outbox_consumer.py b/apps/telegram-bot/core/services/outbox_consumer.py index 426eb27..6415235 100644 --- a/apps/telegram-bot/core/services/outbox_consumer.py +++ b/apps/telegram-bot/core/services/outbox_consumer.py @@ -3,7 +3,7 @@ import redis.asyncio as aioredis -from core.config import OUTBOX_REDIS_URL, OUTBOX_QUEUE_KEY +from core.config import settings from core.services.message_sender import send_item_message from fastfetchbot_shared.utils.logger import logger @@ -16,14 +16,14 @@ async def _get_redis() -> aioredis.Redis: """Get or create the outbox Redis connection.""" global _redis if _redis is None: - _redis = aioredis.from_url(OUTBOX_REDIS_URL, decode_responses=True) + _redis = aioredis.from_url(settings.OUTBOX_REDIS_URL, decode_responses=True) return _redis async def _consume_loop() -> None: """Background loop: BRPOP from the per-bot outbox queue and dispatch results.""" r = await _get_redis() - key = _outbox_key or OUTBOX_QUEUE_KEY + key = _outbox_key or settings.OUTBOX_QUEUE_KEY logger.info(f"Outbox consumer started, listening on '{key}'") while True: @@ -87,7 +87,7 @@ async def start(bot_id: int) -> None: if _consumer_task is not None: logger.warning("Outbox consumer already running") return - _outbox_key = f"{OUTBOX_QUEUE_KEY}:{bot_id}" + _outbox_key = f"{settings.OUTBOX_QUEUE_KEY}:{bot_id}" _consumer_task = asyncio.create_task(_consume_loop()) logger.info(f"Outbox consumer task created for bot_id={bot_id}") diff --git a/apps/telegram-bot/core/webhook/server.py b/apps/telegram-bot/core/webhook/server.py index b3c9700..96ae377 100644 --- a/apps/telegram-bot/core/webhook/server.py +++ b/apps/telegram-bot/core/webhook/server.py @@ -8,7 +8,7 @@ from core.services.bot_app import process_telegram_update from core.services.message_sender import send_item_message -from core.config import TELEGRAM_BOT_SECRET_TOKEN +from core.config import settings from fastfetchbot_shared.utils.logger import logger @@ -21,17 +21,16 @@ async def lifespan(app): update_queue all share one event loop. 
""" from core.services.bot_app import startup, shutdown, set_webhook, start_polling, show_bot_info - from core.config import TELEGRAM_BOT_TOKEN, TELEGRAM_BOT_MODE, ITEM_DATABASE_ON from fastfetchbot_shared.database import init_db, close_db # -- startup -- - if ITEM_DATABASE_ON: + if settings.ITEM_DATABASE_ON: from core import database await database.startup() await init_db() - if TELEGRAM_BOT_TOKEN: + if settings.TELEGRAM_BOT_TOKEN: await startup() - if TELEGRAM_BOT_MODE == "webhook": + if settings.TELEGRAM_BOT_MODE == "webhook": result = await set_webhook() if result: logger.info("Webhook registered successfully") @@ -44,17 +43,17 @@ async def lifespan(app): yield # -- shutdown -- - if TELEGRAM_BOT_TOKEN: + if settings.TELEGRAM_BOT_TOKEN: await shutdown() await close_db() - if ITEM_DATABASE_ON: + if settings.ITEM_DATABASE_ON: from core import database await database.shutdown() async def telegram_webhook(request: Request): secret = request.headers.get("X-Telegram-Bot-Api-Secret-Token") - if secret != TELEGRAM_BOT_SECRET_TOKEN: + if secret != settings.TELEGRAM_BOT_SECRET_TOKEN: return JSONResponse({"error": "unauthorized"}, status_code=401) data = await request.json() logger.debug(f"Telegram webhook update received: {data.get('update_id', 'unknown')}") diff --git a/apps/worker/worker_core/config.py b/apps/worker/worker_core/config.py index cb4fdc1..031d554 100644 --- a/apps/worker/worker_core/config.py +++ b/apps/worker/worker_core/config.py @@ -1,24 +1,32 @@ import os -from fastfetchbot_shared.utils.parse import get_env_bool +from pydantic import model_validator +from pydantic_settings import BaseSettings, SettingsConfigDict -env = os.environ -current_directory = os.path.dirname(os.path.abspath(__file__)) -conf_dir = os.path.join(current_directory, "..", "conf") +class WorkerSettings(BaseSettings): + model_config = SettingsConfigDict(extra="ignore") -CELERY_BROKER_URL = env.get("CELERY_BROKER_URL", "redis://localhost:6379/0") -CELERY_RESULT_BACKEND = env.get("CELERY_RESULT_BACKEND", "redis://localhost:6379/1") + CELERY_BROKER_URL: str = "redis://localhost:6379/0" + CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1" -# Conf directory: defaults to apps/worker/conf/ (same convention as API's apps/api/conf/) -# In Docker, override via CONF_DIR env var to /app/conf (where the volume is mounted) -CONF_DIR = env.get("CONF_DIR", conf_dir) + # Conf directory + CONF_DIR: str = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "conf") -# File export config -DOWNLOAD_DIR = env.get("DOWNLOAD_DIR", "/tmp") -COOKIE_FILE_PATH = env.get("COOKIE_FILE_PATH", os.path.join(CONF_DIR, "cookies.txt")) -PROXY_MODE = get_env_bool(env, "PROXY_MODE", False) -PROXY_URL = env.get("PROXY_URL", "") -YOUTUBE_COOKIE = get_env_bool(env, "YOUTUBE_COOKIE", False) -BILIBILI_COOKIE = get_env_bool(env, "BILIBILI_COOKIE", False) -OPENAI_API_KEY = env.get("OPENAI_API_KEY", "") + # File export + DOWNLOAD_DIR: str = "/tmp" + COOKIE_FILE_PATH: str = "" + PROXY_MODE: bool = False + PROXY_URL: str = "" + YOUTUBE_COOKIE: bool = False + BILIBILI_COOKIE: bool = False + OPENAI_API_KEY: str = "" + + @model_validator(mode="after") + def _resolve_derived(self) -> "WorkerSettings": + if not self.COOKIE_FILE_PATH: + self.COOKIE_FILE_PATH = os.path.join(self.CONF_DIR, "cookies.txt") + return self + + +settings = WorkerSettings() diff --git a/apps/worker/worker_core/main.py b/apps/worker/worker_core/main.py index 348cb66..a8b6f52 100644 --- a/apps/worker/worker_core/main.py +++ b/apps/worker/worker_core/main.py @@ -1,10 +1,10 @@ 
diff --git a/apps/worker/worker_core/main.py b/apps/worker/worker_core/main.py
index 348cb66..a8b6f52 100644
--- a/apps/worker/worker_core/main.py
+++ b/apps/worker/worker_core/main.py
@@ -1,10 +1,10 @@
 from celery import Celery
 
-from worker_core.config import CELERY_BROKER_URL, CELERY_RESULT_BACKEND
+from worker_core.config import settings
 
 app = Celery(
     "fastfetchbot_worker",
-    broker=CELERY_BROKER_URL,
-    backend=CELERY_RESULT_BACKEND,
+    broker=settings.CELERY_BROKER_URL,
+    backend=settings.CELERY_RESULT_BACKEND,
 )
 
 app.conf.update(
diff --git a/apps/worker/worker_core/tasks/pdf.py b/apps/worker/worker_core/tasks/pdf.py
index 8f47060..1c36958 100644
--- a/apps/worker/worker_core/tasks/pdf.py
+++ b/apps/worker/worker_core/tasks/pdf.py
@@ -1,5 +1,5 @@
 from worker_core.main import app
-from worker_core.config import DOWNLOAD_DIR
+from worker_core.config import settings
 from fastfetchbot_file_export.pdf_export import export_pdf
 from fastfetchbot_shared.utils.logger import logger
@@ -8,13 +8,13 @@ def pdf_export_task(html_string: str, output_filename: str) -> dict:
     logger.info(
         f"pdf_export_task started: output_filename={output_filename}, "
-        f"html_string length={len(html_string)}, DOWNLOAD_DIR={DOWNLOAD_DIR}"
+        f"html_string length={len(html_string)}, DOWNLOAD_DIR={settings.DOWNLOAD_DIR}"
     )
     try:
         output_path = export_pdf(
             html_string=html_string,
             output_filename=output_filename,
-            download_dir=DOWNLOAD_DIR,
+            download_dir=settings.DOWNLOAD_DIR,
         )
     except Exception:
         logger.exception(f"pdf_export_task failed: output_filename={output_filename}")
diff --git a/apps/worker/worker_core/tasks/transcribe.py b/apps/worker/worker_core/tasks/transcribe.py
index 82068d6..849d9ab 100644
--- a/apps/worker/worker_core/tasks/transcribe.py
+++ b/apps/worker/worker_core/tasks/transcribe.py
@@ -1,5 +1,5 @@
 from worker_core.main import app
-from worker_core.config import OPENAI_API_KEY
+from worker_core.config import settings
 from fastfetchbot_file_export.transcribe import get_audio_text
 from fastfetchbot_shared.utils.logger import logger
@@ -7,11 +7,11 @@
 
 @app.task(name="file_export.transcribe")
 def transcribe_task(audio_file: str) -> dict:
     logger.info(f"transcribe_task started: audio_file={audio_file}")
-    if not OPENAI_API_KEY:
+    if not settings.OPENAI_API_KEY:
         logger.error("transcribe_task failed: OPENAI_API_KEY is not set")
         raise ValueError("OPENAI_API_KEY is not configured in the worker environment")
     try:
-        transcript = get_audio_text(audio_file, OPENAI_API_KEY)
+        transcript = get_audio_text(audio_file, settings.OPENAI_API_KEY)
     except Exception:
         logger.exception(f"transcribe_task failed: audio_file={audio_file}")
         raise
diff --git a/apps/worker/worker_core/tasks/video.py b/apps/worker/worker_core/tasks/video.py
index 767a737..2609e68 100644
--- a/apps/worker/worker_core/tasks/video.py
+++ b/apps/worker/worker_core/tasks/video.py
@@ -1,12 +1,5 @@
 from worker_core.main import app
-from worker_core.config import (
-    DOWNLOAD_DIR,
-    COOKIE_FILE_PATH,
-    PROXY_MODE,
-    PROXY_URL,
-    YOUTUBE_COOKIE,
-    BILIBILI_COOKIE,
-)
+from worker_core.config import settings
 from fastfetchbot_file_export.video_download import download_video
 from fastfetchbot_shared.utils.logger import logger
@@ -47,12 +40,12 @@ def video_download_task(
 ) -> dict:
     logger.info(f"video_download_task started: url={url}, extractor={extractor}, download={download}, hd={hd}, audio_only={audio_only}")
     config = {
-        "DOWNLOAD_DIR": DOWNLOAD_DIR,
-        "COOKIE_FILE_PATH": COOKIE_FILE_PATH,
-        "PROXY_MODE": PROXY_MODE,
-        "PROXY_URL": PROXY_URL,
-        "YOUTUBE_COOKIE": YOUTUBE_COOKIE,
-        "BILIBILI_COOKIE": BILIBILI_COOKIE,
+        "DOWNLOAD_DIR": settings.DOWNLOAD_DIR,
+        "COOKIE_FILE_PATH": settings.COOKIE_FILE_PATH,
+        "PROXY_MODE": settings.PROXY_MODE,
+        "PROXY_URL": 
settings.PROXY_URL, + "YOUTUBE_COOKIE": settings.YOUTUBE_COOKIE, + "BILIBILI_COOKIE": settings.BILIBILI_COOKIE, "LOCAL_MODE": True, } try: diff --git a/packages/shared/fastfetchbot_shared/config.py b/packages/shared/fastfetchbot_shared/config.py index fca6987..76530fd 100644 --- a/packages/shared/fastfetchbot_shared/config.py +++ b/packages/shared/fastfetchbot_shared/config.py @@ -1,23 +1,37 @@ import os import tempfile -from fastfetchbot_shared.utils.parse import get_env_bool +from pydantic import model_validator +from pydantic_settings import BaseSettings, SettingsConfigDict -env = os.environ -# Filesystem environment variables -TEMP_DIR = env.get("TEMP_DIR", tempfile.gettempdir()) -WORK_DIR = env.get("WORK_DIR", os.getcwd()) -DOWNLOAD_DIR = env.get("DOWNLOAD_DIR", os.path.join(WORK_DIR, "download")) -DEBUG_MODE = get_env_bool(env, "DEBUG_MODE", False) +class SharedSettings(BaseSettings): + model_config = SettingsConfigDict(extra="ignore") -# Logging environment variables -LOG_FILE_PATH = env.get("LOG_FILE_PATH", TEMP_DIR) -LOG_LEVEL = env.get("LOG_LEVEL", "DEBUG") + # Filesystem + TEMP_DIR: str = tempfile.gettempdir() + WORK_DIR: str = os.getcwd() + DOWNLOAD_DIR: str = "" + DEBUG_MODE: bool = False -# Utils environment variables -HTTP_REQUEST_TIMEOUT = env.get("HTTP_REQUEST_TIMEOUT", 30) + # Logging + LOG_FILE_PATH: str = "" + LOG_LEVEL: str = "DEBUG" -# XHS (Xiaohongshu) shared configuration -SIGN_SERVER_URL = env.get("SIGN_SERVER_URL", "http://localhost:8989") -XHS_COOKIE_PATH = env.get("XHS_COOKIE_PATH", "") + # Utils + HTTP_REQUEST_TIMEOUT: int = 30 + + # XHS (Xiaohongshu) shared configuration + SIGN_SERVER_URL: str = "http://localhost:8989" + XHS_COOKIE_PATH: str = "" + + @model_validator(mode="after") + def _resolve_derived(self) -> "SharedSettings": + if not self.DOWNLOAD_DIR: + self.DOWNLOAD_DIR = os.path.join(self.WORK_DIR, "download") + if not self.LOG_FILE_PATH: + self.LOG_FILE_PATH = self.TEMP_DIR + return self + + +settings = SharedSettings() diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/config.py b/packages/shared/fastfetchbot_shared/services/scrapers/config.py index 3c28cb0..d4b5916 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/config.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/config.py @@ -1,137 +1,205 @@ import json import os import tempfile +from typing import Optional from jinja2 import Environment, FileSystemLoader +from pydantic import computed_field, model_validator +from pydantic_settings import BaseSettings, SettingsConfigDict from fastfetchbot_shared.utils.cookie import read_json_cookies_to_string from fastfetchbot_shared.utils.logger import logger -from fastfetchbot_shared.utils.parse import get_env_bool - -env = os.environ - -# Filesystem environment variables -TEMP_DIR = env.get("TEMP_DIR", tempfile.gettempdir()) -WORK_DIR = env.get("WORK_DIR", os.getcwd()) -DOWNLOAD_DIR = env.get("DOWNLOAD_DIR", os.path.join(WORK_DIR, "download")) -DEBUG_MODE = get_env_bool(env, "DEBUG_MODE", False) - -# Cookie/config file directory — defaults to /conf but can be overridden -CONF_DIR = env.get("CONF_DIR", os.path.join(WORK_DIR, "conf")) +from fastfetchbot_shared.utils.pydantic_types import _parse_comma_list, _parse_optional_comma_list + + +class ScrapersSettings(BaseSettings): + model_config = SettingsConfigDict(extra="ignore") + + # Filesystem + TEMP_DIR: str = tempfile.gettempdir() + WORK_DIR: str = os.getcwd() + DOWNLOAD_DIR: str = "" + DEBUG_MODE: bool = False + CONF_DIR: str = "" + TEMPLATE_LANGUAGE: str = 
"zh_CN" + + # XHS sign server and cookie path (also declared in SharedSettings, read independently) + SIGN_SERVER_URL: str = "http://localhost:8989" + XHS_COOKIE_PATH: str = "" + + # X-RapidAPI (shared by Twitter and Instagram scrapers) + X_RAPIDAPI_KEY: Optional[str] = None + + # Twitter + TWITTER_EMAIL: Optional[str] = None + TWITTER_PASSWORD: Optional[str] = None + TWITTER_USERNAME: Optional[str] = None + TWITTER_CT0: Optional[str] = None + TWITTER_AUTH_TOKEN: Optional[str] = None + + # Bluesky + BLUESKY_USERNAME: Optional[str] = None + BLUESKY_PASSWORD: Optional[str] = None + + # Weibo (cookie loaded externally) + WEIBO_COOKIES: Optional[str] = None + + # Xiaohongshu + XIAOHONGSHU_A1: Optional[str] = None + XIAOHONGSHU_WEBID: Optional[str] = None + XIAOHONGSHU_WEBSESSION: Optional[str] = None + # Stored as comma-separated strings; access parsed lists via computed properties + XHS_PHONE_LIST: str = "" + XHS_IP_PROXY_LIST: str = "" + XHS_ENABLE_IP_PROXY: bool = False + XHS_SAVE_LOGIN_STATE: bool = True + + # Zhihu + FXZHIHU_HOST: str = "fxzhihu.com" + ZHIHU_Z_C0: Optional[str] = None + + # Reddit + REDDIT_CLIENT_ID: Optional[str] = None + REDDIT_CLIENT_SECRET: Optional[str] = None + REDDIT_PASSWORD: Optional[str] = None + REDDIT_USERNAME: Optional[str] = None + + # OpenAI + OPENAI_API_KEY: Optional[str] = None + + # General webpage scraping + GENERAL_SCRAPING_ON: bool = False + GENERAL_SCRAPING_API: str = "FIRECRAWL" + + # Firecrawl API + FIRECRAWL_API_URL: str = "" + FIRECRAWL_API_KEY: str = "" + FIRECRAWL_WAIT_FOR: str = "3000" + FIRECRAWL_USE_JSON_EXTRACTION: bool = False + + # Zyte API + ZYTE_API_KEY: Optional[str] = None + + # Telegraph (comma-separated string; access parsed list via computed property) + TELEGRAPH_TOKEN_LIST: str = "" + + @model_validator(mode="after") + def _resolve_derived(self) -> "ScrapersSettings": + if not self.DOWNLOAD_DIR: + self.DOWNLOAD_DIR = os.path.join(self.WORK_DIR, "download") + if not self.CONF_DIR: + self.CONF_DIR = os.path.join(self.WORK_DIR, "conf") + return self + + @computed_field + @property + def xhs_phone_list(self) -> list[str]: + """Parse XHS_PHONE_LIST comma-separated string into a list.""" + return _parse_comma_list(self.XHS_PHONE_LIST) + + @computed_field + @property + def xhs_ip_proxy_list(self) -> list[str]: + """Parse XHS_IP_PROXY_LIST comma-separated string into a list.""" + return _parse_comma_list(self.XHS_IP_PROXY_LIST) + + @computed_field + @property + def telegraph_token_list(self) -> Optional[list[str]]: + """Parse TELEGRAPH_TOKEN_LIST comma-separated string into a list, None if empty.""" + return _parse_optional_comma_list(self.TELEGRAPH_TOKEN_LIST) + + @computed_field + @property + def TWITTER_COOKIES(self) -> dict[str, Optional[str]]: + return {"ct0": self.TWITTER_CT0, "auth_token": self.TWITTER_AUTH_TOKEN} + + @computed_field + @property + def XIAOHONGSHU_COOKIES(self) -> dict[str, Optional[str]]: + return { + "a1": self.XIAOHONGSHU_A1, + "web_id": self.XIAOHONGSHU_WEBID, + "web_session": self.XIAOHONGSHU_WEBSESSION, + } + + @property + def firecrawl_wait_for_int(self) -> int: + """Parse FIRECRAWL_WAIT_FOR as int with fallback to 3000.""" + try: + val = int(self.FIRECRAWL_WAIT_FOR) + return val if val else 3000 + except (ValueError, TypeError): + return 3000 + + +settings = ScrapersSettings() + +# --- Non-settings module-level objects --- # Templates & Jinja2 templates_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), "templates") JINJA2_ENV = Environment( loader=FileSystemLoader(templates_directory), 
lstrip_blocks=True, trim_blocks=True ) -TEMPLATE_LANGUAGE = env.get("TEMPLATE_LANGUAGE", "zh_CN") - -# X-RapidAPI (shared by Twitter and Instagram scrapers) -X_RAPIDAPI_KEY = env.get("X_RAPIDAPI_KEY", None) - -# Twitter -TWITTER_EMAIL = env.get("TWITTER_EMAIL", None) -TWITTER_PASSWORD = env.get("TWITTER_PASSWORD", None) -TWITTER_USERNAME = env.get("TWITTER_USERNAME", None) -TWITTER_CT0 = env.get("TWITTER_CT0", None) -TWITTER_AUTH_TOKEN = env.get("TWITTER_AUTH_TOKEN", None) -TWITTER_COOKIES = { - "ct0": TWITTER_CT0, - "auth_token": TWITTER_AUTH_TOKEN, -} - -# Bluesky -BLUESKY_USERNAME = env.get("BLUESKY_USERNAME", None) -BLUESKY_PASSWORD = env.get("BLUESKY_PASSWORD", None) - -# Weibo -weibo_cookies_path = os.path.join(CONF_DIR, "weibo_cookies.json") -if os.path.exists(weibo_cookies_path): - WEIBO_COOKIES = read_json_cookies_to_string(weibo_cookies_path) -else: - WEIBO_COOKIES = env.get("WEIBO_COOKIES", None) - -# Xiaohongshu -XIAOHONGSHU_A1 = env.get("XIAOHONGSHU_A1", None) -XIAOHONGSHU_WEBID = env.get("XIAOHONGSHU_WEBID", None) -XIAOHONGSHU_WEBSESSION = env.get("XIAOHONGSHU_WEBSESSION", None) -XIAOHONGSHU_COOKIES = { - "a1": XIAOHONGSHU_A1, - "web_id": XIAOHONGSHU_WEBID, - "web_session": XIAOHONGSHU_WEBSESSION, -} -XHS_PHONE_LIST = env.get("XHS_PHONE_LIST", "").split(",") -XHS_IP_PROXY_LIST = env.get("XHS_IP_PROXY_LIST", "").split(",") -XHS_ENABLE_IP_PROXY = get_env_bool(env, "XHS_ENABLE_IP_PROXY", False) -XHS_SAVE_LOGIN_STATE = get_env_bool(env, "XHS_SAVE_LOGIN_STATE", True) - -# XHS sign server and cookie file -from fastfetchbot_shared.config import SIGN_SERVER_URL as XHS_SIGN_SERVER_URL -from fastfetchbot_shared.config import XHS_COOKIE_PATH as _XHS_COOKIE_PATH - -xhs_cookie_path = _XHS_COOKIE_PATH or os.path.join(CONF_DIR, "xhs_cookies.txt") - -XHS_COOKIE_STRING = "" -if os.path.exists(xhs_cookie_path): - try: - with open(xhs_cookie_path, "r", encoding="utf-8") as f: - XHS_COOKIE_STRING = f.read().strip() - except (IOError, OSError) as e: - logger.error(f"Error reading XHS cookie file: {e}") - XHS_COOKIE_STRING = "" -else: + + +# --- Cookie file loading (standalone functions) --- + +def _load_weibo_cookies(conf_dir: str, env_fallback: Optional[str]) -> Optional[str]: + weibo_cookies_path = os.path.join(conf_dir, "weibo_cookies.json") + if os.path.exists(weibo_cookies_path): + return read_json_cookies_to_string(weibo_cookies_path) + return env_fallback + + +def _load_xhs_cookies( + conf_dir: str, + xhs_cookie_path: str, + a1: Optional[str], + webid: Optional[str], + websession: Optional[str], +) -> str: + cookie_path = xhs_cookie_path or os.path.join(conf_dir, "xhs_cookies.txt") + if os.path.exists(cookie_path): + try: + with open(cookie_path, "r", encoding="utf-8") as f: + return f.read().strip() + except (IOError, OSError) as e: + logger.error(f"Error reading XHS cookie file: {e}") + return "" cookie_parts = [] - if XIAOHONGSHU_A1: - cookie_parts.append(f"a1={XIAOHONGSHU_A1}") - if XIAOHONGSHU_WEBID: - cookie_parts.append(f"web_id={XIAOHONGSHU_WEBID}") - if XIAOHONGSHU_WEBSESSION: - cookie_parts.append(f"web_session={XIAOHONGSHU_WEBSESSION}") - XHS_COOKIE_STRING = "; ".join(cookie_parts) - -# Zhihu -FXZHIHU_HOST = env.get("FXZHIHU_HOST", "fxzhihu.com") -ZHIHU_Z_C0 = env.get("ZHIHU_Z_C0", None) - -zhihu_cookie_path = os.path.join(CONF_DIR, "zhihu_cookies.json") -if os.path.exists(zhihu_cookie_path): - try: - with open(zhihu_cookie_path, "r") as f: - ZHIHU_COOKIES_JSON = json.load(f) - except json.JSONDecodeError: - logger.error("Error: zhihu_cookies.json is not in a valid JSON 
format.") - ZHIHU_COOKIES_JSON = None - except FileNotFoundError: - logger.error("Error: zhihu_cookies.json does not exist.") - ZHIHU_COOKIES_JSON = None -else: - ZHIHU_COOKIES_JSON = None - -# Reddit -REDDIT_CLIENT_ID = env.get("REDDIT_CLIENT_ID", None) -REDDIT_CLIENT_SECRET = env.get("REDDIT_CLIENT_SECRET", None) -REDDIT_PASSWORD = env.get("REDDIT_PASSWORD", None) -REDDIT_USERNAME = env.get("REDDIT_USERNAME", None) - -# Open AI API -OPENAI_API_KEY = env.get("OPENAI_API_KEY", None) - -# General webpage scraping -GENERAL_SCRAPING_ON = get_env_bool(env, "GENERAL_SCRAPING_ON", False) -GENERAL_SCRAPING_API = env.get("GENERAL_SCRAPING_API", "FIRECRAWL") - -# Firecrawl API -FIRECRAWL_API_URL = env.get("FIRECRAWL_API_URL", "") -FIRECRAWL_API_KEY = env.get("FIRECRAWL_API_KEY", "") -try: - FIRECRAWL_WAIT_FOR = int(env.get("FIRECRAWL_WAIT_FOR") or 3000) -except (ValueError, TypeError): - FIRECRAWL_WAIT_FOR = 3000 -FIRECRAWL_USE_JSON_EXTRACTION = get_env_bool(env, "FIRECRAWL_USE_JSON_EXTRACTION", False) - -# Zyte API -ZYTE_API_KEY = env.get("ZYTE_API_KEY", None) - -# Telegraph -telegraph_token_list = env.get("TELEGRAPH_TOKEN_LIST", "") -TELEGRAPH_TOKEN_LIST = telegraph_token_list.split(",") if telegraph_token_list else None + if a1: + cookie_parts.append(f"a1={a1}") + if webid: + cookie_parts.append(f"web_id={webid}") + if websession: + cookie_parts.append(f"web_session={websession}") + return "; ".join(cookie_parts) + + +def _load_zhihu_cookies(conf_dir: str) -> Optional[dict]: + zhihu_cookie_path = os.path.join(conf_dir, "zhihu_cookies.json") + if os.path.exists(zhihu_cookie_path): + try: + with open(zhihu_cookie_path, "r") as f: + return json.load(f) + except json.JSONDecodeError: + logger.error("Error: zhihu_cookies.json is not in a valid JSON format.") + return None + except FileNotFoundError: + logger.error("Error: zhihu_cookies.json does not exist.") + return None + return None + + +WEIBO_COOKIES = _load_weibo_cookies(settings.CONF_DIR, settings.WEIBO_COOKIES) +XHS_COOKIE_STRING = _load_xhs_cookies( + settings.CONF_DIR, + settings.XHS_COOKIE_PATH, + settings.XIAOHONGSHU_A1, + settings.XIAOHONGSHU_WEBID, + settings.XIAOHONGSHU_WEBSESSION, +) +XHS_SIGN_SERVER_URL = settings.SIGN_SERVER_URL +ZHIHU_COOKIES_JSON = _load_zhihu_cookies(settings.CONF_DIR) diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py index aeffdd0..62817d1 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py @@ -7,7 +7,7 @@ from openai import AsyncOpenAI from openai.types.chat import ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam -from fastfetchbot_shared.services.scrapers.config import OPENAI_API_KEY +from fastfetchbot_shared.services.scrapers.config import settings from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType from fastfetchbot_shared.services.scrapers.scraper import Scraper, DataProcessor from fastfetchbot_shared.services.scrapers.general import GeneralItem @@ -163,12 +163,12 @@ async def parsing_article_body_by_llm(html_content: str) -> str: if not html_content: return html_content - if not OPENAI_API_KEY: + if not settings.OPENAI_API_KEY: logger.warning("OPENAI_API_KEY not configured, skipping LLM parsing") return html_content try: - client = AsyncOpenAI(api_key=OPENAI_API_KEY) + client = AsyncOpenAI(api_key=settings.OPENAI_API_KEY) # Truncate content if too long to 
avoid token limits max_content_length = 50000 diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py index d27600e..c018daf 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py @@ -1,6 +1,6 @@ from typing import Optional -from fastfetchbot_shared.services.scrapers.config import FIRECRAWL_WAIT_FOR, FIRECRAWL_USE_JSON_EXTRACTION +from fastfetchbot_shared.services.scrapers.config import settings from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper from fastfetchbot_shared.services.scrapers.general.firecrawl_client import FirecrawlClient from fastfetchbot_shared.services.scrapers.general.firecrawl_schema import ( @@ -57,7 +57,7 @@ def __init__(self, url: str, use_json_extraction: Optional[bool] = None): self._use_json_extraction = ( use_json_extraction if use_json_extraction is not None - else FIRECRAWL_USE_JSON_EXTRACTION + else settings.FIRECRAWL_USE_JSON_EXTRACTION ) async def _get_page_content(self) -> None: @@ -77,7 +77,7 @@ async def _get_page_content_legacy(self) -> None: formats=["markdown", "html"], only_main_content=True, exclude_tags=FIRECRAWL_EXCLUDE_TAGS, - wait_for=FIRECRAWL_WAIT_FOR, + wait_for=settings.firecrawl_wait_for_int, ) await self._process_firecrawl_result(result) @@ -93,7 +93,7 @@ async def _get_page_content_json(self) -> None: formats=["markdown", "html", json_format], only_main_content=True, exclude_tags=FIRECRAWL_EXCLUDE_TAGS, - wait_for=FIRECRAWL_WAIT_FOR, + wait_for=settings.firecrawl_wait_for_int, ) json_data = result.get("json") diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_client.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_client.py index db4b519..3c7e8ea 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_client.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl_client.py @@ -6,7 +6,7 @@ from firecrawl import AsyncFirecrawl -from fastfetchbot_shared.services.scrapers.config import FIRECRAWL_API_URL, FIRECRAWL_API_KEY +from fastfetchbot_shared.services.scrapers.config import settings @dataclass(frozen=True) @@ -49,8 +49,8 @@ def get_instance(cls) -> "FirecrawlClient": return cls._instance config = FirecrawlSettings( - api_url=FIRECRAWL_API_URL, - api_key=FIRECRAWL_API_KEY, + api_url=settings.FIRECRAWL_API_URL, + api_key=settings.FIRECRAWL_API_KEY, ) cls._instance = cls(config) diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/scraper.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/scraper.py index 08d472b..84fbf42 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/general/scraper.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/scraper.py @@ -1,6 +1,6 @@ from typing import Optional -from fastfetchbot_shared.services.scrapers.config import GENERAL_SCRAPING_API +from fastfetchbot_shared.services.scrapers.config import settings from fastfetchbot_shared.services.scrapers.scraper import Scraper, DataProcessor from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralScraper from fastfetchbot_shared.services.scrapers.general.firecrawl import FirecrawlScraper @@ -33,7 +33,7 @@ def __init__(self, scraper_type: Optional[str] = None): Args: scraper_type: The type of 
scraper to use. If None, uses GENERAL_SCRAPING_API config. """ - self.scraper_type = scraper_type or GENERAL_SCRAPING_API + self.scraper_type = scraper_type or settings.GENERAL_SCRAPING_API self._scraper: Optional[BaseGeneralScraper] = None self._init_scraper() diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py index 234dd5f..cabf2c3 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py @@ -1,6 +1,6 @@ from zyte_api import AsyncZyteAPI -from fastfetchbot_shared.services.scrapers.config import ZYTE_API_KEY +from fastfetchbot_shared.services.scrapers.config import settings from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor, BaseGeneralScraper from fastfetchbot_shared.services.scrapers.scraper import DataProcessor from fastfetchbot_shared.utils.logger import logger @@ -16,11 +16,11 @@ def __init__(self, url: str): self.scraper_type = "zyte" async def _get_page_content(self) -> None: - if not ZYTE_API_KEY: + if not settings.ZYTE_API_KEY: raise RuntimeError("ZYTE_API_KEY is not configured") try: - client = AsyncZyteAPI(api_key=ZYTE_API_KEY) + client = AsyncZyteAPI(api_key=settings.ZYTE_API_KEY) result = await client.get( { "url": self.url, diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py index 9b4408e..52bf256 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py @@ -10,7 +10,7 @@ from fastfetchbot_shared.utils.parse import get_html_text_length from fastfetchbot_shared.utils.logger import logger from .config import API_HEADERS_LIST, ALL_SCRAPERS -from fastfetchbot_shared.services.scrapers.config import X_RAPIDAPI_KEY +from fastfetchbot_shared.services.scrapers.config import settings class Instagram(MetadataItem): @@ -55,7 +55,7 @@ async def _get_post_info(self) -> dict: self.scraper = scraper self.host = API_HEADERS_LIST[self.scraper]["host"] self.headers = { - "X-RapidAPI-Key": X_RAPIDAPI_KEY, + "X-RapidAPI-Key": settings.X_RAPIDAPI_KEY, "X-RapidAPI-Host": API_HEADERS_LIST[self.scraper]["top_domain"], "content-type": "application/octet-stream", } diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py index d44c1a9..9a907cb 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py @@ -5,13 +5,7 @@ from bs4 import BeautifulSoup from fastfetchbot_shared.models.metadata_item import MetadataItem, MessageType, MediaFile -from fastfetchbot_shared.services.scrapers.config import ( - REDDIT_CLIENT_ID, - REDDIT_CLIENT_SECRET, - REDDIT_PASSWORD, - REDDIT_USERNAME, - JINJA2_ENV, -) +from fastfetchbot_shared.services.scrapers.config import settings, JINJA2_ENV from fastfetchbot_shared.utils.parse import unix_timestamp_to_utc, get_html_text_length from fastfetchbot_shared.utils.network import get_redirect_url @@ -36,13 +30,13 @@ async def get_reddit(self) -> None: await self._process_reddit_data(reddit_data) async def _get_reddit_data(self) -> dict: - reddit_user_agent = f"testscript by u/{REDDIT_USERNAME}" + reddit_user_agent 
= f"testscript by u/{settings.REDDIT_USERNAME}" reddit = asyncpraw.Reddit( - client_id=REDDIT_CLIENT_ID, - client_secret=REDDIT_CLIENT_SECRET, - password=REDDIT_PASSWORD, + client_id=settings.REDDIT_CLIENT_ID, + client_secret=settings.REDDIT_CLIENT_SECRET, + password=settings.REDDIT_PASSWORD, user_agent=reddit_user_agent, - username=REDDIT_USERNAME, + username=settings.REDDIT_USERNAME, ) submission = await reddit.submission(url=self.url) return submission.__dict__ diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/scraper_manager.py b/packages/shared/fastfetchbot_shared/services/scrapers/scraper_manager.py index f7bbbef..b1abed5 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/scraper_manager.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/scraper_manager.py @@ -4,9 +4,7 @@ from fastfetchbot_shared.services.scrapers.bluesky.scraper import BlueskyScraper from fastfetchbot_shared.services.scrapers.weibo.scraper import WeiboScraper from fastfetchbot_shared.services.scrapers.general.scraper import GeneralScraper -from fastfetchbot_shared.services.scrapers.config import ( - BLUESKY_USERNAME, BLUESKY_PASSWORD -) +from fastfetchbot_shared.services.scrapers.config import settings class ScraperManager: @@ -46,7 +44,7 @@ async def init_scraper(cls, category: str) -> None: @classmethod async def init_bluesky_scraper(cls) -> BlueskyScraper: - cls.bluesky_scraper = BlueskyScraper(username=BLUESKY_USERNAME, password=BLUESKY_PASSWORD) + cls.bluesky_scraper = BlueskyScraper(username=settings.BLUESKY_USERNAME, password=settings.BLUESKY_PASSWORD) await cls.bluesky_scraper.init() return cls.bluesky_scraper diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py index 66d8019..9a38090 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py @@ -17,7 +17,7 @@ SCRAPER_INFO, SHORT_LIMIT, ) -from fastfetchbot_shared.services.scrapers.config import X_RAPIDAPI_KEY, TWITTER_COOKIES, DEBUG_MODE +from fastfetchbot_shared.services.scrapers.config import settings from fastfetchbot_shared.utils.logger import logger @@ -105,7 +105,7 @@ async def _api_client_get_response_tweet_data(self) -> Dict: save=False, pbar=False, debug=0, - cookies=TWITTER_COOKIES + cookies=settings.TWITTER_COOKIES ) tweet_data = await asyncio.to_thread(scraper.tweets_details, [int(self.tid)], limit=1) logger.debug(tweet_data) @@ -308,7 +308,7 @@ def parse_article_content(article: Dict) -> Tuple[str, List[MediaFile]]: def _get_request_headers(self): self.host = SCRAPER_INFO[self.scraper]["host"] self.headers = { - "X-RapidAPI-Key": X_RAPIDAPI_KEY, + "X-RapidAPI-Key": settings.X_RAPIDAPI_KEY, "X-RapidAPI-Host": SCRAPER_INFO[self.scraper]["top_domain"] + X_RAPIDAPI_HOST, "content-type": "application/octet-stream", diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/adaptar.py b/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/adaptar.py index a3f53ae..900bc52 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/adaptar.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/adaptar.py @@ -7,7 +7,7 @@ import httpx -from fastfetchbot_shared.config import SIGN_SERVER_URL +from fastfetchbot_shared.config import settings as shared_settings from fastfetchbot_shared.utils.logger import logger XHS_API_URL = 
"https://edith.xiaohongshu.com" @@ -50,11 +50,11 @@ def __init__( timeout: float = 20.0, ): self.cookies = cookies.strip() - self.sign_server_endpoint = (sign_server_endpoint or SIGN_SERVER_URL).rstrip("/") + self.sign_server_endpoint = (sign_server_endpoint or shared_settings.SIGN_SERVER_URL).rstrip("/") if not self.sign_server_endpoint: raise ValueError( "XhsSinglePostAdapter requires a sign server URL. " - "Set SIGN_SERVER_URL in the environment or pass sign_server_endpoint explicitly." + "Set shared_settings.SIGN_SERVER_URL in the environment or pass sign_server_endpoint explicitly." ) self.timeout = timeout self._http = httpx.AsyncClient(timeout=timeout, follow_redirects=True) diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py index 17032d5..96c3da5 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py @@ -18,7 +18,7 @@ from fastfetchbot_shared.utils.network import get_selector, get_redirect_url, get_response_json, get_random_user_agent, \ get_content_async, get_response from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType -from fastfetchbot_shared.services.scrapers.config import JINJA2_ENV, FXZHIHU_HOST +from fastfetchbot_shared.services.scrapers.config import settings, JINJA2_ENV from .config import ( SHORT_LIMIT, ZHIHU_COLUMNS_API_HOST, @@ -234,21 +234,21 @@ async def _get_request_url(self) -> None: if self.zhihu_type == "answer": if self.question_id: self.request_url = ( - "https://" + FXZHIHU_HOST + '/question/' + self.question_id + '/answer/' + self.answer_id + "https://" + settings.FXZHIHU_HOST + '/question/' + self.question_id + '/answer/' + self.answer_id ) return self.request_url = ( - "https://" + FXZHIHU_HOST + '/answer/' + self.answer_id + "https://" + settings.FXZHIHU_HOST + '/answer/' + self.answer_id ) return elif self.zhihu_type == "article": self.request_url = ( - "https://" + FXZHIHU_HOST + '/p/' + self.article_id + "https://" + settings.FXZHIHU_HOST + '/p/' + self.article_id ) return elif self.zhihu_type == "status": self.request_url = ( - "https://" + FXZHIHU_HOST + '/pin/' + self.status_id + "https://" + settings.FXZHIHU_HOST + '/pin/' + self.status_id ) return if self.zhihu_type == "answer": diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/config.py b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/config.py index d187797..0abb8e0 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/config.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/config.py @@ -1,4 +1,4 @@ -from fastfetchbot_shared.services.scrapers.config import ZHIHU_COOKIES_JSON, ZHIHU_Z_C0 +from fastfetchbot_shared.services.scrapers.config import settings, ZHIHU_COOKIES_JSON SHORT_LIMIT = 600 ZHIHU_COLUMNS_API_HOST = "https://zhuanlan.zhihu.com/api" @@ -13,8 +13,8 @@ """ # Cookie for direct API calls: prefer ZHIHU_Z_C0 env var, fall back to cookies JSON -if ZHIHU_Z_C0: - ZHIHU_API_COOKIE = f"z_c0={ZHIHU_Z_C0}" +if settings.ZHIHU_Z_C0: + ZHIHU_API_COOKIE = f"z_c0={settings.ZHIHU_Z_C0}" elif ZHIHU_COOKIES_JSON: ZHIHU_API_COOKIE = ';'.join(f"{cookie['name']}={cookie['value']}" for cookie in ZHIHU_COOKIES_JSON) else: diff --git a/packages/shared/fastfetchbot_shared/services/telegraph/__init__.py b/packages/shared/fastfetchbot_shared/services/telegraph/__init__.py index 
dbe429f..6921465 100644 --- a/packages/shared/fastfetchbot_shared/services/telegraph/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/telegraph/__init__.py @@ -8,7 +8,7 @@ ) from html_telegraph_poster_v2.async_poster.utils import DocumentPreprocessor -from fastfetchbot_shared.services.scrapers.config import TELEGRAPH_TOKEN_LIST +from fastfetchbot_shared.services.scrapers.config import settings from fastfetchbot_shared.models.telegraph_item import TelegraphItem, from_str from fastfetchbot_shared.utils.logger import logger @@ -50,12 +50,12 @@ async def get_telegraph(self, upload_images: bool = True) -> str: await temp_html.upload_all_images() self.content = temp_html.get_processed_html() logger.info("Telegraph: Uploading to telegraph...") - if not TELEGRAPH_TOKEN_LIST: + if not settings.telegraph_token_list: await self.telegraph.create_api_token( short_name=self.author[0:14], author_name=self.author ) else: - random_token = random.choice(TELEGRAPH_TOKEN_LIST) + random_token = random.choice(settings.telegraph_token_list) await self.telegraph.set_token(random_token) telegraph_post = await self.telegraph.post( diff --git a/packages/shared/fastfetchbot_shared/utils/logger.py b/packages/shared/fastfetchbot_shared/utils/logger.py index 2340539..216cb52 100644 --- a/packages/shared/fastfetchbot_shared/utils/logger.py +++ b/packages/shared/fastfetchbot_shared/utils/logger.py @@ -2,16 +2,16 @@ from loguru import logger -from fastfetchbot_shared.config import LOG_LEVEL, LOG_FILE_PATH +from fastfetchbot_shared.config import settings -log_path = os.path.join(LOG_FILE_PATH, "app.log") +log_path = os.path.join(settings.LOG_FILE_PATH, "app.log") logger.add( log_path, - level=LOG_LEVEL, + level=settings.LOG_LEVEL, rotation="1 week", retention="10 days", compression="zip", ) -logger.debug(f"Logger initialized with level: {LOG_LEVEL}") +logger.debug(f"Logger initialized with level: {settings.LOG_LEVEL}") logger.debug(f"Logger initialized with log file path: {log_path}") diff --git a/packages/shared/fastfetchbot_shared/utils/network.py b/packages/shared/fastfetchbot_shared/utils/network.py index d21d616..e8b5e48 100644 --- a/packages/shared/fastfetchbot_shared/utils/network.py +++ b/packages/shared/fastfetchbot_shared/utils/network.py @@ -12,7 +12,7 @@ from playwright.async_api import async_playwright from fastfetchbot_shared.models.classes import NamedBytesIO -from fastfetchbot_shared.config import HTTP_REQUEST_TIMEOUT, DOWNLOAD_DIR +from fastfetchbot_shared.config import settings from fastfetchbot_shared.utils.image import check_image_type from fastfetchbot_shared.utils.logger import logger @@ -24,13 +24,13 @@ async def get_response( headers = HEADERS if client: resp = await client.get( - url, headers=headers, params=params, timeout=HTTP_REQUEST_TIMEOUT + url, headers=headers, params=params, timeout=settings.HTTP_REQUEST_TIMEOUT ) return resp else: async with httpx.AsyncClient() as client: resp = await client.get( - url, headers=headers, params=params, timeout=HTTP_REQUEST_TIMEOUT + url, headers=headers, params=params, timeout=settings.HTTP_REQUEST_TIMEOUT ) return resp @@ -62,7 +62,7 @@ async def get_selector( url, headers=headers, follow_redirects=follow_redirects, - timeout=HTTP_REQUEST_TIMEOUT, + timeout=settings.HTTP_REQUEST_TIMEOUT, ) if ( resp.history @@ -85,7 +85,7 @@ async def get_redirect_url(url: str, headers: Optional[dict] = None) -> str: if not headers: headers = HEADERS async with httpx.AsyncClient() as client: - resp = await client.get(url, headers=headers, 
timeout=HTTP_REQUEST_TIMEOUT) + resp = await client.get(url, headers=headers, timeout=settings.HTTP_REQUEST_TIMEOUT) if resp.status_code == 302 or resp.status_code == 301: return resp.headers["Location"] else: @@ -149,7 +149,7 @@ async def download_file_by_metadata_item( headers["Accept"] = "image/avif,image/webp,*/*" async with httpx.AsyncClient() as client: response = await client.get( - url=url, headers=headers, timeout=HTTP_REQUEST_TIMEOUT + url=url, headers=headers, timeout=settings.HTTP_REQUEST_TIMEOUT ) # if redirect 302, get the final url if response.status_code == 302 or response.status_code == 301: @@ -168,7 +168,7 @@ async def download_file_by_metadata_item( async def download_file_to_local( url: str, file_path: str = None, - dir_path: str = DOWNLOAD_DIR, + dir_path: str = settings.DOWNLOAD_DIR, file_name: str = "", headers: dict = None, referer: str = None, diff --git a/packages/shared/fastfetchbot_shared/utils/parse.py b/packages/shared/fastfetchbot_shared/utils/parse.py index 61527af..98a5d4a 100644 --- a/packages/shared/fastfetchbot_shared/utils/parse.py +++ b/packages/shared/fastfetchbot_shared/utils/parse.py @@ -216,9 +216,3 @@ def get_bool(value: Optional[str], default: bool = True) -> bool: return False else: return default - - -def get_env_bool(env, var_name: Optional[str], default: bool = False): - """Retrieve environment variable as a boolean.""" - value = env.get(var_name, "").lower() - return get_bool(value, default) diff --git a/packages/shared/fastfetchbot_shared/utils/pydantic_types.py b/packages/shared/fastfetchbot_shared/utils/pydantic_types.py new file mode 100644 index 0000000..696d9ed --- /dev/null +++ b/packages/shared/fastfetchbot_shared/utils/pydantic_types.py @@ -0,0 +1,26 @@ +from typing import Annotated, Optional + +from pydantic import BeforeValidator + + +def _parse_comma_list(v: str | list[str]) -> list[str]: + """Parse a comma-separated string into a list of stripped, non-empty strings.""" + if isinstance(v, list): + return v + return [x.strip() for x in v.split(",") if x.strip()] if v else [] + + +def _parse_optional_comma_list(v: str | list[str] | None) -> Optional[list[str]]: + """Parse a comma-separated string into a list, returning None for empty input.""" + if v is None: + return None + if isinstance(v, list): + return v or None + result = [x.strip() for x in v.split(",") if x.strip()] + return result or None + + +CommaSeparatedList = Annotated[list[str], BeforeValidator(_parse_comma_list)] +OptionalCommaSeparatedList = Annotated[ + Optional[list[str]], BeforeValidator(_parse_optional_comma_list) +] diff --git a/packages/shared/pyproject.toml b/packages/shared/pyproject.toml index cc081f4..2ab8616 100644 --- a/packages/shared/pyproject.toml +++ b/packages/shared/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "httpx>=0.28.1", "lxml>=5.4.0", "pydantic>=2.0.0", + "pydantic-settings>=2.0.0", "pillow>=10.0.0", "python-magic>=0.4.27", "aiofiles>=24.1.0", diff --git a/tests/unit/async_worker/test_enrichment.py b/tests/unit/async_worker/test_enrichment.py index 376f9c0..50d7c24 100644 --- a/tests/unit/async_worker/test_enrichment.py +++ b/tests/unit/async_worker/test_enrichment.py @@ -193,8 +193,9 @@ class TestConfigDefaults: @pytest.mark.asyncio async def test_uses_config_defaults_when_none(self, base_metadata_item, mock_telegraph): """When store_telegraph/store_document are None, config defaults should be used.""" - with patch("async_worker.services.enrichment.STORE_TELEGRAPH", True), \ - patch("async_worker.services.enrichment.STORE_DOCUMENT", 
False): + from async_worker.config import settings as aw_settings + with patch.object(aw_settings, "STORE_TELEGRAPH", True), \ + patch.object(aw_settings, "STORE_DOCUMENT", False): result = await enrich(base_metadata_item) # STORE_TELEGRAPH=True means Telegraph should be called diff --git a/tests/unit/scrapers/test_general_base.py b/tests/unit/scrapers/test_general_base.py index e7d824a..aaf6e44 100644 --- a/tests/unit/scrapers/test_general_base.py +++ b/tests/unit/scrapers/test_general_base.py @@ -388,14 +388,14 @@ async def test_none_input(self): assert result is None @pytest.mark.asyncio - @patch("fastfetchbot_shared.services.scrapers.general.base.OPENAI_API_KEY", None) + @patch("fastfetchbot_shared.services.scrapers.config.settings.OPENAI_API_KEY", None) async def test_no_api_key(self): from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor result = await BaseGeneralDataProcessor.parsing_article_body_by_llm("
<p>html</p>") assert result == "<p>html</p>
" @pytest.mark.asyncio - @patch("fastfetchbot_shared.services.scrapers.general.base.OPENAI_API_KEY", "sk-test") + @patch("fastfetchbot_shared.services.scrapers.config.settings.OPENAI_API_KEY", "sk-test") @patch("fastfetchbot_shared.services.scrapers.general.base.AsyncOpenAI") async def test_success(self, mock_openai_cls): from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor @@ -410,7 +410,7 @@ async def test_success(self, mock_openai_cls): mock_openai_cls.assert_called_once_with(api_key="sk-test") @pytest.mark.asyncio - @patch("fastfetchbot_shared.services.scrapers.general.base.OPENAI_API_KEY", "sk-test") + @patch("fastfetchbot_shared.services.scrapers.config.settings.OPENAI_API_KEY", "sk-test") @patch("fastfetchbot_shared.services.scrapers.general.base.AsyncOpenAI") async def test_empty_response(self, mock_openai_cls): from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor @@ -424,7 +424,7 @@ async def test_empty_response(self, mock_openai_cls): assert result == "
<p>raw</p>
" @pytest.mark.asyncio - @patch("fastfetchbot_shared.services.scrapers.general.base.OPENAI_API_KEY", "sk-test") + @patch("fastfetchbot_shared.services.scrapers.config.settings.OPENAI_API_KEY", "sk-test") @patch("fastfetchbot_shared.services.scrapers.general.base.AsyncOpenAI") async def test_exception(self, mock_openai_cls): from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor @@ -436,7 +436,7 @@ async def test_exception(self, mock_openai_cls): assert result == "
<p>raw</p>
" @pytest.mark.asyncio - @patch("fastfetchbot_shared.services.scrapers.general.base.OPENAI_API_KEY", "sk-test") + @patch("fastfetchbot_shared.services.scrapers.config.settings.OPENAI_API_KEY", "sk-test") @patch("fastfetchbot_shared.services.scrapers.general.base.AsyncOpenAI") async def test_truncates_long_content(self, mock_openai_cls): from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor @@ -456,7 +456,7 @@ async def test_truncates_long_content(self, mock_openai_cls): assert len(user_msg) < 60000 + 200 @pytest.mark.asyncio - @patch("fastfetchbot_shared.services.scrapers.general.base.OPENAI_API_KEY", "sk-test") + @patch("fastfetchbot_shared.services.scrapers.config.settings.OPENAI_API_KEY", "sk-test") @patch("fastfetchbot_shared.services.scrapers.general.base.AsyncOpenAI") async def test_short_content_not_truncated(self, mock_openai_cls): from fastfetchbot_shared.services.scrapers.general.base import BaseGeneralDataProcessor diff --git a/tests/unit/scrapers/test_general_firecrawl.py b/tests/unit/scrapers/test_general_firecrawl.py index a619bfa..d095519 100644 --- a/tests/unit/scrapers/test_general_firecrawl.py +++ b/tests/unit/scrapers/test_general_firecrawl.py @@ -42,11 +42,11 @@ def teardown_method(self): @patch("fastfetchbot_shared.services.scrapers.general.firecrawl_client.AsyncFirecrawl") @patch( - "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_URL", + "fastfetchbot_shared.services.scrapers.config.settings.FIRECRAWL_API_URL", "https://fc.example.com", ) @patch( - "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_KEY", + "fastfetchbot_shared.services.scrapers.config.settings.FIRECRAWL_API_KEY", "test-key", ) def test_get_instance_creates_singleton(self, mock_fc_cls): @@ -59,11 +59,11 @@ def test_get_instance_creates_singleton(self, mock_fc_cls): @patch("fastfetchbot_shared.services.scrapers.general.firecrawl_client.AsyncFirecrawl") @patch( - "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_URL", + "fastfetchbot_shared.services.scrapers.config.settings.FIRECRAWL_API_URL", "https://fc.example.com", ) @patch( - "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_KEY", + "fastfetchbot_shared.services.scrapers.config.settings.FIRECRAWL_API_KEY", "test-key", ) def test_reset_instance(self, mock_fc_cls): @@ -75,11 +75,11 @@ def test_reset_instance(self, mock_fc_cls): @patch("fastfetchbot_shared.services.scrapers.general.firecrawl_client.AsyncFirecrawl") @patch( - "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_URL", + "fastfetchbot_shared.services.scrapers.config.settings.FIRECRAWL_API_URL", "https://fc.example.com", ) @patch( - "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_KEY", + "fastfetchbot_shared.services.scrapers.config.settings.FIRECRAWL_API_KEY", "test-key", ) def test_double_check_locking_inner_branch(self, mock_fc_cls): @@ -125,11 +125,11 @@ def teardown_method(self): @pytest.mark.asyncio @patch("fastfetchbot_shared.services.scrapers.general.firecrawl_client.AsyncFirecrawl") @patch( - "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_URL", + "fastfetchbot_shared.services.scrapers.config.settings.FIRECRAWL_API_URL", "https://fc.example.com", ) @patch( - "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_KEY", + "fastfetchbot_shared.services.scrapers.config.settings.FIRECRAWL_API_KEY", "k", ) async def 
test_scrape_url_success(self, mock_fc_cls): @@ -153,11 +153,11 @@ async def test_scrape_url_success(self, mock_fc_cls): @pytest.mark.asyncio @patch("fastfetchbot_shared.services.scrapers.general.firecrawl_client.AsyncFirecrawl") @patch( - "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_URL", + "fastfetchbot_shared.services.scrapers.config.settings.FIRECRAWL_API_URL", "https://fc.example.com", ) @patch( - "fastfetchbot_shared.services.scrapers.general.firecrawl_client.FIRECRAWL_API_KEY", + "fastfetchbot_shared.services.scrapers.config.settings.FIRECRAWL_API_KEY", "k", ) async def test_scrape_url_exception(self, mock_fc_cls): diff --git a/tests/unit/scrapers/test_general_scraper.py b/tests/unit/scrapers/test_general_scraper.py index dc082df..75a96cf 100644 --- a/tests/unit/scrapers/test_general_scraper.py +++ b/tests/unit/scrapers/test_general_scraper.py @@ -30,7 +30,7 @@ def test_default_registry_has_firecrawl_and_zyte(self): class TestGeneralScraperInit: @patch( - "fastfetchbot_shared.services.scrapers.general.scraper.GENERAL_SCRAPING_API", + "fastfetchbot_shared.services.scrapers.config.settings.GENERAL_SCRAPING_API", "FIRECRAWL", ) @patch( diff --git a/tests/unit/scrapers/test_general_zyte.py b/tests/unit/scrapers/test_general_zyte.py index 8df0e95..8695986 100644 --- a/tests/unit/scrapers/test_general_zyte.py +++ b/tests/unit/scrapers/test_general_zyte.py @@ -31,7 +31,7 @@ def test_init(self): class TestZyteGetPageContent: @pytest.mark.asyncio @patch( - "fastfetchbot_shared.services.scrapers.general.zyte.ZYTE_API_KEY", + "fastfetchbot_shared.services.scrapers.config.settings.ZYTE_API_KEY", None, ) async def test_no_api_key_raises(self): @@ -40,7 +40,7 @@ async def test_no_api_key_raises(self): await proc._get_page_content() @pytest.mark.asyncio - @patch("fastfetchbot_shared.services.scrapers.general.zyte.ZYTE_API_KEY", "zyte-key") + @patch("fastfetchbot_shared.services.scrapers.config.settings.ZYTE_API_KEY", "zyte-key") @patch("fastfetchbot_shared.services.scrapers.general.zyte.AsyncZyteAPI") async def test_success(self, mock_zyte_cls): mock_client = AsyncMock() @@ -69,7 +69,7 @@ async def test_success(self, mock_zyte_cls): assert kw["og_image"] == "https://img.com/pic.jpg" @pytest.mark.asyncio - @patch("fastfetchbot_shared.services.scrapers.general.zyte.ZYTE_API_KEY", "zyte-key") + @patch("fastfetchbot_shared.services.scrapers.config.settings.ZYTE_API_KEY", "zyte-key") @patch("fastfetchbot_shared.services.scrapers.general.zyte.AsyncZyteAPI") async def test_exception_propagates(self, mock_zyte_cls): mock_client = AsyncMock() diff --git a/tests/unit/scrapers/test_scraper_config.py b/tests/unit/scrapers/test_scraper_config.py index 42eddc4..5e23219 100644 --- a/tests/unit/scrapers/test_scraper_config.py +++ b/tests/unit/scrapers/test_scraper_config.py @@ -28,6 +28,10 @@ def _reload_config(env_overrides=None, path_exists_side_effect=None, if env_overrides: env.update(env_overrides) + # XHS_COOKIE_PATH is now an env var read by ScrapersSettings + if xhs_cookie_path_override is not None: + env["XHS_COOKIE_PATH"] = xhs_cookie_path_override + patches = [] # Patch os.environ @@ -55,17 +59,13 @@ def _reload_config(env_overrides=None, path_exists_side_effect=None, ) patches.append(p_cookies) - # Patch _XHS_COOKIE_PATH from shared config - xhs_path_val = xhs_cookie_path_override if xhs_cookie_path_override is not None else "" - p_xhs = patch("fastfetchbot_shared.config.XHS_COOKIE_PATH", xhs_path_val) - patches.append(p_xhs) - for p in patches: p.start() + mod_name = 
"fastfetchbot_shared.services.scrapers.config" + original_module = sys.modules.get(mod_name) try: # Remove cached module so reload actually re-executes - mod_name = "fastfetchbot_shared.services.scrapers.config" if mod_name in sys.modules: del sys.modules[mod_name] import fastfetchbot_shared.services.scrapers.config as cfg @@ -73,6 +73,11 @@ def _reload_config(env_overrides=None, path_exists_side_effect=None, finally: for p in patches: p.stop() + # Restore the original module to avoid polluting other tests + if mod_name in sys.modules: + del sys.modules[mod_name] + if original_module is not None: + sys.modules[mod_name] = original_module # --------------------------------------------------------------------------- @@ -84,52 +89,52 @@ def test_filesystem_defaults(self): cfg = _reload_config( path_exists_side_effect=lambda p: False, ) - assert cfg.TEMP_DIR == tempfile.gettempdir() - assert cfg.WORK_DIR == os.getcwd() - assert cfg.DOWNLOAD_DIR == os.path.join(os.getcwd(), "download") - assert cfg.DEBUG_MODE is False - assert cfg.CONF_DIR == os.path.join(os.getcwd(), "conf") + assert cfg.settings.TEMP_DIR == tempfile.gettempdir() + assert cfg.settings.WORK_DIR == os.getcwd() + assert cfg.settings.DOWNLOAD_DIR == os.path.join(os.getcwd(), "download") + assert cfg.settings.DEBUG_MODE is False + assert cfg.settings.CONF_DIR == os.path.join(os.getcwd(), "conf") def test_template_defaults(self): cfg = _reload_config( path_exists_side_effect=lambda p: False, ) - assert cfg.TEMPLATE_LANGUAGE == "zh_CN" + assert cfg.settings.TEMPLATE_LANGUAGE == "zh_CN" assert cfg.JINJA2_ENV is not None def test_platform_defaults_are_none(self): cfg = _reload_config( path_exists_side_effect=lambda p: False, ) - assert cfg.X_RAPIDAPI_KEY is None - assert cfg.TWITTER_EMAIL is None - assert cfg.TWITTER_PASSWORD is None - assert cfg.TWITTER_USERNAME is None - assert cfg.TWITTER_CT0 is None - assert cfg.TWITTER_AUTH_TOKEN is None - assert cfg.TWITTER_COOKIES == {"ct0": None, "auth_token": None} - assert cfg.BLUESKY_USERNAME is None - assert cfg.BLUESKY_PASSWORD is None - assert cfg.XIAOHONGSHU_A1 is None - assert cfg.XIAOHONGSHU_WEBID is None - assert cfg.XIAOHONGSHU_WEBSESSION is None - assert cfg.XIAOHONGSHU_COOKIES == {"a1": None, "web_id": None, "web_session": None} - assert cfg.REDDIT_CLIENT_ID is None - assert cfg.REDDIT_CLIENT_SECRET is None - assert cfg.REDDIT_PASSWORD is None - assert cfg.REDDIT_USERNAME is None - assert cfg.OPENAI_API_KEY is None - assert cfg.ZYTE_API_KEY is None - assert cfg.ZHIHU_Z_C0 is None + assert cfg.settings.X_RAPIDAPI_KEY is None + assert cfg.settings.TWITTER_EMAIL is None + assert cfg.settings.TWITTER_PASSWORD is None + assert cfg.settings.TWITTER_USERNAME is None + assert cfg.settings.TWITTER_CT0 is None + assert cfg.settings.TWITTER_AUTH_TOKEN is None + assert cfg.settings.TWITTER_COOKIES == {"ct0": None, "auth_token": None} + assert cfg.settings.BLUESKY_USERNAME is None + assert cfg.settings.BLUESKY_PASSWORD is None + assert cfg.settings.XIAOHONGSHU_A1 is None + assert cfg.settings.XIAOHONGSHU_WEBID is None + assert cfg.settings.XIAOHONGSHU_WEBSESSION is None + assert cfg.settings.XIAOHONGSHU_COOKIES == {"a1": None, "web_id": None, "web_session": None} + assert cfg.settings.REDDIT_CLIENT_ID is None + assert cfg.settings.REDDIT_CLIENT_SECRET is None + assert cfg.settings.REDDIT_PASSWORD is None + assert cfg.settings.REDDIT_USERNAME is None + assert cfg.settings.OPENAI_API_KEY is None + assert cfg.settings.ZYTE_API_KEY is None + assert cfg.settings.ZHIHU_Z_C0 is None def 
test_xhs_defaults(self): cfg = _reload_config( path_exists_side_effect=lambda p: False, ) - assert cfg.XHS_PHONE_LIST == [""] - assert cfg.XHS_IP_PROXY_LIST == [""] - assert cfg.XHS_ENABLE_IP_PROXY is False - assert cfg.XHS_SAVE_LOGIN_STATE is True + assert cfg.settings.xhs_phone_list == [] + assert cfg.settings.xhs_ip_proxy_list == [] + assert cfg.settings.XHS_ENABLE_IP_PROXY is False + assert cfg.settings.XHS_SAVE_LOGIN_STATE is True def test_weibo_cookies_default_from_env(self): cfg = _reload_config( @@ -142,24 +147,24 @@ def test_zhihu_cookies_default(self): path_exists_side_effect=lambda p: False, ) assert cfg.ZHIHU_COOKIES_JSON is None - assert cfg.FXZHIHU_HOST == "fxzhihu.com" + assert cfg.settings.FXZHIHU_HOST == "fxzhihu.com" def test_general_scraping_defaults(self): cfg = _reload_config( path_exists_side_effect=lambda p: False, ) - assert cfg.GENERAL_SCRAPING_ON is False - assert cfg.GENERAL_SCRAPING_API == "FIRECRAWL" - assert cfg.FIRECRAWL_API_URL == "" - assert cfg.FIRECRAWL_API_KEY == "" - assert cfg.FIRECRAWL_WAIT_FOR == 3000 - assert cfg.FIRECRAWL_USE_JSON_EXTRACTION is False + assert cfg.settings.GENERAL_SCRAPING_ON is False + assert cfg.settings.GENERAL_SCRAPING_API == "FIRECRAWL" + assert cfg.settings.FIRECRAWL_API_URL == "" + assert cfg.settings.FIRECRAWL_API_KEY == "" + assert cfg.settings.firecrawl_wait_for_int == 3000 + assert cfg.settings.FIRECRAWL_USE_JSON_EXTRACTION is False def test_telegraph_default_empty(self): cfg = _reload_config( path_exists_side_effect=lambda p: False, ) - assert cfg.TELEGRAPH_TOKEN_LIST is None + assert cfg.settings.telegraph_token_list is None def test_xhs_cookie_string_empty_when_no_file_no_env(self): cfg = _reload_config( @@ -184,11 +189,11 @@ def test_custom_filesystem_vars(self): }, path_exists_side_effect=lambda p: False, ) - assert cfg.TEMP_DIR == "/tmp/custom" - assert cfg.WORK_DIR == "/work" - assert cfg.DOWNLOAD_DIR == "/work/dl" - assert cfg.DEBUG_MODE is True - assert cfg.CONF_DIR == "/etc/myconf" + assert cfg.settings.TEMP_DIR == "/tmp/custom" + assert cfg.settings.WORK_DIR == "/work" + assert cfg.settings.DOWNLOAD_DIR == "/work/dl" + assert cfg.settings.DEBUG_MODE is True + assert cfg.settings.CONF_DIR == "/etc/myconf" def test_custom_twitter_vars(self): cfg = _reload_config( @@ -201,12 +206,12 @@ def test_custom_twitter_vars(self): }, path_exists_side_effect=lambda p: False, ) - assert cfg.TWITTER_EMAIL == "test@example.com" - assert cfg.TWITTER_PASSWORD == "pass123" - assert cfg.TWITTER_USERNAME == "tuser" - assert cfg.TWITTER_CT0 == "ct0val" - assert cfg.TWITTER_AUTH_TOKEN == "authval" - assert cfg.TWITTER_COOKIES == {"ct0": "ct0val", "auth_token": "authval"} + assert cfg.settings.TWITTER_EMAIL == "test@example.com" + assert cfg.settings.TWITTER_PASSWORD == "pass123" + assert cfg.settings.TWITTER_USERNAME == "tuser" + assert cfg.settings.TWITTER_CT0 == "ct0val" + assert cfg.settings.TWITTER_AUTH_TOKEN == "authval" + assert cfg.settings.TWITTER_COOKIES == {"ct0": "ct0val", "auth_token": "authval"} def test_custom_bluesky_vars(self): cfg = _reload_config( @@ -216,8 +221,8 @@ def test_custom_bluesky_vars(self): }, path_exists_side_effect=lambda p: False, ) - assert cfg.BLUESKY_USERNAME == "buser" - assert cfg.BLUESKY_PASSWORD == "bpass" + assert cfg.settings.BLUESKY_USERNAME == "buser" + assert cfg.settings.BLUESKY_PASSWORD == "bpass" def test_custom_xhs_phone_and_proxy(self): cfg = _reload_config( @@ -229,17 +234,17 @@ def test_custom_xhs_phone_and_proxy(self): }, path_exists_side_effect=lambda p: False, ) - assert 
cfg.XHS_PHONE_LIST == ["111", "222", "333"] - assert cfg.XHS_IP_PROXY_LIST == ["p1", "p2"] - assert cfg.XHS_ENABLE_IP_PROXY is True - assert cfg.XHS_SAVE_LOGIN_STATE is False + assert cfg.settings.xhs_phone_list == ["111", "222", "333"] + assert cfg.settings.xhs_ip_proxy_list == ["p1", "p2"] + assert cfg.settings.XHS_ENABLE_IP_PROXY is True + assert cfg.settings.XHS_SAVE_LOGIN_STATE is False def test_custom_template_language(self): cfg = _reload_config( env_overrides={"TEMPLATE_LANGUAGE": "en_US"}, path_exists_side_effect=lambda p: False, ) - assert cfg.TEMPLATE_LANGUAGE == "en_US" + assert cfg.settings.TEMPLATE_LANGUAGE == "en_US" def test_custom_reddit_vars(self): cfg = _reload_config( @@ -251,10 +256,10 @@ def test_custom_reddit_vars(self): }, path_exists_side_effect=lambda p: False, ) - assert cfg.REDDIT_CLIENT_ID == "rcid" - assert cfg.REDDIT_CLIENT_SECRET == "rsec" - assert cfg.REDDIT_PASSWORD == "rpass" - assert cfg.REDDIT_USERNAME == "ruser" + assert cfg.settings.REDDIT_CLIENT_ID == "rcid" + assert cfg.settings.REDDIT_CLIENT_SECRET == "rsec" + assert cfg.settings.REDDIT_PASSWORD == "rpass" + assert cfg.settings.REDDIT_USERNAME == "ruser" def test_custom_general_scraping_vars(self): cfg = _reload_config( @@ -269,27 +274,27 @@ def test_custom_general_scraping_vars(self): }, path_exists_side_effect=lambda p: False, ) - assert cfg.GENERAL_SCRAPING_ON is True - assert cfg.GENERAL_SCRAPING_API == "ZYTE" - assert cfg.FIRECRAWL_API_URL == "https://fc.example.com" - assert cfg.FIRECRAWL_API_KEY == "fc-key" - assert cfg.FIRECRAWL_WAIT_FOR == 5000 - assert cfg.FIRECRAWL_USE_JSON_EXTRACTION is True - assert cfg.ZYTE_API_KEY == "zyte-key" + assert cfg.settings.GENERAL_SCRAPING_ON is True + assert cfg.settings.GENERAL_SCRAPING_API == "ZYTE" + assert cfg.settings.FIRECRAWL_API_URL == "https://fc.example.com" + assert cfg.settings.FIRECRAWL_API_KEY == "fc-key" + assert cfg.settings.firecrawl_wait_for_int == 5000 + assert cfg.settings.FIRECRAWL_USE_JSON_EXTRACTION is True + assert cfg.settings.ZYTE_API_KEY == "zyte-key" def test_custom_openai_key(self): cfg = _reload_config( env_overrides={"OPENAI_API_KEY": "sk-test"}, path_exists_side_effect=lambda p: False, ) - assert cfg.OPENAI_API_KEY == "sk-test" + assert cfg.settings.OPENAI_API_KEY == "sk-test" def test_custom_x_rapidapi_key(self): cfg = _reload_config( env_overrides={"X_RAPIDAPI_KEY": "rapid-key"}, path_exists_side_effect=lambda p: False, ) - assert cfg.X_RAPIDAPI_KEY == "rapid-key" + assert cfg.settings.X_RAPIDAPI_KEY == "rapid-key" def test_custom_weibo_cookies_from_env(self): cfg = _reload_config( @@ -306,8 +311,8 @@ def test_custom_zhihu_vars(self): }, path_exists_side_effect=lambda p: False, ) - assert cfg.FXZHIHU_HOST == "custom.zhihu.com" - assert cfg.ZHIHU_Z_C0 == "z_c0_val" + assert cfg.settings.FXZHIHU_HOST == "custom.zhihu.com" + assert cfg.settings.ZHIHU_Z_C0 == "z_c0_val" # --------------------------------------------------------------------------- @@ -392,13 +397,13 @@ def exists_side_effect(path): assert cfg.XHS_COOKIE_STRING == "" def test_xhs_cookie_default_path_when_no_override(self): - """When XHS_COOKIE_PATH is empty, uses CONF_DIR/xhs_cookies.txt.""" + """When XHS_COOKIE_PATH is empty, _load_xhs_cookies uses CONF_DIR/xhs_cookies.txt.""" cfg = _reload_config( path_exists_side_effect=lambda p: False, xhs_cookie_path_override="", ) - expected = os.path.join(cfg.CONF_DIR, "xhs_cookies.txt") - assert cfg.xhs_cookie_path == expected + # The settings field stores the raw env value (empty string) + assert 
cfg.settings.XHS_COOKIE_PATH == "" # --------------------------------------------------------------------------- @@ -453,35 +458,36 @@ def test_zhihu_cookies_no_file(self): # --------------------------------------------------------------------------- -# FIRECRAWL_WAIT_FOR invalid value +# FIRECRAWL_WAIT_FOR (stored as str, parsed via firecrawl_wait_for_int) # --------------------------------------------------------------------------- class TestFirecrawlWaitFor: def test_firecrawl_wait_for_invalid_fallback(self): + """Non-numeric string should fall back to 3000 via firecrawl_wait_for_int.""" cfg = _reload_config( env_overrides={"FIRECRAWL_WAIT_FOR": "not_a_number"}, path_exists_side_effect=lambda p: False, ) - assert cfg.FIRECRAWL_WAIT_FOR == 3000 + assert cfg.settings.firecrawl_wait_for_int == 3000 def test_firecrawl_wait_for_valid(self): cfg = _reload_config( env_overrides={"FIRECRAWL_WAIT_FOR": "7000"}, path_exists_side_effect=lambda p: False, ) - assert cfg.FIRECRAWL_WAIT_FOR == 7000 + assert cfg.settings.firecrawl_wait_for_int == 7000 def test_firecrawl_wait_for_empty_string(self): - """Empty string should use default 3000 via `or 3000`.""" + """Empty string should fall back to 3000 via firecrawl_wait_for_int.""" cfg = _reload_config( env_overrides={"FIRECRAWL_WAIT_FOR": ""}, path_exists_side_effect=lambda p: False, ) - assert cfg.FIRECRAWL_WAIT_FOR == 3000 + assert cfg.settings.firecrawl_wait_for_int == 3000 # --------------------------------------------------------------------------- -# TELEGRAPH_TOKEN_LIST +# TELEGRAPH_TOKEN_LIST (stored as str, parsed via telegraph_token_list) # --------------------------------------------------------------------------- class TestTelegraphTokenList: @@ -490,18 +496,18 @@ def test_telegraph_empty_string(self): env_overrides={"TELEGRAPH_TOKEN_LIST": ""}, path_exists_side_effect=lambda p: False, ) - assert cfg.TELEGRAPH_TOKEN_LIST is None + assert cfg.settings.telegraph_token_list is None def test_telegraph_comma_separated(self): cfg = _reload_config( env_overrides={"TELEGRAPH_TOKEN_LIST": "tok1,tok2,tok3"}, path_exists_side_effect=lambda p: False, ) - assert cfg.TELEGRAPH_TOKEN_LIST == ["tok1", "tok2", "tok3"] + assert cfg.settings.telegraph_token_list == ["tok1", "tok2", "tok3"] def test_telegraph_single_token(self): cfg = _reload_config( env_overrides={"TELEGRAPH_TOKEN_LIST": "single_tok"}, path_exists_side_effect=lambda p: False, ) - assert cfg.TELEGRAPH_TOKEN_LIST == ["single_tok"] + assert cfg.settings.telegraph_token_list == ["single_tok"] diff --git a/tests/unit/scrapers/test_scraper_manager.py b/tests/unit/scrapers/test_scraper_manager.py index d015b97..24b0cc2 100644 --- a/tests/unit/scrapers/test_scraper_manager.py +++ b/tests/unit/scrapers/test_scraper_manager.py @@ -143,10 +143,10 @@ async def test_creates_and_inits_bluesky_scraper(self): "fastfetchbot_shared.services.scrapers.scraper_manager.BlueskyScraper", return_value=mock_instance, ) as MockCls, patch( - "fastfetchbot_shared.services.scrapers.scraper_manager.BLUESKY_USERNAME", + "fastfetchbot_shared.services.scrapers.config.settings.BLUESKY_USERNAME", "testuser", ), patch( - "fastfetchbot_shared.services.scrapers.scraper_manager.BLUESKY_PASSWORD", + "fastfetchbot_shared.services.scrapers.config.settings.BLUESKY_PASSWORD", "testpass", ): result = await ScraperManager.init_bluesky_scraper() diff --git a/tests/unit/scrapers/test_xiaohongshu.py b/tests/unit/scrapers/test_xiaohongshu.py index 87968df..eca8e50 100644 --- a/tests/unit/scrapers/test_xiaohongshu.py +++ 
diff --git a/tests/unit/scrapers/test_scraper_manager.py b/tests/unit/scrapers/test_scraper_manager.py
index d015b97..24b0cc2 100644
--- a/tests/unit/scrapers/test_scraper_manager.py
+++ b/tests/unit/scrapers/test_scraper_manager.py
@@ -143,10 +143,10 @@ async def test_creates_and_inits_bluesky_scraper(self):
             "fastfetchbot_shared.services.scrapers.scraper_manager.BlueskyScraper",
             return_value=mock_instance,
         ) as MockCls, patch(
-            "fastfetchbot_shared.services.scrapers.scraper_manager.BLUESKY_USERNAME",
+            "fastfetchbot_shared.services.scrapers.config.settings.BLUESKY_USERNAME",
             "testuser",
         ), patch(
-            "fastfetchbot_shared.services.scrapers.scraper_manager.BLUESKY_PASSWORD",
+            "fastfetchbot_shared.services.scrapers.config.settings.BLUESKY_PASSWORD",
             "testpass",
         ):
             result = await ScraperManager.init_bluesky_scraper()
diff --git a/tests/unit/scrapers/test_xiaohongshu.py b/tests/unit/scrapers/test_xiaohongshu.py
index 87968df..eca8e50 100644
--- a/tests/unit/scrapers/test_xiaohongshu.py
+++ b/tests/unit/scrapers/test_xiaohongshu.py
@@ -99,12 +99,12 @@ def test_strips_trailing_slash(self):
         )
         assert adapter.sign_server_endpoint == "http://sign:8989"
 
-    @patch("fastfetchbot_shared.services.scrapers.xiaohongshu.adaptar.SIGN_SERVER_URL", "")
+    @patch("fastfetchbot_shared.config.settings.SIGN_SERVER_URL", "")
     def test_no_sign_server_raises(self):
         with pytest.raises(ValueError, match="sign server URL"):
             XhsSinglePostAdapter(cookies="c=1", sign_server_endpoint="")
 
-    @patch("fastfetchbot_shared.services.scrapers.xiaohongshu.adaptar.SIGN_SERVER_URL", "http://fallback:8989")
+    @patch("fastfetchbot_shared.config.settings.SIGN_SERVER_URL", "http://fallback:8989")
     def test_fallback_to_env_sign_server(self):
         adapter = XhsSinglePostAdapter(cookies="c=1", sign_server_endpoint="")
         assert adapter.sign_server_endpoint == "http://fallback:8989"
diff --git a/tests/unit/scrapers/test_zhihu.py b/tests/unit/scrapers/test_zhihu.py
index 7e21eb1..ff0a03b 100644
--- a/tests/unit/scrapers/test_zhihu.py
+++ b/tests/unit/scrapers/test_zhihu.py
@@ -27,7 +27,7 @@ class TestZhihuConfig:
     def test_config_with_z_c0(self):
         """When ZHIHU_Z_C0 is set, ZHIHU_API_COOKIE uses it."""
         with patch(
-            "fastfetchbot_shared.services.scrapers.config.ZHIHU_Z_C0", "test_token"
+            "fastfetchbot_shared.services.scrapers.config.settings.ZHIHU_Z_C0", "test_token"
         ), patch(
             "fastfetchbot_shared.services.scrapers.config.ZHIHU_COOKIES_JSON", None
         ):
@@ -41,7 +41,7 @@ def test_config_with_cookies_json(self):
         """When ZHIHU_Z_C0 is empty but ZHIHU_COOKIES_JSON is set, use cookies JSON."""
         cookies = [{"name": "a", "value": "1"}, {"name": "b", "value": "2"}]
         with patch(
-            "fastfetchbot_shared.services.scrapers.config.ZHIHU_Z_C0", ""
+            "fastfetchbot_shared.services.scrapers.config.settings.ZHIHU_Z_C0", ""
         ), patch(
             "fastfetchbot_shared.services.scrapers.config.ZHIHU_COOKIES_JSON",
             cookies,
@@ -56,7 +56,7 @@ def test_config_no_cookies(self):
         """When both ZHIHU_Z_C0 and ZHIHU_COOKIES_JSON are empty/None."""
         with patch(
-            "fastfetchbot_shared.services.scrapers.config.ZHIHU_Z_C0", ""
+            "fastfetchbot_shared.services.scrapers.config.settings.ZHIHU_Z_C0", ""
         ), patch(
             "fastfetchbot_shared.services.scrapers.config.ZHIHU_COOKIES_JSON",
             None,
@@ -72,7 +72,7 @@ def test_config_z_c0_takes_precedence(self):
         """ZHIHU_Z_C0 takes priority over ZHIHU_COOKIES_JSON for API cookie."""
         cookies = [{"name": "a", "value": "1"}]
         with patch(
-            "fastfetchbot_shared.services.scrapers.config.ZHIHU_Z_C0", "my_z_c0"
+            "fastfetchbot_shared.services.scrapers.config.settings.ZHIHU_Z_C0", "my_z_c0"
         ), patch(
             "fastfetchbot_shared.services.scrapers.config.ZHIHU_COOKIES_JSON",
             cookies,
@@ -353,7 +353,7 @@ async def test_fxzhihu_answer_with_question_id(self, _patch_zhihu_module):
         ), patch(
             "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None
         ), patch(
-            "fastfetchbot_shared.services.scrapers.zhihu.FXZHIHU_HOST", "fxzhihu.com"
+            "fastfetchbot_shared.services.scrapers.config.settings.FXZHIHU_HOST", "fxzhihu.com"
         ):
             z = Zhihu(url="https://www.zhihu.com/question/100/answer/200")
             z.zhihu_type = "answer"
@@ -372,7 +372,7 @@ async def test_fxzhihu_answer_no_question_id(self, _patch_zhihu_module):
         ), patch(
             "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None
         ), patch(
-            "fastfetchbot_shared.services.scrapers.zhihu.FXZHIHU_HOST", "fxzhihu.com"
+            "fastfetchbot_shared.services.scrapers.config.settings.FXZHIHU_HOST", "fxzhihu.com"
         ):
             z = Zhihu(url="https://www.zhihu.com/answer/200")
             z.zhihu_type = "answer"
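The TestZhihuConfig hunks above encode a precedence rule: ZHIHU_Z_C0 wins, ZHIHU_COOKIES_JSON is the fallback, and neither yields no API cookie. A hypothetical helper that would satisfy those docstrings (the name build_api_cookie and the exact cookie string format are illustrative, not taken from the module):

def build_api_cookie(z_c0: str, cookies_json: list[dict] | None) -> str | None:
    if z_c0:          # ZHIHU_Z_C0 takes priority
        return f"z_c0={z_c0}"
    if cookies_json:  # fall back to the cookies JSON export
        return "; ".join(f"{c['name']}={c['value']}" for c in cookies_json)
    return None       # both empty/None -> no cookie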
@@ -391,7 +391,7 @@ async def test_fxzhihu_article(self, _patch_zhihu_module):
         ), patch(
             "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None
         ), patch(
-            "fastfetchbot_shared.services.scrapers.zhihu.FXZHIHU_HOST", "fxzhihu.com"
+            "fastfetchbot_shared.services.scrapers.config.settings.FXZHIHU_HOST", "fxzhihu.com"
         ):
             z = Zhihu(url="https://zhuanlan.zhihu.com/p/12345")
             z.zhihu_type = "article"
@@ -409,7 +409,7 @@ async def test_fxzhihu_status(self, _patch_zhihu_module):
         ), patch(
             "fastfetchbot_shared.services.scrapers.zhihu.ZHIHU_COOKIES", None
         ), patch(
-            "fastfetchbot_shared.services.scrapers.zhihu.FXZHIHU_HOST", "fxzhihu.com"
+            "fastfetchbot_shared.services.scrapers.config.settings.FXZHIHU_HOST", "fxzhihu.com"
         ):
             z = Zhihu(url="https://www.zhihu.com/pin/999")
             z.zhihu_type = "status"
@@ -2019,7 +2019,7 @@ async def mock_get_response_json(*args, **kwargs):
             new_callable=AsyncMock,
             return_value=mock_resp,
         ), patch(
-            "fastfetchbot_shared.services.scrapers.zhihu.FXZHIHU_HOST", "fxzhihu.com"
+            "fastfetchbot_shared.services.scrapers.config.settings.FXZHIHU_HOST", "fxzhihu.com"
         ):
             z = Zhihu(url="https://www.zhihu.com/question/100/answer/200")
             await z._get_zhihu_item()
@@ -2043,7 +2043,7 @@ async def test_all_methods_fail(self, _patch_zhihu_module):
             new_callable=AsyncMock,
             side_effect=Exception("fx fail"),
         ), patch(
-            "fastfetchbot_shared.services.scrapers.zhihu.FXZHIHU_HOST", "fxzhihu.com"
+            "fastfetchbot_shared.services.scrapers.config.settings.FXZHIHU_HOST", "fxzhihu.com"
         ):
             z = Zhihu(url="https://www.zhihu.com/question/100/answer/200")
             with pytest.raises(Exception):
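All of the patch-target changes above follow from standard unittest.mock.patch mechanics: with module-level constants, `from config import FXZHIHU_HOST` gives every consumer module its own binding, so each test had to patch the name inside the consumer. With one shared settings instance whose attributes are read at call time, patching the attribute on the instance is visible to every call site. Schematically (paths taken from the hunks above):

from unittest.mock import patch

# Before: patch the copied name in each consumer module.
patch("fastfetchbot_shared.services.scrapers.zhihu.FXZHIHU_HOST", "fxzhihu.com")

# After: patch the single settings attribute; mock resolves the module path
# ("...scrapers.config"), then setattr()s FXZHIHU_HOST on the settings object.
patch("fastfetchbot_shared.services.scrapers.config.settings.FXZHIHU_HOST", "fxzhihu.com")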
@patch("fastfetchbot_shared.services.telegraph.AsyncTelegraphPoster") async def test_empty_token_list_creates_token(self, mock_poster_cls): - """Empty list is falsy, so it should create a token.""" + """Empty string parses to None (falsy), so it should create a token.""" mock_poster = AsyncMock() mock_poster_cls.return_value = mock_poster mock_poster.post.return_value = {"url": "https://telegra.ph/page3"} @@ -147,7 +147,7 @@ async def test_empty_token_list_creates_token(self, mock_poster_cls): mock_poster.create_api_token.assert_awaited_once() @pytest.mark.asyncio - @patch("fastfetchbot_shared.services.telegraph.TELEGRAPH_TOKEN_LIST", ["tok"]) + @patch("fastfetchbot_shared.services.scrapers.config.settings.TELEGRAPH_TOKEN_LIST", "tok") @patch("fastfetchbot_shared.services.telegraph.AsyncTelegraphPoster") async def test_exception_returns_empty_string(self, mock_poster_cls): mock_poster = AsyncMock() @@ -160,7 +160,7 @@ async def test_exception_returns_empty_string(self, mock_poster_cls): assert result == "" @pytest.mark.asyncio - @patch("fastfetchbot_shared.services.telegraph.TELEGRAPH_TOKEN_LIST", ["tok"]) + @patch("fastfetchbot_shared.services.scrapers.config.settings.TELEGRAPH_TOKEN_LIST", "tok") @patch("fastfetchbot_shared.services.telegraph.AsyncTelegraphPoster") @patch("fastfetchbot_shared.services.telegraph.DocumentPreprocessor") async def test_exception_during_image_upload_returns_empty( @@ -179,7 +179,7 @@ async def test_exception_during_image_upload_returns_empty( assert result == "" @pytest.mark.asyncio - @patch("fastfetchbot_shared.services.telegraph.TELEGRAPH_TOKEN_LIST", ["tok"]) + @patch("fastfetchbot_shared.services.scrapers.config.settings.TELEGRAPH_TOKEN_LIST", "tok") @patch("fastfetchbot_shared.services.telegraph.AsyncTelegraphPoster") @patch("fastfetchbot_shared.services.telegraph.DocumentPreprocessor") async def test_content_updated_after_image_processing( diff --git a/uv.lock b/uv.lock index b6fb501..e51b0bd 100644 --- a/uv.lock +++ b/uv.lock @@ -898,6 +898,7 @@ dependencies = [ { name = "pillow" }, { name = "playwright" }, { name = "pydantic" }, + { name = "pydantic-settings" }, { name = "python-magic" }, { name = "sqlalchemy", extra = ["asyncio"] }, ] @@ -945,6 +946,7 @@ requires-dist = [ { name = "pillow", specifier = ">=10.0.0" }, { name = "playwright", specifier = ">=1.52.0" }, { name = "pydantic", specifier = ">=2.0.0" }, + { name = "pydantic-settings", specifier = ">=2.0.0" }, { name = "python-magic", specifier = ">=0.4.27" }, { name = "sqlalchemy", extras = ["asyncio"], specifier = ">=2.0.0" }, { name = "tenacity", marker = "extra == 'scrapers'", specifier = ">=9.1.2" }, @@ -1709,6 +1711,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" }, ] +[[package]] +name = "pydantic-settings" +version = "2.13.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/52/6d/fffca34caecc4a3f97bda81b2098da5e8ab7efc9a66e819074a11955d87e/pydantic_settings-2.13.1.tar.gz", hash = "sha256:b4c11847b15237fb0171e1462bf540e294affb9b86db4d9aa5c01730bdbe4025", size = 223826, upload-time = 
"2026-02-19T13:45:08.055Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/4b/ccc026168948fec4f7555b9164c724cf4125eac006e176541483d2c959be/pydantic_settings-2.13.1-py3-none-any.whl", hash = "sha256:d56fd801823dbeae7f0975e1f8c8e25c258eb75d278ea7abb5d9cebb01b56237", size = 58929, upload-time = "2026-02-19T13:45:06.034Z" }, +] + [[package]] name = "pydub" version = "0.25.1"