From 4baabf289f95beddd584ff26e7acd4e548c3fbed Mon Sep 17 00:00:00 2001 From: TAP Date: Wed, 17 Dec 2025 22:42:41 +0530 Subject: [PATCH 01/13] feedback payload fix --- .env.prod | 119 ++++ api/api.py | 11 + app.py | 4 + database/db_manager.py | 11 + mq/rmq_client.py | 41 +- plag_checker/submissions_checker.py | 7 + requirements.txt | 6 +- start-dev-env.README.md | 698 ++++++++++---------- start-dev-env.sh | 944 ++++++++++++++-------------- 9 files changed, 1009 insertions(+), 832 deletions(-) create mode 100644 .env.prod mode change 100644 => 100755 start-dev-env.sh diff --git a/.env.prod b/.env.prod new file mode 100644 index 0000000..101b48d --- /dev/null +++ b/.env.prod @@ -0,0 +1,119 @@ +# RABBITMQ CONFIGURATION +RABBITMQ_HOST=armadillo.rmq.cloudamqp.com +RABBITMQ_PORT=5672 +RABBITMQ_USER=fzdqidte +RABBITMQ_PASS=0SMrDogBVcWUcu9brWwp2QhET_kArl59 +RABBITMQ_VHOST=fzdqidte +RABBITMQ_MANAGEMENT_PORT=15672 +RABBITMQ_PREFETCH_COUNT=1 + +# Message retry configuration +# Maximum number of retries before sending to DLQ (prevents poison messages) +MAX_RETRIES=3 + +# Queue Names +SUBMISSION_QUEUE=plagiarism_submissions +FEEDBACK_QUEUE=plagiarism_feedback +# Dead Letter Queue (optional - leave empty to disable) +DEAD_LETTER_QUEUE=plagiarism_failed_submissions + +# POSTGRESQL CONFIGURATION +POSTGRES_HOST=db.example.com +POSTGRES_PORT=5432 +POSTGRES_DB=plagiarism_db +POSTGRES_USER=postgres +POSTGRES_PASSWORD=postgres + +# PGADMIN CONFIGURATION (Optional - for development only) +PGADMIN_EMAIL=admin@admin.com +PGADMIN_PASSWORD=admin123 + +# Connection Pool +POSTGRES_POOL_SIZE=10 +POSTGRES_MAX_OVERFLOW=20 + +# PLAGIARISM DETECTION THRESHOLDS +EXACT_DUPLICATE_THRESHOLD=0.95 +NEAR_DUPLICATE_THRESHOLD=0.90 +SEMANTIC_MATCH_THRESHOLD=0.80 + +# ==== PRODUCTION: Uncomment below for 7-day window ==== +RESUBMISSION_WINDOW_DAYS=14 + +# Hash comparison threshold (Hamming distance) +HASH_MATCH_THRESHOLD=10 + +# IMAGE PROCESSING +# Maximum image size in MB +MAX_IMAGE_SIZE_MB=10 + +# Image 
download timeout in seconds +IMAGE_DOWNLOAD_TIMEOUT=30 + +# Image validation thresholds +# Min variance to detect blank images (lower = more strict) +IMAGE_MIN_VARIANCE=5.0 +# Min unique colors required +IMAGE_MIN_UNIQUE_COLORS=10 +# Max ratio of dominant color (higher = more permissive) +IMAGE_MAX_SOLID_COLOR_RATIO=0.95 + +# CLIP Model Configuration +CLIP_MODEL=ViT-L/14 +CLIP_DEVICE=cpu +CLIP_PRETRAINED=laion2B-s32B-b82K + +# Local Model Path (Optional - use pre-downloaded models) +# If set, the system will load the model from this path instead of downloading from HuggingFace +# Example: CLIP_LOCAL_MODEL_PATH=./models/clip/open_clip_pytorch_model.bin +# Download models from: https://huggingface.co/laion/CLIP-ViT-L-14-laion2B-s32B-b82K +CLIP_LOCAL_MODEL_PATH=/app/models/clip/open_clip_pytorch_model.bin + +# Disable SSL verification for HuggingFace downloads (for corporate proxy/self-signed certs) +# Set to "true" only if you encounter SSL certificate errors +DISABLE_SSL_VERIFY=true +PYTHONHTTPSVERIFY=0 + +# VECTOR SEARCH CONFIGURATION +# Use pgvector (PostgreSQL) or FAISS for vector similarity search +USE_PGVECTOR=true + +# FAISS Configuration +FAISS_INDEX_PATH=/app/data/faiss_index.bin +FAISS_METADATA_PATH=/app/data/faiss_metadata.json +FAISS_DIMENSION=768 +FAISS_TOP_K=4 # Number of top candidates to retrieve from FAISS search + +# STORAGE PATHS +# Reference images directory +REFERENCE_IMAGES_DIR=./data/reference_images + +# Temporary storage for downloaded submissions +TEMP_IMAGES_DIR=./data/temp_images + +# Logs directory +#LOGS_DIR=./logs + +# APPLICATION SETTINGS +LOG_LEVEL=INFO + +# Worker concurrency (number of threads) +#WORKER_THREADS=4 + +# Enable performance metrics(ignore) +#ENABLE_METRICS=true + +# DEVELOPMENT SETTINGS +# Set to "development" or "production" +#ENVIRONMENT=development + +#DEBUG=true + + + +# Mock Glific API (for testing without WhatsApp) +#Used to skip WhatsApp delivery in testing mode +MOCK_GLIFIC=true +# ==== TESTING: 2-minute 
resubmission window (comment out for production) ==== +RESUBMISSION_WINDOW_MINUTES=2 + diff --git a/api/api.py b/api/api.py index 0d42d81..e8bc7ff 100644 --- a/api/api.py +++ b/api/api.py @@ -11,6 +11,7 @@ import aio_pika import json import uuid +from dotenv import load_dotenv from dotenv import load_dotenv import os @@ -26,6 +27,8 @@ ) logger = logging.getLogger(__name__) +load_dotenv() + # RabbitMQ Configuration RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "localhost") RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", "5672") @@ -33,6 +36,14 @@ RABBITMQ_USER = os.getenv("RABBITMQ_USER", "admin") RABBITMQ_PASS = os.getenv("RABBITMQ_PASS", "admin123") +#PRINT THE RABBITMQ CONFIG FOR DEBUGGING +# logger.info("###################") +# logger.info(f"RABBITMQ_HOST={RABBITMQ_HOST}") +# logger.info(f"RABBITMQ_PORT={RABBITMQ_PORT}") +# logger.info(f"RABBITMQ_VHOST={RABBITMQ_VHOST}") +# logger.info(f"RABBITMQ_USER={RABBITMQ_USER}") +# logger.info("###################") + SUBMISSION_QUEUE = os.getenv("SUBMISSION_QUEUE", "plagiarism_submissions") FEEDBACK_QUEUE = os.getenv("FEEDBACK_QUEUE", "plagiarism_feedback") diff --git a/app.py b/app.py index b54e76d..344e037 100644 --- a/app.py +++ b/app.py @@ -30,6 +30,10 @@ def validate_configuration(): ] missing = [var for var in required_env_vars if not os.getenv(var)] + #print the required env vars and their values for debugging + # for var in required_env_vars: + # logger.info("###################") + # logger.info(f"{var}={os.getenv(var)}") if missing: logger.error(f"Missing required environment variables: {missing}") raise ValueError(f"Missing required environment variables: {missing}") diff --git a/database/db_manager.py b/database/db_manager.py index 9263f3d..6db7936 100644 --- a/database/db_manager.py +++ b/database/db_manager.py @@ -67,6 +67,17 @@ async def init_pool(self): db_name = os.getenv("POSTGRES_DB") or os.getenv("DB_NAME") db_host = os.getenv("POSTGRES_HOST") or os.getenv("DB_HOST", "localhost") db_port = 
int(os.getenv("POSTGRES_PORT") or os.getenv("DB_PORT", "5432")) + # db_port = 5435 # TEMP OVERRIDE FOR TESTING + + #print the db connection details for debugging + # logger.info("###################") + # logger.info(f"DB Host: {db_host}") + # logger.info(f"DB Port: {db_port}") + # logger.info(f"DB Name: {db_name}") + # logger.info(f"DB User: {db_user}") + # logger.info(f"DB db_password: {db_password}") + # logger.info("###################") + if not all([db_user, db_password, db_name]): raise ValueError("Missing required database environment variables") diff --git a/mq/rmq_client.py b/mq/rmq_client.py index f99877f..6571c0b 100644 --- a/mq/rmq_client.py +++ b/mq/rmq_client.py @@ -103,17 +103,38 @@ async def connect(self): ) logger.info(f"Dead Letter Queue declared: {self.DEAD_LETTER_QUEUE}") - # Declare main submission queue - self.submission_queue = await self.channel.declare_queue( - self.SUBMISSION_QUEUE, durable=True - ) - logger.info(f"Submission queue declared: {self.SUBMISSION_QUEUE}") + try: + # First try passive declaration to check if queue exists + self.submission_queue = await self.channel.declare_queue( + self.SUBMISSION_QUEUE, + durable=True, + passive=True # Only check, don't create + ) + logger.info(f"Submission queue already exists: {self.SUBMISSION_QUEUE}") + except Exception: + # Queue doesn't exist, create it + self.submission_queue = await self.channel.declare_queue( + self.SUBMISSION_QUEUE, + durable=True + ) + logger.info(f"Submission queue created: {self.SUBMISSION_QUEUE}") + + try: + # First try passive declaration to check if queue exists + self.feedback_queue = await self.channel.declare_queue( + self.FEEDBACK_QUEUE, + durable=True, + passive=True # Only check, don't create + ) + logger.info(f"Feedback queue already exists: {self.FEEDBACK_QUEUE}") + except Exception: + # Queue doesn't exist, create it + self.feedback_queue = await self.channel.declare_queue( + self.FEEDBACK_QUEUE, + durable=True + ) + logger.info(f"Feedback queue 
created: {self.FEEDBACK_QUEUE}") - # Declare feedback queue for publishing results - self.feedback_queue = await self.channel.declare_queue( - self.FEEDBACK_QUEUE, durable=True - ) - logger.info(f"Feedback queue declared: {self.FEEDBACK_QUEUE}") logger.info( f"Connected to RabbitMQ with prefetch_count={self.PREFETCH_COUNT}, all queues declared" diff --git a/plag_checker/submissions_checker.py b/plag_checker/submissions_checker.py index 0787003..4fb46a8 100644 --- a/plag_checker/submissions_checker.py +++ b/plag_checker/submissions_checker.py @@ -119,6 +119,8 @@ async def initialize(self): if self.image_worker is None: raise RuntimeError("ImageWorker failed to initialize") + logger.info("Submission Checker initialized successfully") + await self.start_consumer() async def process_submission(self, submission): @@ -282,6 +284,11 @@ async def process_submission(self, submission): data["similarity_score"] = result_text.get("similarity_score") data["is_plagiarized"] = result_text.get("is_plagiarized") data["match_type"] = result_text.get("match_type") + data["assignment_id"] = data.pop("assign_id") + data["is_ai_generated"] = result_text.get("is_ai_generated", False) + data["ai_detection_source"] = result_text.get("ai_detection_source", "") + data["ai_confidence"] = result_text.get("ai_confidence", 0.0) + data["plagiarism_source"] = result_text.get("plagiarism_source", "") publish_data = {k: v for k, v in data.items() if k != "db_record_id"} diff --git a/requirements.txt b/requirements.txt index c2db3ae..37d4505 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,8 @@ numpy==2.3.4 # Utilities tqdm==4.67.1 -podman-compose==1.5.0 \ No newline at end of file +podman-compose==1.5.0 + +# Web Framework - FastAPI +fastapi==0.115.0 +uvicorn[standard]==0.34.0 \ No newline at end of file diff --git a/start-dev-env.README.md b/start-dev-env.README.md index c49ad6d..a13e591 100644 --- a/start-dev-env.README.md +++ b/start-dev-env.README.md @@ -1,349 +1,349 @@ -# Local 
Development Setup Scripts - Quick Guide - -## Overview -Automated scripts to set up local development environment for MentorMe Plagiarism Detection System. - ---- - -## Files -- `start-dev-env.sh` - Bash script (Linux/macOS/Git Bash) -- `start-dev-env.ps1` - PowerShell script (Windows) - ---- - -## Quick Start - -### **Option 1: Complete Setup (Recommended)** -Everything in one command - infrastructure + Python environment + dependencies: - -```bash -# Linux/macOS/Git Bash -chmod +x start-dev-env.sh -./start-dev-env.sh --full-setup - -# Windows PowerShell -.\start-dev-env.ps1 --full-setup -``` - -**What it does:** -- Creates PostgreSQL container (port 5432) -- Creates RabbitMQ container (ports 5672, 15672) -- Initializes database schema -- Creates `.env` configuration file -- Creates Python virtual environment -- Installs all dependencies (~5-10 minutes) -- Downloads CLIP model from HuggingFace (~3.5GB) -- Verifies setup - -**Time:** ~10-15 minutes (first run) - ---- - -### **Option 2: Infrastructure Only** (Default) -Just containers + database, manual Python setup: - -```bash -# Linux/macOS/Git Bash -./start-dev-env.sh - -# Windows PowerShell -.\start-dev-env.ps1 -``` - -**What it does:** -- Creates PostgreSQL + RabbitMQ containers -- Initializes database -- Creates `.env` file - -**Then manually:** -```bash -python -m venv venv -source venv/bin/activate # Linux/macOS -# OR -.\venv\Scripts\Activate.ps1 # Windows - -pip install -r requirements.txt -``` - ---- - -### **Option 3: With Wheelhouse** (Offline Installs) -Pre-compile dependencies for faster/offline installs: - -```bash -# Linux/macOS/Git Bash -./start-dev-env.sh --build-wheelhouse - -# Then install offline: -pip install --no-index --find-links=wheelhouse -r requirements.txt -``` - ---- - -## Available Flags - -| Flag | Description | -|------|-------------| -| `--full-setup` | Complete setup: infrastructure + Python + dependencies | -| `--build-wheelhouse` | Build wheelhouse for offline dependency 
installation | -| (no flags) | Infrastructure only (containers + database) | - ---- - -## What Gets Created - -### **Infrastructure** -| Service | Port | Access | Credentials | -|---------|------|--------|-------------| -| PostgreSQL | 5432 | localhost:5432 | postgres/postgres | -| RabbitMQ (AMQP) | 5672 | localhost:5672 | admin/admin123 | -| RabbitMQ (Management UI) | 15672 | http://localhost:15672 | guest/guest | - -### **Files** -- `.env` - Environment configuration (from `.env.example`) -- `venv/` - Python virtual environment (if `--full-setup`) -- `data/` - Data directories (reference_images, temp_images) -- `models/` - Model cache directory -- `logs/` - Application logs directory - -### **Database** -- Database: `plagiarism_db` -- Tables: `submissions`, `reference_images`, `feedback_logs` -- Extension: pgvector -- Indexes: B-tree, HNSW vector indexes - ---- - -## After Setup - -### **Start the Application** - -**Terminal 1 - Worker:** -```bash -source venv/bin/activate # Linux/macOS -# OR -.\venv\Scripts\Activate.ps1 # Windows - -python app.py -``` - -**Terminal 2 - API Server:** -```bash -source venv/bin/activate - -uvicorn api:app --reload --host 0.0.0.0 --port 8000 -``` - -**Access API:** http://localhost:8000/docs - ---- - -## Prerequisites - -### **Required** -- **Podman** (or Docker) - Container runtime -- **Python 3.10+** - Application runtime -- **8GB+ RAM** - For CLIP model -- **10GB+ disk** - For dependencies and models - -### **Optional** -- **CUDA GPU** - For faster CLIP inference (10x speedup) -- **curl** - For health checks - ---- - -## Troubleshooting - -### **"Podman not found"** -```bash -# Install Podman: https://podman.io/getting-started/installation -``` - -### **"Python not found"** -```bash -# Install Python 3.10+: https://www.python.org/downloads/ -``` - -### **"Port already in use"** -```bash -# Stop existing containers -podman stop mentorme-postgres mentorme-rabbitmq -podman rm mentorme-postgres mentorme-rabbitmq - -# Or change ports 
in script (POSTGRES_PORT, RABBITMQ_PORT) -``` - -### **"Database connection failed"** -```bash -# Check PostgreSQL is running -podman ps | grep mentorme-postgres - -# Check logs -podman logs mentorme-postgres - -# Restart container -podman restart mentorme-postgres -``` - -### **"RabbitMQ not ready"** -```bash -# Check RabbitMQ is running -podman ps | grep mentorme-rabbitmq - -# Access management UI -open http://localhost:15672 # guest/guest - -# Restart container -podman restart mentorme-rabbitmq -``` - ---- - -## Container Management - -### **View Logs** -```bash -podman logs mentorme-postgres -podman logs mentorme-rabbitmq -podman logs -f mentorme-postgres # Follow mode -``` - -### **Stop Containers** -```bash -podman stop mentorme-postgres mentorme-rabbitmq -``` - -### **Remove Containers** -```bash -podman rm mentorme-postgres mentorme-rabbitmq -``` - -### **Restart Containers** -```bash -podman restart mentorme-postgres mentorme-rabbitmq -``` - -### **Check Running Containers** -```bash -podman ps -``` - ---- - -## Configuration Override - -### **Edit `.env` After Creation** -Script creates `.env` from `.env.example` with localhost overrides. You can modify: - -```bash -# Example: Use different CLIP model -CLIP_MODEL=ViT-B/32 # Smaller, faster model (512D) - -# Example: Enable GPU -CLIP_DEVICE=cuda - -# Example: Enable pgvector instead of FAISS -USE_PGVECTOR=true -``` - -### **Environment Variables Priority** -1. System environment variables (highest) -2. `.env` file -3. 
`config.py` defaults (lowest) - ---- - -## What This Script Does NOT Do - - **Does not start the application** - You must run `python app.py` and `uvicorn api:app` - **Does not seed reference images** - Use `./seeding/seed-data.sh` or `python seeding/seed_ref_images.py` - **Does not expose port 8000** - Only exposed when API is running - **Does not use Docker Compose** - Uses Podman containers directly - ---- - -## Comparison: Script vs Docker Compose - -| Feature | This Script | Docker Compose | -|---------|-------------|----------------| -| **Tool** | Podman | Docker | -| **Python App** | Runs on host | Runs in container | -| **Development** | Faster (direct edits) | Requires rebuild | -| **Debugging** | Native debugger | Remote debugging | -| **Production** | Not recommended | Best practice | -| **Dependencies** | Installed on host | Isolated in container | - ---- - -## Examples - -### **First-Time Setup** -```bash -# Complete automated setup -./start-dev-env.sh --full-setup - -# Start worker -source venv/bin/activate -python app.py - -# In another terminal, start API -source venv/bin/activate -uvicorn api:app --host 0.0.0.0 --port 8000 -``` - -### **Daily Development** -```bash -# Containers already exist, just start them -podman start mentorme-postgres mentorme-rabbitmq - -# Activate venv and run -source venv/bin/activate -python app.py -``` - -### **Clean Restart** -```bash -# Stop and remove everything -podman stop mentorme-postgres mentorme-rabbitmq -podman rm mentorme-postgres mentorme-rabbitmq - -# Run script again -./start-dev-env.sh --full-setup -``` - ---- - -## Next Steps After Setup - -1. **(Optional) Seed reference images:** - ```bash - ./seeding/seed-data.sh --ref-images - # Or directly: python seeding/seed_ref_images.py --directory data/reference_images - ``` - -2. **Test the system:** - ```bash - python tests/simulation_e2e.py --vm-ip localhost \ - --image https://example.com/test.jpg \ - --student-id ST001 --assign-id A001 - ``` - -3. 
**Access API documentation:** - - OpenAPI: http://localhost:8000/docs - - ReDoc: http://localhost:8000/redoc - -4. **Monitor queues:** - - RabbitMQ UI: http://localhost:15672 - ---- - -## Support - -- **Documentation:** See `DOCUMENTATION.md` for complete system documentation -- **Issues:** Check logs in `logs/` directory -- **Database:** Connect with any PostgreSQL client to `localhost:5432` - ---- - -**Last Updated:** November 6, 2025 -**Version:** 1.0.0 +# Local Development Setup Scripts - Quick Guide + +## Overview +Automated scripts to set up local development environment for MentorMe Plagiarism Detection System. + +--- + +## Files +- `start-dev-env.sh` - Bash script (Linux/macOS/Git Bash) +- `start-dev-env.ps1` - PowerShell script (Windows) + +--- + +## Quick Start + +### **Option 1: Complete Setup (Recommended)** +Everything in one command - infrastructure + Python environment + dependencies: + +```bash +# Linux/macOS/Git Bash +chmod +x start-dev-env.sh +./start-dev-env.sh --full-setup + +# Windows PowerShell +.\start-dev-env.ps1 --full-setup +``` + +**What it does:** +- Creates PostgreSQL container (port 5432) +- Creates RabbitMQ container (ports 5672, 15672) +- Initializes database schema +- Creates `.env` configuration file +- Creates Python virtual environment +- Installs all dependencies (~5-10 minutes) +- Downloads CLIP model from HuggingFace (~3.5GB) +- Verifies setup + +**Time:** ~10-15 minutes (first run) + +--- + +### **Option 2: Infrastructure Only** (Default) +Just containers + database, manual Python setup: + +```bash +# Linux/macOS/Git Bash +./start-dev-env.sh + +# Windows PowerShell +.\start-dev-env.ps1 +``` + +**What it does:** +- Creates PostgreSQL + RabbitMQ containers +- Initializes database +- Creates `.env` file + +**Then manually:** +```bash +python -m venv venv +source venv/bin/activate # Linux/macOS +# OR +.\venv\Scripts\Activate.ps1 # Windows + +pip install -r requirements.txt +``` + +--- + +### **Option 3: With Wheelhouse** (Offline 
Installs) +Pre-compile dependencies for faster/offline installs: + +```bash +# Linux/macOS/Git Bash +./start-dev-env.sh --build-wheelhouse + +# Then install offline: +pip install --no-index --find-links=wheelhouse -r requirements.txt +``` + +--- + +## Available Flags + +| Flag | Description | +|------|-------------| +| `--full-setup` | Complete setup: infrastructure + Python + dependencies | +| `--build-wheelhouse` | Build wheelhouse for offline dependency installation | +| (no flags) | Infrastructure only (containers + database) | + +--- + +## What Gets Created + +### **Infrastructure** +| Service | Port | Access | Credentials | +|---------|------|--------|-------------| +| PostgreSQL | 5432 | localhost:5432 | postgres/postgres | +| RabbitMQ (AMQP) | 5672 | localhost:5672 | admin/admin123 | +| RabbitMQ (Management UI) | 15672 | http://localhost:15672 | guest/guest | + +### **Files** +- `.env` - Environment configuration (from `.env.example`) +- `venv/` - Python virtual environment (if `--full-setup`) +- `data/` - Data directories (reference_images, temp_images) +- `models/` - Model cache directory +- `logs/` - Application logs directory + +### **Database** +- Database: `plagiarism_db` +- Tables: `submissions`, `reference_images`, `feedback_logs` +- Extension: pgvector +- Indexes: B-tree, HNSW vector indexes + +--- + +## After Setup + +### **Start the Application** + +**Terminal 1 - Worker:** +```bash +source venv/bin/activate # Linux/macOS +# OR +.\venv\Scripts\Activate.ps1 # Windows + +python app.py +``` + +**Terminal 2 - API Server:** +```bash +source venv/bin/activate + +uvicorn api:app --reload --host 0.0.0.0 --port 8000 +``` + +**Access API:** http://localhost:8000/docs + +--- + +## Prerequisites + +### **Required** +- **Podman** (or Docker) - Container runtime +- **Python 3.10+** - Application runtime +- **8GB+ RAM** - For CLIP model +- **10GB+ disk** - For dependencies and models + +### **Optional** +- **CUDA GPU** - For faster CLIP inference (10x speedup) 
+- **curl** - For health checks + +--- + +## Troubleshooting + +### **"Podman not found"** +```bash +# Install Podman: https://podman.io/getting-started/installation +``` + +### **"Python not found"** +```bash +# Install Python 3.10+: https://www.python.org/downloads/ +``` + +### **"Port already in use"** +```bash +# Stop existing containers +podman stop mentorme-postgres mentorme-rabbitmq +podman rm mentorme-postgres mentorme-rabbitmq + +# Or change ports in script (POSTGRES_PORT, RABBITMQ_PORT) +``` + +### **"Database connection failed"** +```bash +# Check PostgreSQL is running +podman ps | grep mentorme-postgres + +# Check logs +podman logs mentorme-postgres + +# Restart container +podman restart mentorme-postgres +``` + +### **"RabbitMQ not ready"** +```bash +# Check RabbitMQ is running +podman ps | grep mentorme-rabbitmq + +# Access management UI +open http://localhost:15672 # guest/guest + +# Restart container +podman restart mentorme-rabbitmq +``` + +--- + +## Container Management + +### **View Logs** +```bash +podman logs mentorme-postgres +podman logs mentorme-rabbitmq +podman logs -f mentorme-postgres # Follow mode +``` + +### **Stop Containers** +```bash +podman stop mentorme-postgres mentorme-rabbitmq +``` + +### **Remove Containers** +```bash +podman rm mentorme-postgres mentorme-rabbitmq +``` + +### **Restart Containers** +```bash +podman restart mentorme-postgres mentorme-rabbitmq +``` + +### **Check Running Containers** +```bash +podman ps +``` + +--- + +## Configuration Override + +### **Edit `.env` After Creation** +Script creates `.env` from `.env.example` with localhost overrides. You can modify: + +```bash +# Example: Use different CLIP model +CLIP_MODEL=ViT-B/32 # Smaller, faster model (512D) + +# Example: Enable GPU +CLIP_DEVICE=cuda + +# Example: Enable pgvector instead of FAISS +USE_PGVECTOR=true +``` + +### **Environment Variables Priority** +1. System environment variables (highest) +2. `.env` file +3. 
`config.py` defaults (lowest) + +--- + +## What This Script Does NOT Do + + **Does not start the application** - You must run `python app.py` and `uvicorn api:app` + **Does not seed reference images** - Use `./seeding/seed-data.sh` or `python seeding/seed_ref_images.py` + **Does not expose port 8000** - Only exposed when API is running + **Does not use Docker Compose** - Uses Podman containers directly + +--- + +## Comparison: Script vs Docker Compose + +| Feature | This Script | Docker Compose | +|---------|-------------|----------------| +| **Tool** | Podman | Docker | +| **Python App** | Runs on host | Runs in container | +| **Development** | Faster (direct edits) | Requires rebuild | +| **Debugging** | Native debugger | Remote debugging | +| **Production** | Not recommended | Best practice | +| **Dependencies** | Installed on host | Isolated in container | + +--- + +## Examples + +### **First-Time Setup** +```bash +# Complete automated setup +./start-dev-env.sh --full-setup + +# Start worker +source venv/bin/activate +python app.py + +# In another terminal, start API +source venv/bin/activate +uvicorn api:app --host 0.0.0.0 --port 8000 +``` + +### **Daily Development** +```bash +# Containers already exist, just start them +podman start mentorme-postgres mentorme-rabbitmq + +# Activate venv and run +source venv/bin/activate +python app.py +``` + +### **Clean Restart** +```bash +# Stop and remove everything +podman stop mentorme-postgres mentorme-rabbitmq +podman rm mentorme-postgres mentorme-rabbitmq + +# Run script again +./start-dev-env.sh --full-setup +``` + +--- + +## Next Steps After Setup + +1. **(Optional) Seed reference images:** + ```bash + ./seeding/seed-data.sh --ref-images + # Or directly: python seeding/seed_ref_images.py --directory data/reference_images + ``` + +2. **Test the system:** + ```bash + python tests/simulation_e2e.py --vm-ip localhost \ + --image https://example.com/test.jpg \ + --student-id ST001 --assign-id A001 + ``` + +3. 
**Access API documentation:** + - OpenAPI: http://localhost:8000/docs + - ReDoc: http://localhost:8000/redoc + +4. **Monitor queues:** + - RabbitMQ UI: http://localhost:15672 + +--- + +## Support + +- **Documentation:** See `DOCUMENTATION.md` for complete system documentation +- **Issues:** Check logs in `logs/` directory +- **Database:** Connect with any PostgreSQL client to `localhost:5432` + +--- + +**Last Updated:** November 6, 2025 +**Version:** 1.0.0 diff --git a/start-dev-env.sh b/start-dev-env.sh old mode 100644 new mode 100755 index 298f205..15a9729 --- a/start-dev-env.sh +++ b/start-dev-env.sh @@ -1,472 +1,472 @@ -#!/usr/bin/env bash -# Local Development Startup Script -# Starts PostgreSQL and RabbitMQ containers using docker-compose - -set -e -FULL_SETUP=0 -START_API=0 -COMPOSE_FILE="docker-compose-dev.yml" - -while [[ "$#" -gt 0 ]]; do - case "$1" in - --full-setup) FULL_SETUP=1; shift ;; - --with-api) START_API=1; shift ;; - --prod) COMPOSE_FILE="docker-compose-prod.yml"; shift ;; - --dev) COMPOSE_FILE="docker-compose-dev.yml"; shift ;; - *) break ;; - esac -done - -# Detect platform -OS_TYPE="unknown" -UNAME_OUT=$(uname -s 2>/dev/null || true) -case "${UNAME_OUT}" in - MINGW*|MSYS*|CYGWIN*) OS_TYPE="windows-msys" ;; - Darwin) OS_TYPE="macos" ;; - Linux) OS_TYPE="linux" ;; - *) OS_TYPE="unix" ;; -esac - -# Detect Python executable -PY="" -if [ "${OS_TYPE}" = "windows-msys" ]; then - PY_CANDIDATES=("py" "python3" "python" "python.exe") -else - PY_CANDIDATES=("python3" "python" "py" "python.exe") -fi - -for candidate in "${PY_CANDIDATES[@]}"; do - if [ "$candidate" = "py" ]; then - if command -v py >/dev/null 2>&1; then - if py -3 -c "import sys; sys.stdout.write('Python found!!\n')" 2>/dev/null; then - PY='py -3' - break - fi - fi - continue - fi - - candidate_path=$(command -v "$candidate" 2>/dev/null || true) - if [ -n "$candidate_path" ]; then - case "$candidate_path" in - *WindowsApps*|*windowsapps*) continue ;; - esac - - if "$candidate" -c "import 
sys; sys.stdout.write('ok')" 2>/dev/null; then - PY="$candidate" - break - fi - fi -done - -if [ -z "$PY" ]; then - echo "ERROR: No Python 3.10+ found. Install Python or ensure it's in PATH." >&2 - exit 1 -fi - -# Detect container runtime (Podman or Docker) -CONTAINER_CMD="" -COMPOSE_CMD="" -if command -v podman &> /dev/null; then - CONTAINER_CMD="podman" - if command -v podman-compose &> /dev/null; then - COMPOSE_CMD="podman-compose" - else - echo "ERROR: podman-compose not found. Install it:" >&2 - echo " pip install podman-compose" >&2 - exit 1 - fi -elif command -v docker &> /dev/null; then - CONTAINER_CMD="docker" - if command -v docker-compose &> /dev/null; then - COMPOSE_CMD="docker-compose" - elif docker compose version &> /dev/null; then - COMPOSE_CMD="docker compose" - else - echo "ERROR: docker-compose not found. Install it:" >&2 - echo " https://docs.docker.com/compose/install/" >&2 - exit 1 - fi -else - echo "ERROR: Neither Podman nor Docker found. Install one of them:" >&2 - echo " Podman: https://podman.io/getting-started/installation" >&2 - echo " Docker: https://docs.docker.com/get-docker/" >&2 - exit 1 -fi - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -CYAN='\033[0;36m' -GRAY='\033[0;37m' -NC='\033[0m' - -echo -e "${CYAN}==================================================================" -echo -e " MentorMe Plagiarism Checker - Local Development Setup" -echo -e " Container Runtime: ${CONTAINER_CMD}" -echo -e " Compose File: ${COMPOSE_FILE}" -echo -e "==================================================================${NC}" -echo "" - -# Validate compose file exists -if [ ! 
-f "$COMPOSE_FILE" ]; then - echo -e "${RED}ERROR: $COMPOSE_FILE not found${NC}" - exit 1 -fi - -# Load configuration from .env if it exists -if [ -f ".env" ]; then - set -a - source <(grep -v '^#' .env | grep -v '^$' | sed 's/\r$//') - set +a -fi - -# Configuration (with defaults) -POSTGRES_CONTAINER="mentorme-plagiarism-postgres" -RABBITMQ_CONTAINER="mentorme-plagiarism-rabbitmq" -POSTGRES_PORT="${POSTGRES_PORT:-5432}" -RABBITMQ_PORT="${RABBITMQ_PORT:-5672}" -RABBITMQ_MGMT_PORT="${RABBITMQ_MANAGEMENT_PORT:-15672}" -POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-postgres}" -POSTGRES_DB="${POSTGRES_DB:-plagiarism_db}" -POSTGRES_USER="${POSTGRES_USER:-postgres}" -RABBITMQ_USER="${RABBITMQ_USER:-admin}" -RABBITMQ_PASS="${RABBITMQ_PASS:-admin123}" -CLIP_MODEL_URL="${CLIP_MODEL_URL:-https://huggingface.co/laion/CLIP-ViT-L-14-laion2B-s32B-b82K/resolve/main/open_clip_pytorch_model.bin}" - -stop_existing_containers() { - local containers_exist=false - - # Check if compose stack is running - if $COMPOSE_CMD -f $COMPOSE_FILE ps 2>/dev/null | grep -q "Up\|running"; then - containers_exist=true - fi - - if [ "$containers_exist" = true ]; then - echo -e "${YELLOW}Existing containers found:${NC}" - $COMPOSE_CMD -f $COMPOSE_FILE ps - echo "" - echo -e "${YELLOW}This will stop and remove existing containers.${NC}" - read -p "Continue? (y/N): " -n 1 -r - echo - - if [[ ! 
$REPLY =~ ^[Yy]$ ]]; then - echo -e "${RED}Aborted by user${NC}" - # exit 0 - else - echo -e "${CYAN}Stopping existing containers...${NC}" - $COMPOSE_CMD -f $COMPOSE_FILE down - fi - fi - - echo -e "${GREEN}[OK] Ready to start containers${NC}" -} - -start_containers() { - echo -e "${CYAN}Starting containers with $COMPOSE_FILE...${NC}" - - # Determine which services to start - #local services="postgres rabbitmq pgadmin plagiarism-checker" - local services="postgres rabbitmq plagiarism-checker" - - if [ "$START_API" -eq 1 ]; then - services="$services api" - echo -e "${CYAN}Including API service${NC}" - - # Force rebuild API container to ensure it uses Dockerfile.api (not Dockerfile) - echo -e "${YELLOW}Rebuilding API container with Dockerfile.api...${NC}" - $COMPOSE_CMD -f $COMPOSE_FILE build --no-cache api - - if [ $? -ne 0 ]; then - echo -e "${RED}ERROR: Failed to build API container${NC}" - exit 1 - fi - echo -e "${GREEN}[OK] API container rebuilt${NC}" - fi - - $COMPOSE_CMD -f $COMPOSE_FILE up -d $services - - if [ $? 
-ne 0 ]; then - echo -e "${RED}ERROR: Failed to start containers${NC}" - exit 1 - fi - - echo -e "${GREEN}[OK] Containers started${NC}" -} - -wait_for_postgres() { - echo -e "${YELLOW}Waiting for PostgreSQL...${NC}" - - local max_attempts=30 - local attempt=0 - - while [ $attempt -lt $max_attempts ]; do - attempt=$((attempt + 1)) - - if $CONTAINER_CMD exec $POSTGRES_CONTAINER pg_isready -U $POSTGRES_USER &>/dev/null; then - echo -e "${GREEN}[OK] PostgreSQL ready${NC}" - return 0 - fi - - sleep 2 - done - - echo -e "${RED}ERROR: PostgreSQL timeout${NC}" - exit 1 -} - -wait_for_rabbitmq() { - echo -e "${YELLOW}Waiting for RabbitMQ...${NC}" - - local max_attempts=30 - local attempt=0 - - while [ $attempt -lt $max_attempts ]; do - attempt=$((attempt + 1)) - - if curl -s -f http://localhost:$RABBITMQ_MGMT_PORT &>/dev/null; then - echo -e "${GREEN}[OK] RabbitMQ ready${NC}" - return 0 - fi - - sleep 2 - done - - echo -e "${RED}ERROR: RabbitMQ timeout${NC}" - exit 1 -} - -wait_for_api() { - if [ "$START_API" -ne 1 ]; then - return 0 - fi - - echo -e "${YELLOW}Waiting for API service...${NC}" - - local max_attempts=30 - local attempt=0 - - while [ $attempt -lt $max_attempts ]; do - attempt=$((attempt + 1)) - - if curl -s -f http://localhost:8000/health &>/dev/null; then - echo -e "${GREEN}[OK] API service ready${NC}" - return 0 - fi - - sleep 2 - done - - echo -e "${RED}ERROR: API service timeout${NC}" - echo -e "${YELLOW}Checking API container logs:${NC}" - $COMPOSE_CMD -f $COMPOSE_FILE logs --tail=50 api - exit 1 -} - -initialize_database() { - if [ ! 
-f "database/init.sql" ]; then - echo -e "${YELLOW}WARNING: database/init.sql not found${NC}" - return - fi - - echo -e "${CYAN}Initializing database...${NC}" - - # Run init.sql - if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < database/init.sql 2>/dev/null; then - echo -e "${GREEN}[OK] Database schema initialized${NC}" - else - echo -e "${GRAY}Database schema already exists${NC}" - fi - - # Create migrations tracking table if it doesn't exist - $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " - CREATE TABLE IF NOT EXISTS schema_migrations ( - id SERIAL PRIMARY KEY, - migration_name VARCHAR(255) UNIQUE NOT NULL, - applied_at TIMESTAMP DEFAULT NOW() - ); - " 2>/dev/null - - # Run migration scripts - if [ -d "database/migrations" ]; then - local migration_count=0 - for migration_file in database/migrations/*.sql; do - if [ -f "$migration_file" ]; then - local migration_name=$(basename "$migration_file") - - # Check if migration already applied - local already_applied=$($CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -t -c " - SELECT COUNT(*) FROM schema_migrations WHERE migration_name = '$migration_name'; - " 2>/dev/null | tr -d '[:space:]') - - if [ "$already_applied" = "0" ]; then - echo -e "${CYAN}Applying migration: $migration_name${NC}" - - if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < "$migration_file" 2>/dev/null; then - # Record migration as applied - $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " - INSERT INTO schema_migrations (migration_name) VALUES ('$migration_name'); - " 2>/dev/null - echo -e "${GREEN}[OK] Applied: $migration_name${NC}" - migration_count=$((migration_count + 1)) - else - echo -e "${YELLOW}WARNING: Failed to apply $migration_name${NC}" - fi - fi - fi - done - - if [ $migration_count -eq 0 ]; then - echo -e "${GRAY}All migrations already applied${NC}" - else 
- echo -e "${GREEN}[OK] Applied $migration_count migration(s)${NC}" - fi - fi -} - -create_env_file() { - if [ -f ".env" ]; then - echo -e "${GREEN}[OK] .env exists${NC}" - return - fi - - if [ ! -f ".env.example" ]; then - echo -e "${RED}ERROR: .env.example not found${NC}" - exit 1 - fi - - echo -e "${CYAN}Creating .env from template...${NC}" - cp .env.example .env - - # Keep service names for docker-compose (containers communicate via service names) - # No transformation needed - .env.example already has correct service names - - echo -e "${GREEN}[OK] .env created${NC}" -} - -show_summary() { - echo -e "\n${CYAN}==================================================================" - echo -e " Environment Ready!" - echo -e "==================================================================${NC}" - echo "" - echo -e "${GREEN}✓ Services Running:${NC}" - echo -e " PostgreSQL: localhost:$POSTGRES_PORT (with pgvector)" - echo -e " RabbitMQ: localhost:$RABBITMQ_PORT" - echo -e " RabbitMQ UI: http://localhost:$RABBITMQ_MGMT_PORT ($RABBITMQ_USER/$RABBITMQ_PASS)" - - if [ "$START_API" -eq 1 ]; then - echo -e " API: http://localhost:8000" - echo -e " API Docs: http://localhost:8000/docs" - fi - - echo "" - echo -e "${CYAN}💡 Quick Start:${NC}" - echo -e " ${YELLOW}./start-dev-env.sh --full-setup${NC} ${GRAY}# Development mode (default)${NC}" - echo -e " ${YELLOW}./start-dev-env.sh --with-api${NC} ${GRAY}# Include API container${NC}" - echo -e " ${YELLOW}./start-dev-env.sh --prod${NC} ${GRAY}# Production mode${NC}" - echo "" - echo -e "${CYAN}📋 Manual Setup:${NC}" - echo -e " 1. ${YELLOW}${PY} -m venv venv${NC}" - - case "${OS_TYPE}" in - windows-msys) echo -e " ${YELLOW}source venv/Scripts/activate${NC}" ;; - *) echo -e " ${YELLOW}source venv/bin/activate${NC}" ;; - esac - - echo -e " 2. ${YELLOW}${PY} -m pip install -r requirements.txt${NC}" - echo -e " 3. 
${YELLOW}${PY} app.py${NC}" - echo "" - echo -e "${CYAN}🔧 Container Commands:${NC}" - echo -e " ${GRAY}Logs: $COMPOSE_CMD -f $COMPOSE_FILE logs -f${NC}" - echo -e " ${GRAY}Stop: $COMPOSE_CMD -f $COMPOSE_FILE stop${NC}" - echo -e " ${GRAY}Remove: $COMPOSE_CMD -f $COMPOSE_FILE down${NC}" - echo "" - echo -e "${CYAN}==================================================================${NC}" -} - -main() { - create_env_file - stop_existing_containers - start_containers - wait_for_postgres - wait_for_rabbitmq - wait_for_api - initialize_database - show_summary - echo -e "\n${GRAY}Containers running in background. Press Ctrl+C to exit this script.${NC}" -} - -setup_python_environment() { - echo -e "\n${CYAN}==================================================================" - echo -e " Full Setup: Python Environment" - echo -e "==================================================================${NC}" - - if [ -d "venv" ]; then - echo -e "${GRAY}Virtual environment exists${NC}" - else - echo -e "${CYAN}Creating virtual environment...${NC}" - $PY -m venv venv - echo -e "${GREEN}[OK] venv created${NC}" - fi - - echo -e "${CYAN}Activating virtual environment...${NC}" - if [ "${OS_TYPE}" = "windows-msys" ]; then - source venv/Scripts/activate - else - source venv/bin/activate - fi - - echo -e "${CYAN}Installing dependencies (5-10 minutes)...${NC}" - python -m pip install --upgrade pip setuptools wheel - python -m pip install -r requirements.txt - echo -e "${GREEN}[OK] Dependencies installed${NC}" - - echo -e "${CYAN}Creating directories...${NC}" - mkdir -p data/reference_images data/models/clip logs - echo -e "${GREEN}[OK] Directories created${NC}" - - # Download CLIP model using curl - echo -e "${CYAN}Checking CLIP model...${NC}" - if [ ! 
-f "data/models/clip/open_clip_pytorch_model.bin" ]; then - echo -e "${YELLOW}Downloading CLIP model (this may take a while)...${NC}" - curl -L -o data/models/clip/open_clip_pytorch_model.bin \ - ${CLIP_MODEL_URL} || { - echo -e "${YELLOW}WARNING: CLIP model download failed, will download on first run${NC}" - } - - if [ -f "data/models/clip/open_clip_pytorch_model.bin" ]; then - echo -e "${GREEN}[OK] CLIP model downloaded${NC}" - fi - else - echo -e "${GRAY}CLIP model already exists${NC}" - fi - - echo -e "${CYAN}Verifying environment...${NC}" - python -c " -import open_clip, asyncpg, aio_pika, PIL, imagehash -print('✓ All imports successful') -" || { - echo -e "${RED}ERROR: Environment verification failed${NC}" - exit 1 - } - - echo -e "\n${CYAN}==================================================================" - echo -e " Setup Complete!" - echo -e "==================================================================${NC}" - echo -e "\n${GREEN}✓ Next Steps:${NC}" - echo -e "\n${CYAN}Terminal 1 - Worker:${NC}" - [ "${OS_TYPE}" = "windows-msys" ] && echo -e " ${YELLOW}source venv/Scripts/activate${NC}" || echo -e " ${YELLOW}source venv/bin/activate${NC}" - echo -e " ${YELLOW}python app.py${NC}" - echo -e "\n${CYAN}Terminal 2 - API:${NC}" - [ "${OS_TYPE}" = "windows-msys" ] && echo -e " ${YELLOW}source venv/Scripts/activate${NC}" || echo -e " ${YELLOW}source venv/bin/activate${NC}" - echo -e " ${YELLOW}cd api && uvicorn api:app --reload --host 0.0.0.0 --port 8000${NC}" - echo -e "\n${CYAN}API Docs:${NC} ${YELLOW}http://localhost:8000/docs${NC}" - echo "" -} - -main - -if [ "$FULL_SETUP" -eq 1 ]; then - setup_python_environment -fi +#!/usr/bin/env bash +# Local Development Startup Script +# Starts PostgreSQL and RabbitMQ containers using docker-compose + +set -e +FULL_SETUP=0 +START_API=0 +COMPOSE_FILE="docker-compose-dev.yml" + +while [[ "$#" -gt 0 ]]; do + case "$1" in + --full-setup) FULL_SETUP=1; shift ;; + --with-api) START_API=1; shift ;; + --prod) 
COMPOSE_FILE="docker-compose-prod.yml"; shift ;; + --dev) COMPOSE_FILE="docker-compose-dev.yml"; shift ;; + *) break ;; + esac +done + +# Detect platform +OS_TYPE="unknown" +UNAME_OUT=$(uname -s 2>/dev/null || true) +case "${UNAME_OUT}" in + MINGW*|MSYS*|CYGWIN*) OS_TYPE="windows-msys" ;; + Darwin) OS_TYPE="macos" ;; + Linux) OS_TYPE="linux" ;; + *) OS_TYPE="unix" ;; +esac + +# Detect Python executable +PY="" +if [ "${OS_TYPE}" = "windows-msys" ]; then + PY_CANDIDATES=("py" "python3" "python" "python.exe") +else + PY_CANDIDATES=("python3" "python" "py" "python.exe") +fi + +for candidate in "${PY_CANDIDATES[@]}"; do + if [ "$candidate" = "py" ]; then + if command -v py >/dev/null 2>&1; then + if py -3 -c "import sys; sys.stdout.write('Python found!!\n')" 2>/dev/null; then + PY='py -3' + break + fi + fi + continue + fi + + candidate_path=$(command -v "$candidate" 2>/dev/null || true) + if [ -n "$candidate_path" ]; then + case "$candidate_path" in + *WindowsApps*|*windowsapps*) continue ;; + esac + + if "$candidate" -c "import sys; sys.stdout.write('ok')" 2>/dev/null; then + PY="$candidate" + break + fi + fi +done + +if [ -z "$PY" ]; then + echo "ERROR: No Python 3.10+ found. Install Python or ensure it's in PATH." >&2 + exit 1 +fi + +# Detect container runtime (Podman or Docker) +CONTAINER_CMD="" +COMPOSE_CMD="" +if command -v podman &> /dev/null; then + CONTAINER_CMD="podman" + if command -v podman-compose &> /dev/null; then + COMPOSE_CMD="podman-compose" + else + echo "ERROR: podman-compose not found. Install it:" >&2 + echo " pip install podman-compose" >&2 + exit 1 + fi +elif command -v docker &> /dev/null; then + CONTAINER_CMD="docker" + if command -v docker-compose &> /dev/null; then + COMPOSE_CMD="docker-compose" + elif docker compose version &> /dev/null; then + COMPOSE_CMD="docker compose" + else + echo "ERROR: docker-compose not found. 
Install it:" >&2 + echo " https://docs.docker.com/compose/install/" >&2 + exit 1 + fi +else + echo "ERROR: Neither Podman nor Docker found. Install one of them:" >&2 + echo " Podman: https://podman.io/getting-started/installation" >&2 + echo " Docker: https://docs.docker.com/get-docker/" >&2 + exit 1 +fi + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +GRAY='\033[0;37m' +NC='\033[0m' + +echo -e "${CYAN}==================================================================" +echo -e " MentorMe Plagiarism Checker - Local Development Setup" +echo -e " Container Runtime: ${CONTAINER_CMD}" +echo -e " Compose File: ${COMPOSE_FILE}" +echo -e "==================================================================${NC}" +echo "" + +# Validate compose file exists +if [ ! -f "$COMPOSE_FILE" ]; then + echo -e "${RED}ERROR: $COMPOSE_FILE not found${NC}" + exit 1 +fi + +# Load configuration from .env if it exists +if [ -f ".env" ]; then + set -a + source <(grep -v '^#' .env | grep -v '^$' | sed 's/\r$//') + set +a +fi + +# Configuration (with defaults) +POSTGRES_CONTAINER="mentorme-plagiarism-postgres" +RABBITMQ_CONTAINER="mentorme-plagiarism-rabbitmq" +POSTGRES_PORT="${POSTGRES_PORT:-5432}" +RABBITMQ_PORT="${RABBITMQ_PORT:-5672}" +RABBITMQ_MGMT_PORT="${RABBITMQ_MANAGEMENT_PORT:-15672}" +POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-postgres}" +POSTGRES_DB="${POSTGRES_DB:-plagiarism_db}" +POSTGRES_USER="${POSTGRES_USER:-postgres}" +RABBITMQ_USER="${RABBITMQ_USER:-admin}" +RABBITMQ_PASS="${RABBITMQ_PASS:-admin123}" +CLIP_MODEL_URL="${CLIP_MODEL_URL:-https://huggingface.co/laion/CLIP-ViT-L-14-laion2B-s32B-b82K/resolve/main/open_clip_pytorch_model.bin}" + +stop_existing_containers() { + local containers_exist=false + + # Check if compose stack is running + if $COMPOSE_CMD -f $COMPOSE_FILE ps 2>/dev/null | grep -q "Up\|running"; then + containers_exist=true + fi + + if [ "$containers_exist" = true ]; then + echo -e "${YELLOW}Existing containers 
found:${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE ps + echo "" + echo -e "${YELLOW}This will stop and remove existing containers.${NC}" + read -p "Continue? (y/N): " -n 1 -r + echo + + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo -e "${RED}Aborted by user${NC}" + # exit 0 + else + echo -e "${CYAN}Stopping existing containers...${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE down + fi + fi + + echo -e "${GREEN}[OK] Ready to start containers${NC}" +} + +start_containers() { + echo -e "${CYAN}Starting containers with $COMPOSE_FILE...${NC}" + + # Determine which services to start + #local services="postgres rabbitmq pgadmin plagiarism-checker" + local services="postgres rabbitmq plagiarism-checker" + + if [ "$START_API" -eq 1 ]; then + services="$services api" + echo -e "${CYAN}Including API service${NC}" + + # Force rebuild API container to ensure it uses Dockerfile.api (not Dockerfile) + echo -e "${YELLOW}Rebuilding API container with Dockerfile.api...${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE build --no-cache api + + if [ $? -ne 0 ]; then + echo -e "${RED}ERROR: Failed to build API container${NC}" + exit 1 + fi + echo -e "${GREEN}[OK] API container rebuilt${NC}" + fi + + $COMPOSE_CMD -f $COMPOSE_FILE up -d $services + + if [ $? 
-ne 0 ]; then + echo -e "${RED}ERROR: Failed to start containers${NC}" + exit 1 + fi + + echo -e "${GREEN}[OK] Containers started${NC}" +} + +wait_for_postgres() { + echo -e "${YELLOW}Waiting for PostgreSQL...${NC}" + + local max_attempts=30 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + + if $CONTAINER_CMD exec $POSTGRES_CONTAINER pg_isready -U $POSTGRES_USER &>/dev/null; then + echo -e "${GREEN}[OK] PostgreSQL ready${NC}" + return 0 + fi + + sleep 2 + done + + echo -e "${RED}ERROR: PostgreSQL timeout${NC}" + exit 1 +} + +wait_for_rabbitmq() { + echo -e "${YELLOW}Waiting for RabbitMQ...${NC}" + + local max_attempts=30 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + + if curl -s -f http://localhost:$RABBITMQ_MGMT_PORT &>/dev/null; then + echo -e "${GREEN}[OK] RabbitMQ ready${NC}" + return 0 + fi + + sleep 2 + done + + echo -e "${RED}ERROR: RabbitMQ timeout${NC}" + exit 1 +} + +wait_for_api() { + if [ "$START_API" -ne 1 ]; then + return 0 + fi + + echo -e "${YELLOW}Waiting for API service...${NC}" + + local max_attempts=30 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + + if curl -s -f http://localhost:8000/health &>/dev/null; then + echo -e "${GREEN}[OK] API service ready${NC}" + return 0 + fi + + sleep 2 + done + + echo -e "${RED}ERROR: API service timeout${NC}" + echo -e "${YELLOW}Checking API container logs:${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE logs --tail=50 api + exit 1 +} + +initialize_database() { + if [ ! 
-f "database/init.sql" ]; then + echo -e "${YELLOW}WARNING: database/init.sql not found${NC}" + return + fi + + echo -e "${CYAN}Initializing database...${NC}" + + # Run init.sql + if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < database/init.sql 2>/dev/null; then + echo -e "${GREEN}[OK] Database schema initialized${NC}" + else + echo -e "${GRAY}Database schema already exists${NC}" + fi + + # Create migrations tracking table if it doesn't exist + $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " + CREATE TABLE IF NOT EXISTS schema_migrations ( + id SERIAL PRIMARY KEY, + migration_name VARCHAR(255) UNIQUE NOT NULL, + applied_at TIMESTAMP DEFAULT NOW() + ); + " 2>/dev/null + + # Run migration scripts + if [ -d "database/migrations" ]; then + local migration_count=0 + for migration_file in database/migrations/*.sql; do + if [ -f "$migration_file" ]; then + local migration_name=$(basename "$migration_file") + + # Check if migration already applied + local already_applied=$($CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -t -c " + SELECT COUNT(*) FROM schema_migrations WHERE migration_name = '$migration_name'; + " 2>/dev/null | tr -d '[:space:]') + + if [ "$already_applied" = "0" ]; then + echo -e "${CYAN}Applying migration: $migration_name${NC}" + + if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < "$migration_file" 2>/dev/null; then + # Record migration as applied + $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " + INSERT INTO schema_migrations (migration_name) VALUES ('$migration_name'); + " 2>/dev/null + echo -e "${GREEN}[OK] Applied: $migration_name${NC}" + migration_count=$((migration_count + 1)) + else + echo -e "${YELLOW}WARNING: Failed to apply $migration_name${NC}" + fi + fi + fi + done + + if [ $migration_count -eq 0 ]; then + echo -e "${GRAY}All migrations already applied${NC}" + else 
+ echo -e "${GREEN}[OK] Applied $migration_count migration(s)${NC}" + fi + fi +} + +create_env_file() { + if [ -f ".env" ]; then + echo -e "${GREEN}[OK] .env exists${NC}" + return + fi + + if [ ! -f ".env.example" ]; then + echo -e "${RED}ERROR: .env.example not found${NC}" + exit 1 + fi + + echo -e "${CYAN}Creating .env from template...${NC}" + cp .env.example .env + + # Keep service names for docker-compose (containers communicate via service names) + # No transformation needed - .env.example already has correct service names + + echo -e "${GREEN}[OK] .env created${NC}" +} + +show_summary() { + echo -e "\n${CYAN}==================================================================" + echo -e " Environment Ready!" + echo -e "==================================================================${NC}" + echo "" + echo -e "${GREEN}✓ Services Running:${NC}" + echo -e " PostgreSQL: localhost:$POSTGRES_PORT (with pgvector)" + echo -e " RabbitMQ: localhost:$RABBITMQ_PORT" + echo -e " RabbitMQ UI: http://localhost:$RABBITMQ_MGMT_PORT ($RABBITMQ_USER/$RABBITMQ_PASS)" + + if [ "$START_API" -eq 1 ]; then + echo -e " API: http://localhost:8000" + echo -e " API Docs: http://localhost:8000/docs" + fi + + echo "" + echo -e "${CYAN}💡 Quick Start:${NC}" + echo -e " ${YELLOW}./start-dev-env.sh --full-setup${NC} ${GRAY}# Development mode (default)${NC}" + echo -e " ${YELLOW}./start-dev-env.sh --with-api${NC} ${GRAY}# Include API container${NC}" + echo -e " ${YELLOW}./start-dev-env.sh --prod${NC} ${GRAY}# Production mode${NC}" + echo "" + echo -e "${CYAN}📋 Manual Setup:${NC}" + echo -e " 1. ${YELLOW}${PY} -m venv venv${NC}" + + case "${OS_TYPE}" in + windows-msys) echo -e " ${YELLOW}source venv/Scripts/activate${NC}" ;; + *) echo -e " ${YELLOW}source venv/bin/activate${NC}" ;; + esac + + echo -e " 2. ${YELLOW}${PY} -m pip install -r requirements.txt${NC}" + echo -e " 3. 
${YELLOW}${PY} app.py${NC}" + echo "" + echo -e "${CYAN}🔧 Container Commands:${NC}" + echo -e " ${GRAY}Logs: $COMPOSE_CMD -f $COMPOSE_FILE logs -f${NC}" + echo -e " ${GRAY}Stop: $COMPOSE_CMD -f $COMPOSE_FILE stop${NC}" + echo -e " ${GRAY}Remove: $COMPOSE_CMD -f $COMPOSE_FILE down${NC}" + echo "" + echo -e "${CYAN}==================================================================${NC}" +} + +main() { + create_env_file + stop_existing_containers + start_containers + wait_for_postgres + wait_for_rabbitmq + wait_for_api + initialize_database + show_summary + echo -e "\n${GRAY}Containers running in background. Press Ctrl+C to exit this script.${NC}" +} + +setup_python_environment() { + echo -e "\n${CYAN}==================================================================" + echo -e " Full Setup: Python Environment" + echo -e "==================================================================${NC}" + + if [ -d "venv" ]; then + echo -e "${GRAY}Virtual environment exists${NC}" + else + echo -e "${CYAN}Creating virtual environment...${NC}" + $PY -m venv venv + echo -e "${GREEN}[OK] venv created${NC}" + fi + + echo -e "${CYAN}Activating virtual environment...${NC}" + if [ "${OS_TYPE}" = "windows-msys" ]; then + source venv/Scripts/activate + else + source venv/bin/activate + fi + + echo -e "${CYAN}Installing dependencies (5-10 minutes)...${NC}" + python -m pip install --upgrade pip setuptools wheel + python -m pip install -r requirements.txt + echo -e "${GREEN}[OK] Dependencies installed${NC}" + + echo -e "${CYAN}Creating directories...${NC}" + mkdir -p data/reference_images data/models/clip logs + echo -e "${GREEN}[OK] Directories created${NC}" + + # Download CLIP model using curl + echo -e "${CYAN}Checking CLIP model...${NC}" + if [ ! 
-f "data/models/clip/open_clip_pytorch_model.bin" ]; then + echo -e "${YELLOW}Downloading CLIP model (this may take a while)...${NC}" + curl -L -o data/models/clip/open_clip_pytorch_model.bin \ + ${CLIP_MODEL_URL} || { + echo -e "${YELLOW}WARNING: CLIP model download failed, will download on first run${NC}" + } + + if [ -f "data/models/clip/open_clip_pytorch_model.bin" ]; then + echo -e "${GREEN}[OK] CLIP model downloaded${NC}" + fi + else + echo -e "${GRAY}CLIP model already exists${NC}" + fi + + echo -e "${CYAN}Verifying environment...${NC}" + python -c " +import open_clip, asyncpg, aio_pika, PIL, imagehash +print('✓ All imports successful') +" || { + echo -e "${RED}ERROR: Environment verification failed${NC}" + exit 1 + } + + echo -e "\n${CYAN}==================================================================" + echo -e " Setup Complete!" + echo -e "==================================================================${NC}" + echo -e "\n${GREEN}✓ Next Steps:${NC}" + echo -e "\n${CYAN}Terminal 1 - Worker:${NC}" + [ "${OS_TYPE}" = "windows-msys" ] && echo -e " ${YELLOW}source venv/Scripts/activate${NC}" || echo -e " ${YELLOW}source venv/bin/activate${NC}" + echo -e " ${YELLOW}python app.py${NC}" + echo -e "\n${CYAN}Terminal 2 - API:${NC}" + [ "${OS_TYPE}" = "windows-msys" ] && echo -e " ${YELLOW}source venv/Scripts/activate${NC}" || echo -e " ${YELLOW}source venv/bin/activate${NC}" + echo -e " ${YELLOW}cd api && uvicorn api:app --reload --host 0.0.0.0 --port 8000${NC}" + echo -e "\n${CYAN}API Docs:${NC} ${YELLOW}http://localhost:8000/docs${NC}" + echo "" +} + +main + +if [ "$FULL_SETUP" -eq 1 ]; then + setup_python_environment +fi From e87d4fb7c418b29ec4fbffeea5385b307c795bd0 Mon Sep 17 00:00:00 2001 From: Manu Date: Fri, 2 Jan 2026 10:28:09 +0530 Subject: [PATCH 02/13] plg check against reference images --- .gitignore | 2 + config/config.py | 2 +- database/init.sql | 4 +- image_worker/assigment_ref_images.py | 347 +++++++++++++++++++++++++++ image_worker/worker.py | 
87 ++++++- processors/image_processor.py | 3 + 6 files changed, 438 insertions(+), 7 deletions(-) create mode 100644 image_worker/assigment_ref_images.py diff --git a/.gitignore b/.gitignore index 3e1e6d7..897c762 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +scratch* + # Python __pycache__/ *.py[cod] diff --git a/config/config.py b/config/config.py index a9d9572..63060ff 100644 --- a/config/config.py +++ b/config/config.py @@ -93,7 +93,7 @@ class DetectionConfig(BaseSettings): exact_dup_threshold: float = Field(default=0.95, env="EXACT_DUPLICATE_THRESHOLD") near_dup_threshold: float = Field(default=0.90, env="NEAR_DUPLICATE_THRESHOLD") - semantic_threshold: float = Field(default=0.80, env="SEMANTIC_MATCH_THRESHOLD") + semantic_threshold: float = Field(default=0.70, env="SEMANTIC_MATCH_THRESHOLD") # Hash matching thresholds (Hamming distance, 0-64 bits) hash_threshold: int = Field(default=8, env="HASH_MATCH_THRESHOLD") diff --git a/database/init.sql b/database/init.sql index 0448456..265a61b 100644 --- a/database/init.sql +++ b/database/init.sql @@ -72,12 +72,12 @@ USING hnsw (clip_embedding vector_ip_ops); -- Reference images corpus CREATE TABLE IF NOT EXISTS reference_images ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), - reference_id VARCHAR(100) UNIQUE NOT NULL, + reference_id VARCHAR(200) UNIQUE NOT NULL, -- will be used as assignment id when fetching references from assignments image_path TEXT NOT NULL, phash VARCHAR(64) NOT NULL, dhash VARCHAR(64) NOT NULL, ahash VARCHAR(64) NOT NULL, - category VARCHAR(100), + category VARCHAR(200), description TEXT, source VARCHAR(200), faiss_index_position INTEGER, diff --git a/image_worker/assigment_ref_images.py b/image_worker/assigment_ref_images.py new file mode 100644 index 0000000..10617f4 --- /dev/null +++ b/image_worker/assigment_ref_images.py @@ -0,0 +1,347 @@ +import base64 +import io +import os +from PIL import Image +import requests +from typing import List, Dict, Optional +from dotenv import 
load_dotenv +import asyncpg +from datetime import datetime +import logging + +load_dotenv() + +logger = logging.getLogger(__name__) + +# Environment variables +ASSIGNMENT_CACHE_DAYS = int(os.getenv("ASSIGNMENT_CACHE_DAYS", "2")) +ENABLE_CACHE = os.getenv("ENABLE_CACHE", "true").lower() == "true" +PURGE_CACHE = os.getenv("PURGE_CACHE", "false").lower() == "true" + +# Database configuration +DB_CONFIG = { + "host": os.getenv("POSTGRES_HOST", "localhost"), + "port": int(os.getenv("POSTGRES_PORT", 5432)), + "database": os.getenv("POSTGRES_DB", "plagiarism_db"), + "user": os.getenv("POSTGRES_USER", "postgres"), + "password": os.getenv("POSTGRES_PASSWORD", "postgres"), +} + + +async def get_db_connection(): + """Create async database connection.""" + conn_string = ( + f"postgresql://{DB_CONFIG['user']}:{DB_CONFIG['password']}" + f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}" + ) + return await asyncpg.connect(conn_string) + + +async def cleanup_cache(): + """Delete cached assignment reference images older than ASSIGNMENT_CACHE_DAYS.""" + try: + conn = await get_db_connection() + try: + # cutoff_date = datetime.utcnow() - timedelta(days=ASSIGNMENT_CACHE_DAYS) + + # Delete old assignment caches (reference_ids starting with "ASSIGN-") + result = await conn.execute( + """ + DELETE FROM reference_images + WHERE reference_id LIKE 'ASSIGN-%' + """ + ) + + deleted_count = int(result.split()[-1]) if result else 0 + if deleted_count > 0: + logger.info(f"Cleaned up {deleted_count} cached assignment reference images") + + finally: + await conn.close() + + except Exception as e: + logger.error(f"Cache cleanup failed: {e}") + + +async def get_cached_assignment(assignment_id: str) -> Optional[List[Dict]]: + """ + Retrieve cached reference images for an assignment from database. 
+ + Args: + assignment_id: Assignment identifier + + Returns: + List of reference image dicts with precomputed hashes (no embeddings), or None if not cached + """ + try: + conn = await get_db_connection() + try: + # Query for cached images with this assignment_id (no clip_embedding in SELECT) + rows = await conn.fetch( + """ + SELECT reference_id, image_path, phash, dhash, ahash, created_at + FROM reference_images + WHERE reference_id LIKE $1 + ORDER BY reference_id + """, + f"ASSIGN-{assignment_id}-%" + ) + + if not rows: + logger.info(f"No cached reference images found for assignment: {assignment_id}") + return None + + # Check if cache is still valid + oldest_created = min(row['created_at'] for row in rows) + age_days = (datetime.utcnow() - oldest_created).days + + if age_days > ASSIGNMENT_CACHE_DAYS: + logger.info(f"Cache expired for assignment {assignment_id} (age: {age_days} days)") + return None + + logger.info(f"Retrieved {len(rows)} cached reference images for assignment: {assignment_id}") + + # Return just the hashes and name (no embeddings) + images = [] + for row in rows: + images.append({ + "name": row['reference_id'] + row['image_path'], # Image name stored in image_path + "phash": row['phash'], + "dhash": row['dhash'], + "ahash": row['ahash'] + }) + + return images + + finally: + await conn.close() + + except Exception as e: + logger.error(f"Failed to retrieve cached assignment: {e}") + return None + + +async def save_to_cache(assignment_id: str, images: List[Dict]): + """ + Save assignment reference images (hashes + embeddings) to database cache. 
+ + Args: + assignment_id: Assignment identifier + images: List of image dicts with "name", "phash", "dhash", "ahash", and optionally "embedding" + """ + try: + conn = await get_db_connection() + try: + cached_count = 0 + + for idx, img_data in enumerate(images): + try: + # Create unique reference_id for this assignment's reference image + reference_id = f"ASSIGN-{assignment_id}-{idx:03d}" + image_name = img_data.get("name", f"ref_{idx}") + + # Check if hashes are precomputed + if "phash" not in img_data or "dhash" not in img_data or "ahash" not in img_data: + logger.warning(f"Hashes not precomputed for image {idx}, skipping cache") + continue + + # Extract embedding if present + embedding = img_data.get("embedding") + + if embedding is not None: + # Convert numpy array to pgvector format + embedding_str = '[' + ','.join(map(str, embedding.tolist())) + ']' + clip_generated = True + else: + embedding_str = None + clip_generated = False + + # Insert into database with embedding + await conn.execute( + """ + INSERT INTO reference_images + (reference_id, image_path, phash, dhash, ahash, category, description, source, + clip_embedding_generated, clip_embedding) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10::vector) + ON CONFLICT (reference_id) DO UPDATE SET + image_path = EXCLUDED.image_path, + phash = EXCLUDED.phash, + dhash = EXCLUDED.dhash, + ahash = EXCLUDED.ahash, + clip_embedding_generated = EXCLUDED.clip_embedding_generated, + clip_embedding = EXCLUDED.clip_embedding, + updated_at = NOW() + """, + reference_id, + image_name, + img_data["phash"], + img_data["dhash"], + img_data["ahash"], + "assignment_cache", + f"Reference image from {image_name}", + f"assignment_{assignment_id}", + clip_generated, + embedding_str + ) + + cached_count += 1 + + except Exception as img_err: + logger.error(f"Failed to cache image {idx} for assignment {assignment_id}: {img_err}") + continue + + logger.info(f"Cached {cached_count}/{len(images)} reference images for assignment: 
{assignment_id}") + + finally: + await conn.close() + + except Exception as e: + logger.error(f"Failed to save to cache: {e}") + + +async def get_reference_images( + assignment_id: str, + clip_handler=None, + hash_handler=None +) -> Optional[List[Dict]]: + """ + Fetch reference images for an assignment from TAP LMS API with caching support. + + Args: + assignment_id: Assignment identifier + clip_handler: CLIPHandler instance from worker (optional) + hash_handler: HashHandler instance from worker (optional) + + Returns: + List of reference image dictionaries with precomputed hashes (no embeddings in return) + """ + try: + # Cleanup old cache if enabled + if PURGE_CACHE: + await cleanup_cache() + + # Check cache first if enabled + if ENABLE_CACHE: + cached_images = await get_cached_assignment(assignment_id) + if cached_images is not None: + logger.info(f"Using cached reference images for assignment: {assignment_id}") + return cached_images + + # Fetch from API + logger.info(f"Fetching reference images from API for assignment: {assignment_id}") + images = await fetch_from_api(assignment_id, clip_handler, hash_handler) + + # Save to cache if enabled (embeddings will be saved to DB but not returned) + if ENABLE_CACHE and images: + await save_to_cache(assignment_id, images) + + # Remove embeddings from return object + if images: + for img in images: + img.pop("embedding", None) + + return images + + except Exception as e: + logger.error(f"Error fetching reference images: {e}") + return None + + +async def fetch_from_api( + assignment_id: str, + clip_handler=None, + hash_handler=None +) -> Optional[List[Dict]]: + """ + Fetch reference images for an assignment from TAP LMS API and compute hashes + embeddings. 
+ + Args: + assignment_id: Assignment identifier + clip_handler: CLIPHandler instance from worker (optional) + hash_handler: HashHandler instance from worker (optional) + + Returns: + List of reference image dictionaries with precomputed hashes and embeddings + """ + api_key = os.getenv("FRAPPE_API_KEY") + api_secret = os.getenv("FRAPPE_API_SECRET") + base_url = os.getenv("FRAPPE_API_BASE_URL") + + if not all([api_key, api_secret, base_url]): + logger.error("Missing API configuration: FRAPPE_API_KEY, FRAPPE_API_SECRET, or FRAPPE_API_BASE_URL") + return None + + if hash_handler is None: + logger.error("No hash_handler provided, cannot compute hashes") + return None + + assignment_context_endpoint = "api/method/tap_lms.imgana.submission.get_assignment_context" + + headers = { + "Content-Type": "application/json", + "Authorization": f"token {api_key}:{api_secret}" + } + + api_url = f"{base_url.rstrip('/')}/{assignment_context_endpoint.lstrip('/')}" + + try: + # Use synchronous requests in async context (consider aiohttp for true async) + response = requests.post( + api_url, + headers=headers, + json={"assignment_id": assignment_id}, + timeout=30 + ) + + response.raise_for_status() + data = response.json() + + reference_images = data.get("message", {}).get("assignment", {}).get("reference_images", []) + + if not reference_images: + logger.warning(f"No reference images found for assignment: {assignment_id}") + return [] + + # Process images: decode, compute hashes and embeddings, then discard PIL objects + processed_images = [] + for image in reference_images: + try: + decoded_bytes = base64.b64decode(image["content"]) + image_obj = Image.open(io.BytesIO(decoded_bytes)) + + # Compute hashes using passed handler + hashes = hash_handler.compute_hashes(image_obj) + + # Generate CLIP embedding if handler provided + embedding = None + if clip_handler is not None: + try: + embedding = clip_handler.generate_embedding(image_obj) + except Exception as embed_err: + 
logger.error(f"Failed to generate embedding for {image.get('name', 'unknown')}: {embed_err}") + + # Close PIL Image - we don't need it anymore + image_obj.close() + + # Store hashes and embedding (embedding will be saved to DB but removed before return) + processed_images.append({ + "name": image.get("name", f"ref_{len(processed_images)}"), + "phash": hashes["phash"], + "dhash": hashes["dhash"], + "ahash": hashes["ahash"], + "embedding": embedding # Temporary, for save_to_cache + }) + + except Exception as img_err: + logger.error(f"Failed to process image {image.get('name', 'unknown')}: {img_err}") + continue + + logger.info(f"Fetched and processed {len(processed_images)} reference images from API for assignment: {assignment_id}") + return processed_images + + except requests.exceptions.RequestException as e: + logger.error(f"API request failed for assignment {assignment_id}: {e}") + return None + except Exception as e: + logger.error(f"Failed to fetch from API: {e}") + return None \ No newline at end of file diff --git a/image_worker/worker.py b/image_worker/worker.py index 0798bef..012d2e9 100644 --- a/image_worker/worker.py +++ b/image_worker/worker.py @@ -11,6 +11,7 @@ from typing import Dict, Optional, Tuple, Any from urllib.parse import urlparse from dotenv import load_dotenv +from image_worker.assigment_ref_images import get_reference_images from config.config import config from database.db_manager import DatabaseManager @@ -291,7 +292,75 @@ async def _async_compare_self(self, hashes, prev): ) return comparison, prev - async def check_hash_match( + async def check_assignment_reference_hash_match( + self, hashes: dict, assignment_id: str + ) -> Tuple[bool, Optional[str], Optional[float], Optional[str]]: + """ + Check if submission matches any reference image via perceptual hash comparison. + + Uses three hash types (pHash, dHash, aHash) for robust duplicate detection. 
+ + Args: + hashes: Dict containing 'phash', 'dhash', 'ahash' hex strings + + Returns: + Tuple of (is_match, reference_id, similarity_score, image_url) + - is_match: True if hash match found + - reference_id: UUID of matched reference (or None) + - similarity_score: 0.0-1.0 similarity score (or None) + - image_url: URL of matched reference image (or None) + + Raises: + Exception: If database query fails + """ + try: + + references = await get_reference_images(assignment_id, self.clip_handler, self.hash_handler) + if not references: + return False, None, None, None + + # for ref_image in references: + # if ref_image["content"] is not None: + # hashes = self.hash_handler.compute_hashes(ref_image["content"]) + # ref_image['phash'] = hashes['phash'] + # ref_image['dhash'] = hashes['dhash'] + # ref_image['ahash'] = hashes['ahash'] + + tasks = [self._async_compare_ref(hashes, ref) for ref in references] + results = await asyncio.gather(*tasks) + + + best_match = None + best_score = 999 + best_comparison = None + for comparison, ref in results: + if comparison["is_match"] and comparison["avg_distance"] < best_score: + best_score = comparison["avg_distance"] + best_match = ref + best_comparison = comparison + + if best_match and best_comparison: + print("#"*70) + print("Best match found:",best_match["name"]) + print("#"*70) + logger.info("Assignment reference match found") + similarity = 1 - (best_score / 64.0) + return ( + True, + str(best_match["name"]), + similarity, + str(best_match["name"]), + ) + else: + logger.info("No assignment reference match found") + return False, None, None, None + + except Exception as e: + logger.error(f"Hash check failed: {e}", exc_info=True) + raise + + + async def check_db_reference_hash_match( self, hashes: dict ) -> Tuple[bool, Optional[str], Optional[float], Optional[str]]: """ @@ -375,6 +444,11 @@ async def check_clip_match( if not results: return None, 0.0, None + + print("#"*70) + for ref_id, sim, meta in results: + print(f" Ref ID: 
{ref_id}, Similarity: {sim:.4f}, Meta: {meta}") + print("#"*70) matches = [ (ref_id, sim, meta) @@ -540,7 +614,8 @@ async def process_submission(self, data: Dict[str, Any]) -> Optional[str]: student_id, self_result.get("first_submission_date_for_image", None), ) - hash_check_result = await self.check_hash_match(hashes) + # hash_check_result = await self.check_db_reference_hash_match(hashes) + hash_check_result = await self.check_assignment_reference_hash_match(hashes,assign_id) ( hash_match, @@ -1020,6 +1095,9 @@ def _create_stock_image_result( "student_id": student_id, "assignment_id": assign_id, "image_url": image_url, + "is_ai_generated": False, + "ai_detection_source": "None", + "ai_confidence": 0.0, "is_plagiarized": True, "similarity_score": 1.0, "match_type": "stock_image", @@ -1290,8 +1368,9 @@ def format_results( ) message["ai_confidence"] = plagiarism_status.get("ai_confidence", 0.0) - payload_preview = json.dumps(message, indent=2)[:2000] - logger.info(f"Result payload preview (2000 chars):\n{payload_preview}...") + # payload_preview = json.dumps(message, indent=2)[:2000] + # logger.info(f"Result payload preview (2000 chars):\n{payload_preview}...") + return json.dumps(message) diff --git a/processors/image_processor.py b/processors/image_processor.py index 653a66c..02f0035 100644 --- a/processors/image_processor.py +++ b/processors/image_processor.py @@ -65,6 +65,9 @@ async def process(self, data: dict) -> dict: ) return {"error": "Invalid JSON response from worker"} + logger.info("Result payload ") + logger.info(json.dumps(result, indent=2)) + logger.info( f"Successfully processed submission: {data.get('submission_id')}" ) From 2b7ca67e60b9a3fd58d60a4f4a61066d8237ff15 Mon Sep 17 00:00:00 2001 From: manua-glitch Date: Fri, 2 Jan 2026 10:32:25 +0530 Subject: [PATCH 03/13] Delete .env.prod --- .env.prod | 119 ------------------------------------------------------ 1 file changed, 119 deletions(-) delete mode 100644 .env.prod diff --git a/.env.prod 
b/.env.prod deleted file mode 100644 index 101b48d..0000000 --- a/.env.prod +++ /dev/null @@ -1,119 +0,0 @@ -# RABBITMQ CONFIGURATION -RABBITMQ_HOST=armadillo.rmq.cloudamqp.com -RABBITMQ_PORT=5672 -RABBITMQ_USER=fzdqidte -RABBITMQ_PASS=0SMrDogBVcWUcu9brWwp2QhET_kArl59 -RABBITMQ_VHOST=fzdqidte -RABBITMQ_MANAGEMENT_PORT=15672 -RABBITMQ_PREFETCH_COUNT=1 - -# Message retry configuration -# Maximum number of retries before sending to DLQ (prevents poison messages) -MAX_RETRIES=3 - -# Queue Names -SUBMISSION_QUEUE=plagiarism_submissions -FEEDBACK_QUEUE=plagiarism_feedback -# Dead Letter Queue (optional - leave empty to disable) -DEAD_LETTER_QUEUE=plagiarism_failed_submissions - -# POSTGRESQL CONFIGURATION -POSTGRES_HOST=db.example.com -POSTGRES_PORT=5432 -POSTGRES_DB=plagiarism_db -POSTGRES_USER=postgres -POSTGRES_PASSWORD=postgres - -# PGADMIN CONFIGURATION (Optional - for development only) -PGADMIN_EMAIL=admin@admin.com -PGADMIN_PASSWORD=admin123 - -# Connection Pool -POSTGRES_POOL_SIZE=10 -POSTGRES_MAX_OVERFLOW=20 - -# PLAGIARISM DETECTION THRESHOLDS -EXACT_DUPLICATE_THRESHOLD=0.95 -NEAR_DUPLICATE_THRESHOLD=0.90 -SEMANTIC_MATCH_THRESHOLD=0.80 - -# ==== PRODUCTION: Uncomment below for 7-day window ==== -RESUBMISSION_WINDOW_DAYS=14 - -# Hash comparison threshold (Hamming distance) -HASH_MATCH_THRESHOLD=10 - -# IMAGE PROCESSING -# Maximum image size in MB -MAX_IMAGE_SIZE_MB=10 - -# Image download timeout in seconds -IMAGE_DOWNLOAD_TIMEOUT=30 - -# Image validation thresholds -# Min variance to detect blank images (lower = more strict) -IMAGE_MIN_VARIANCE=5.0 -# Min unique colors required -IMAGE_MIN_UNIQUE_COLORS=10 -# Max ratio of dominant color (higher = more permissive) -IMAGE_MAX_SOLID_COLOR_RATIO=0.95 - -# CLIP Model Configuration -CLIP_MODEL=ViT-L/14 -CLIP_DEVICE=cpu -CLIP_PRETRAINED=laion2B-s32B-b82K - -# Local Model Path (Optional - use pre-downloaded models) -# If set, the system will load the model from this path instead of downloading from HuggingFace -# 
Example: CLIP_LOCAL_MODEL_PATH=./models/clip/open_clip_pytorch_model.bin -# Download models from: https://huggingface.co/laion/CLIP-ViT-L-14-laion2B-s32B-b82K -CLIP_LOCAL_MODEL_PATH=/app/models/clip/open_clip_pytorch_model.bin - -# Disable SSL verification for HuggingFace downloads (for corporate proxy/self-signed certs) -# Set to "true" only if you encounter SSL certificate errors -DISABLE_SSL_VERIFY=true -PYTHONHTTPSVERIFY=0 - -# VECTOR SEARCH CONFIGURATION -# Use pgvector (PostgreSQL) or FAISS for vector similarity search -USE_PGVECTOR=true - -# FAISS Configuration -FAISS_INDEX_PATH=/app/data/faiss_index.bin -FAISS_METADATA_PATH=/app/data/faiss_metadata.json -FAISS_DIMENSION=768 -FAISS_TOP_K=4 # Number of top candidates to retrieve from FAISS search - -# STORAGE PATHS -# Reference images directory -REFERENCE_IMAGES_DIR=./data/reference_images - -# Temporary storage for downloaded submissions -TEMP_IMAGES_DIR=./data/temp_images - -# Logs directory -#LOGS_DIR=./logs - -# APPLICATION SETTINGS -LOG_LEVEL=INFO - -# Worker concurrency (number of threads) -#WORKER_THREADS=4 - -# Enable performance metrics(ignore) -#ENABLE_METRICS=true - -# DEVELOPMENT SETTINGS -# Set to "development" or "production" -#ENVIRONMENT=development - -#DEBUG=true - - - -# Mock Glific API (for testing without WhatsApp) -#Used to skip WhatsApp delivery in testing mode -MOCK_GLIFIC=true -# ==== TESTING: 2-minute resubmission window (comment out for production) ==== -RESUBMISSION_WINDOW_MINUTES=2 - From 598574cf710e6fca8978318d6d914f99ec6092f5 Mon Sep 17 00:00:00 2001 From: Manu Date: Fri, 2 Jan 2026 11:11:54 +0530 Subject: [PATCH 04/13] podman-compose changes --- docker-compose-prod.yml | 57 ++++- docs/DOCUMENTATION.md | 6 +- requirements.txt | 61 +++++- start-dev-env.ps1 | 4 +- start-dev-env.sh | 4 +- start-prod-env.sh | 445 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 554 insertions(+), 23 deletions(-) create mode 100755 start-prod-env.sh diff --git a/docker-compose-prod.yml 
b/docker-compose-prod.yml index 89a21cb..3e2e144 100644 --- a/docker-compose-prod.yml +++ b/docker-compose-prod.yml @@ -1,6 +1,42 @@ version: '3.8' services: + # =================================== + # POSTGRESQL - Database + # =================================== + postgres: + image: pgvector/pgvector:pg16 + container_name: plg-postgres + ports: + - "5432:5432" + environment: + POSTGRES_DB: ${POSTGRES_DB} + POSTGRES_USER: ${POSTGRES_USER} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + POSTGRES_INITDB_ARGS: "-E UTF8" + POSTGRES_MAX_CONNECTIONS: 20 + PGDATA: /var/lib/postgresql/data/pgdata + volumes: + - postgres_data:/var/lib/postgresql/data + - ./database/init.sql:/docker-entrypoint-initdb.d/init.sql + healthcheck: + test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + deploy: + resources: + limits: + cpus: '0.5' + memory: 512M + reservations: + cpus: '0.25' + memory: 256M + networks: + - plg-network + restart: unless-stopped + # =================================== # PLAGIARISM CHECKER SERVICE # =================================== @@ -8,32 +44,37 @@ services: build: context: . 
dockerfile: Dockerfile - container_name: mentorme-plagiarism-checker + container_name: plg-checker env_file: - .env volumes: - ./data:/app/data - ./logs:/app/logs depends_on: - rabbitmq: - condition: service_healthy postgres: condition: service_healthy deploy: resources: limits: - cpus: '1.0' - memory: 4G + cpus: '8.0' # Increased from 4.0 - allows up to 8 CPU cores + memory: 8G reservations: - cpus: '0.5' + cpus: '2' # Increased from 1 - guarantees 2 cores minimum memory: 2G restart: unless-stopped networks: - - mentorme-plagiarism-network + - plg-network + +# =================================== +# VOLUMES +# =================================== +volumes: + postgres_data: + driver: local # =================================== # NETWORKS # =================================== networks: - mentorme-plagiarism-network: + plg-network: driver: bridge diff --git a/docs/DOCUMENTATION.md b/docs/DOCUMENTATION.md index 9bae9d4..9747212 100644 --- a/docs/DOCUMENTATION.md +++ b/docs/DOCUMENTATION.md @@ -776,11 +776,11 @@ asyncio.run(test()) #### Build Docker Image ```bash # Standard build (model downloaded on first run) -docker build -t mentorme-plagiarism:latest . +docker build -t plg:latest . # With HuggingFace token for model prefetch during build (optional) # This pre-downloads the CLIP model into the Docker image -docker build -t mentorme-plagiarism:latest \ +docker build -t plg:latest \ --build-arg HUGGINGFACE_HUB_TOKEN=your_token_here . 
# Note: HuggingFace token is optional - public models can be downloaded without authentication @@ -804,7 +804,7 @@ docker-compose down # docker-compose.yml snippet services: worker: - image: mentorme-plagiarism:latest + image: plg:latest environment: - POSTGRES_HOST=postgres - RABBITMQ_HOST=rabbitmq diff --git a/requirements.txt b/requirements.txt index 37d4505..4e590f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,28 +8,73 @@ pgvector==0.4.1 # HTTP Client aiohttp==3.13.2 +aiohappyeyeballs==2.6.1 +aiosignal==1.4.0 requests==2.32.3 # Configuration Management (Pydantic v2 compatible) -pydantic<2.0.0,>=1.10.0 +pydantic==1.10.24 python-dotenv==1.2.1 # Image Processing pillow==12.0.0 -imagehash==4.3.2 +ImageHash==4.3.2 # Machine Learning - CLIP & FAISS torch==2.9.0 torchvision==0.24.0 -open-clip-torch==3.2.0 +open_clip_torch==3.2.0 faiss-cpu==1.12.0 numpy==2.3.4 +timm==1.0.22 +safetensors==0.7.0 +huggingface_hub==1.2.1 +filelock==3.20.0 +fsspec==2025.12.0 +ftfy==6.3.1 +regex==2025.11.3 +scipy==1.16.3 +mpmath==1.3.0 +sympy==1.14.0 +networkx==3.6 +PyWavelets==1.9.0 + +# Web Framework - FastAPI +fastapi==0.115.0 +uvicorn==0.34.0 +starlette==0.38.6 +anyio==4.12.0 +h11==0.16.0 +httptools==0.7.1 +uvloop==0.22.1 +watchfiles==1.1.1 +websockets==15.0.1 +click==8.3.1 +typer-slim==0.20.0 +shellingham==1.5.4 +httpcore==1.0.9 +httpx==0.28.1 +hf-xet==1.2.0 # Utilities tqdm==4.67.1 - podman-compose==1.5.0 - -# Web Framework - FastAPI -fastapi==0.115.0 -uvicorn[standard]==0.34.0 \ No newline at end of file +annotated-doc==0.0.4 +attrs==25.4.0 +certifi==2025.11.12 +charset-normalizer==3.4.4 +frozenlist==1.8.0 +idna==3.11 +Jinja2==3.1.6 +MarkupSafe==3.0.3 +multidict==6.7.0 +packaging==25.0 +pamqp==3.3.0 +propcache==0.4.1 +PyYAML==6.0.3 +setuptools==80.9.0 +typing_extensions==4.15.0 +urllib3==2.6.0 +wcwidth==0.2.14 +wheel==0.45.1 +yarl==1.22.0 diff --git a/start-dev-env.ps1 b/start-dev-env.ps1 index 2186b0d..d59d24d 100644 --- a/start-dev-env.ps1 +++ b/start-dev-env.ps1 @@ 
-80,8 +80,8 @@ if (Test-Path ".env") { } # Configuration (with defaults from environment or hardcoded) -$POSTGRES_CONTAINER = "mentorme-plagiarism-postgres" -$RABBITMQ_CONTAINER = "mentorme-plagiarism-rabbitmq" +$POSTGRES_CONTAINER = "plg-postgres" +$RABBITMQ_CONTAINER = "plg-rabbitmq" $POSTGRES_PORT = if ($env:POSTGRES_PORT) { $env:POSTGRES_PORT } else { 5432 } $RABBITMQ_PORT = if ($env:RABBITMQ_PORT) { $env:RABBITMQ_PORT } else { 5672 } $RABBITMQ_MGMT_PORT = if ($env:RABBITMQ_MANAGEMENT_PORT) { $env:RABBITMQ_MANAGEMENT_PORT } else { 15672 } diff --git a/start-dev-env.sh b/start-dev-env.sh index 15a9729..8878a03 100755 --- a/start-dev-env.sh +++ b/start-dev-env.sh @@ -123,8 +123,8 @@ if [ -f ".env" ]; then fi # Configuration (with defaults) -POSTGRES_CONTAINER="mentorme-plagiarism-postgres" -RABBITMQ_CONTAINER="mentorme-plagiarism-rabbitmq" +POSTGRES_CONTAINER="plg-postgres" +RABBITMQ_CONTAINER="plg-rabbitmq" POSTGRES_PORT="${POSTGRES_PORT:-5432}" RABBITMQ_PORT="${RABBITMQ_PORT:-5672}" RABBITMQ_MGMT_PORT="${RABBITMQ_MANAGEMENT_PORT:-15672}" diff --git a/start-prod-env.sh b/start-prod-env.sh new file mode 100755 index 0000000..8f92a20 --- /dev/null +++ b/start-prod-env.sh @@ -0,0 +1,445 @@ +#!/usr/bin/env bash +# Local Development Startup Script +# Starts PostgreSQL and RabbitMQ containers using docker-compose + +set -e +FULL_SETUP=0 +START_API=0 +COMPOSE_FILE="docker-compose-prod.yml" + +while [[ "$#" -gt 0 ]]; do + case "$1" in + --full-setup) FULL_SETUP=1; shift ;; + --with-api) START_API=1; shift ;; + --prod) COMPOSE_FILE="docker-compose-prod.yml"; shift ;; + --dev) COMPOSE_FILE="docker-compose-dev.yml"; shift ;; + *) break ;; + esac +done + +# Detect platform +OS_TYPE="unknown" +UNAME_OUT=$(uname -s 2>/dev/null || true) +case "${UNAME_OUT}" in + MINGW*|MSYS*|CYGWIN*) OS_TYPE="windows-msys" ;; + Darwin) OS_TYPE="macos" ;; + Linux) OS_TYPE="linux" ;; + *) OS_TYPE="unix" ;; +esac + +# Detect Python executable +PY="" +if [ "${OS_TYPE}" = "windows-msys" ]; then + 
PY_CANDIDATES=("py" "python3" "python" "python.exe") +else + PY_CANDIDATES=("python3" "python" "py" "python.exe") +fi + +for candidate in "${PY_CANDIDATES[@]}"; do + if [ "$candidate" = "py" ]; then + if command -v py >/dev/null 2>&1; then + if py -3 -c "import sys; sys.stdout.write('Python found!!\n')" 2>/dev/null; then + PY='py -3' + break + fi + fi + continue + fi + + candidate_path=$(command -v "$candidate" 2>/dev/null || true) + if [ -n "$candidate_path" ]; then + case "$candidate_path" in + *WindowsApps*|*windowsapps*) continue ;; + esac + + if "$candidate" -c "import sys; sys.stdout.write('ok')" 2>/dev/null; then + PY="$candidate" + break + fi + fi +done + +if [ -z "$PY" ]; then + echo "ERROR: No Python 3.10+ found. Install Python or ensure it's in PATH." >&2 + exit 1 +fi + +# Detect container runtime (Podman or Docker) +CONTAINER_CMD="" +COMPOSE_CMD="" +if command -v podman &> /dev/null; then + CONTAINER_CMD="podman" + if command -v podman-compose &> /dev/null; then + COMPOSE_CMD="podman-compose" + else + echo "ERROR: podman-compose not found. Install it:" >&2 + echo " pip install podman-compose" >&2 + exit 1 + fi +elif command -v docker &> /dev/null; then + CONTAINER_CMD="docker" + if command -v docker-compose &> /dev/null; then + COMPOSE_CMD="docker-compose" + elif docker compose version &> /dev/null; then + COMPOSE_CMD="docker compose" + else + echo "ERROR: docker-compose not found. Install it:" >&2 + echo " https://docs.docker.com/compose/install/" >&2 + exit 1 + fi +else + echo "ERROR: Neither Podman nor Docker found. 
Install one of them:" >&2 + echo " Podman: https://podman.io/getting-started/installation" >&2 + echo " Docker: https://docs.docker.com/get-docker/" >&2 + exit 1 +fi + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +GRAY='\033[0;37m' +NC='\033[0m' + +echo -e "${CYAN}==================================================================" +echo -e " Plagiarism Checker - PROD Development Setup" +echo -e " Container Runtime: ${CONTAINER_CMD}" +echo -e " Compose File: ${COMPOSE_FILE}" +echo -e "==================================================================${NC}" +echo "" + +# Validate compose file exists +if [ ! -f "$COMPOSE_FILE" ]; then + echo -e "${RED}ERROR: $COMPOSE_FILE not found${NC}" + exit 1 +fi + +# Load configuration from .env if it exists +if [ -f ".env" ]; then + set -a + source <(grep -v '^#' .env | grep -v '^$' | sed 's/\r$//') + set +a +fi + +# Configuration (with defaults) +POSTGRES_CONTAINER="plg-postgres" +POSTGRES_PORT="${POSTGRES_PORT:-5432}" +POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-postgres}" +POSTGRES_DB="${POSTGRES_DB:-plagiarism_db}" +POSTGRES_USER="${POSTGRES_USER:-postgres}" +CLIP_MODEL_URL="${CLIP_MODEL_URL:-https://huggingface.co/laion/CLIP-ViT-L-14-laion2B-s32B-b82K/resolve/main/open_clip_pytorch_model.bin}" + +stop_existing_containers() { + local containers_exist=false + + # Check if compose stack is running + if $COMPOSE_CMD -f $COMPOSE_FILE ps 2>/dev/null | grep -q "Up\|running"; then + containers_exist=true + fi + + if [ "$containers_exist" = true ]; then + echo -e "${YELLOW}Existing containers found:${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE ps + echo "" + echo -e "${YELLOW}This will stop and remove existing containers.${NC}" + read -p "Continue? (y/N): " -n 1 -r + echo + + if [[ ! 
$REPLY =~ ^[Yy]$ ]]; then + echo -e "${RED}Aborted by user${NC}" + # exit 0 + else + echo -e "${CYAN}Stopping existing containers...${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE down + fi + fi + + echo -e "${GREEN}[OK] Ready to start containers${NC}" +} + +start_containers() { + echo -e "${CYAN}Starting containers with $COMPOSE_FILE...${NC}" + + # Determine which services to start + #local services="postgres rabbitmq pgadmin plagiarism-checker" + local services="postgres plagiarism-checker" + + if [ "$START_API" -eq 1 ]; then + services="$services api" + echo -e "${CYAN}Including API service${NC}" + + # Force rebuild API container to ensure it uses Dockerfile.api (not Dockerfile) + echo -e "${YELLOW}Rebuilding API container with Dockerfile.api...${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE build --no-cache api + + if [ $? -ne 0 ]; then + echo -e "${RED}ERROR: Failed to build API container${NC}" + exit 1 + fi + echo -e "${GREEN}[OK] API container rebuilt${NC}" + fi + + $COMPOSE_CMD -f $COMPOSE_FILE up -d $services + + if [ $? 
-ne 0 ]; then + echo -e "${RED}ERROR: Failed to start containers${NC}" + exit 1 + fi + + echo -e "${GREEN}[OK] Containers started${NC}" +} + +wait_for_postgres() { + echo -e "${YELLOW}Waiting for PostgreSQL...${NC}" + + local max_attempts=30 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + + if $CONTAINER_CMD exec $POSTGRES_CONTAINER pg_isready -U $POSTGRES_USER &>/dev/null; then + echo -e "${GREEN}[OK] PostgreSQL ready${NC}" + return 0 + fi + + sleep 2 + done + + echo -e "${RED}ERROR: PostgreSQL timeout${NC}" + exit 1 +} + + +wait_for_api() { + if [ "$START_API" -ne 1 ]; then + return 0 + fi + + echo -e "${YELLOW}Waiting for API service...${NC}" + + local max_attempts=30 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + + if curl -s -f http://localhost:8000/health &>/dev/null; then + echo -e "${GREEN}[OK] API service ready${NC}" + return 0 + fi + + sleep 2 + done + + echo -e "${RED}ERROR: API service timeout${NC}" + echo -e "${YELLOW}Checking API container logs:${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE logs --tail=50 api + exit 1 +} + +initialize_database() { + if [ ! 
-f "database/init.sql" ]; then + echo -e "${YELLOW}WARNING: database/init.sql not found${NC}" + return + fi + + echo -e "${CYAN}Initializing database...${NC}" + + # Run init.sql + if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < database/init.sql 2>/dev/null; then + echo -e "${GREEN}[OK] Database schema initialized${NC}" + else + echo -e "${GRAY}Database schema already exists${NC}" + fi + + # Create migrations tracking table if it doesn't exist + $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " + CREATE TABLE IF NOT EXISTS schema_migrations ( + id SERIAL PRIMARY KEY, + migration_name VARCHAR(255) UNIQUE NOT NULL, + applied_at TIMESTAMP DEFAULT NOW() + ); + " 2>/dev/null + + # Run migration scripts + if [ -d "database/migrations" ]; then + local migration_count=0 + for migration_file in database/migrations/*.sql; do + if [ -f "$migration_file" ]; then + local migration_name=$(basename "$migration_file") + + # Check if migration already applied + local already_applied=$($CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -t -c " + SELECT COUNT(*) FROM schema_migrations WHERE migration_name = '$migration_name'; + " 2>/dev/null | tr -d '[:space:]') + + if [ "$already_applied" = "0" ]; then + echo -e "${CYAN}Applying migration: $migration_name${NC}" + + if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < "$migration_file" 2>/dev/null; then + # Record migration as applied + $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " + INSERT INTO schema_migrations (migration_name) VALUES ('$migration_name'); + " 2>/dev/null + echo -e "${GREEN}[OK] Applied: $migration_name${NC}" + migration_count=$((migration_count + 1)) + else + echo -e "${YELLOW}WARNING: Failed to apply $migration_name${NC}" + fi + fi + fi + done + + if [ $migration_count -eq 0 ]; then + echo -e "${GRAY}All migrations already applied${NC}" + else 
+ echo -e "${GREEN}[OK] Applied $migration_count migration(s)${NC}" + fi + fi +} + +create_env_file() { + if [ -f ".env" ]; then + echo -e "${GREEN}[OK] .env exists${NC}" + return + fi + + if [ ! -f ".env.example" ]; then + echo -e "${RED}ERROR: .env.example not found${NC}" + exit 1 + fi + + echo -e "${CYAN}Creating .env from template...${NC}" + cp .env.example .env + + # Keep service names for docker-compose (containers communicate via service names) + # No transformation needed - .env.example already has correct service names + + echo -e "${GREEN}[OK] .env created${NC}" +} + +show_summary() { + echo -e "\n${CYAN}==================================================================" + echo -e " Environment Ready!" + echo -e "==================================================================${NC}" + echo "" + echo -e "${GREEN}✓ Services Running:${NC}" + echo -e " PostgreSQL: localhost:$POSTGRES_PORT (with pgvector)" + + if [ "$START_API" -eq 1 ]; then + echo -e " API: http://localhost:8000" + echo -e " API Docs: http://localhost:8000/docs" + fi + + echo "" + echo -e "${CYAN}💡 Quick Start:${NC}" + echo -e " ${YELLOW}./start-dev-env.sh --full-setup${NC} ${GRAY}# Development mode (default)${NC}" + echo -e " ${YELLOW}./start-dev-env.sh --with-api${NC} ${GRAY}# Include API container${NC}" + echo -e " ${YELLOW}./start-dev-env.sh --prod${NC} ${GRAY}# Production mode${NC}" + echo "" + echo -e "${CYAN}📋 Manual Setup:${NC}" + echo -e " 1. ${YELLOW}${PY} -m venv venv${NC}" + + case "${OS_TYPE}" in + windows-msys) echo -e " ${YELLOW}source venv/Scripts/activate${NC}" ;; + *) echo -e " ${YELLOW}source venv/bin/activate${NC}" ;; + esac + + echo -e " 2. ${YELLOW}${PY} -m pip install -r requirements.txt${NC}" + echo -e " 3. 
${YELLOW}${PY} app.py${NC}" + echo "" + echo -e "${CYAN}🔧 Container Commands:${NC}" + echo -e " ${GRAY}Logs: $COMPOSE_CMD -f $COMPOSE_FILE logs -f${NC}" + echo -e " ${GRAY}Stop: $COMPOSE_CMD -f $COMPOSE_FILE stop${NC}" + echo -e " ${GRAY}Remove: $COMPOSE_CMD -f $COMPOSE_FILE down${NC}" + echo "" + echo -e "${CYAN}==================================================================${NC}" +} + +main() { + create_env_file + stop_existing_containers + start_containers + wait_for_postgres + wait_for_rabbitmq + wait_for_api + initialize_database + show_summary + echo -e "\n${GRAY}Containers running in background. Press Ctrl+C to exit this script.${NC}" +} + +setup_python_environment() { + echo -e "\n${CYAN}==================================================================" + echo -e " Full Setup: Python Environment" + echo -e "==================================================================${NC}" + + if [ -d "venv" ]; then + echo -e "${GRAY}Virtual environment exists${NC}" + else + echo -e "${CYAN}Creating virtual environment...${NC}" + $PY -m venv venv + echo -e "${GREEN}[OK] venv created${NC}" + fi + + echo -e "${CYAN}Activating virtual environment...${NC}" + if [ "${OS_TYPE}" = "windows-msys" ]; then + source venv/Scripts/activate + else + source venv/bin/activate + fi + + echo -e "${CYAN}Installing dependencies (5-10 minutes)...${NC}" + python -m pip install --upgrade pip setuptools wheel + python -m pip install -r requirements.txt + echo -e "${GREEN}[OK] Dependencies installed${NC}" + + echo -e "${CYAN}Creating directories...${NC}" + mkdir -p data/reference_images data/models/clip logs + echo -e "${GREEN}[OK] Directories created${NC}" + + # Download CLIP model using curl + echo -e "${CYAN}Checking CLIP model...${NC}" + if [ ! 
-f "data/models/clip/open_clip_pytorch_model.bin" ]; then + echo -e "${YELLOW}Downloading CLIP model (this may take a while)...${NC}" + curl -L -o data/models/clip/open_clip_pytorch_model.bin \ + ${CLIP_MODEL_URL} || { + echo -e "${YELLOW}WARNING: CLIP model download failed, will download on first run${NC}" + } + + if [ -f "data/models/clip/open_clip_pytorch_model.bin" ]; then + echo -e "${GREEN}[OK] CLIP model downloaded${NC}" + fi + else + echo -e "${GRAY}CLIP model already exists${NC}" + fi + + echo -e "${CYAN}Verifying environment...${NC}" + python -c " +import open_clip, asyncpg, aio_pika, PIL, imagehash +print('✓ All imports successful') +" || { + echo -e "${RED}ERROR: Environment verification failed${NC}" + exit 1 + } + + echo -e "\n${CYAN}==================================================================" + echo -e " Setup Complete!" + echo -e "==================================================================${NC}" + echo -e "\n${GREEN}✓ Next Steps:${NC}" + echo -e "\n${CYAN}Terminal 1 - Worker:${NC}" + [ "${OS_TYPE}" = "windows-msys" ] && echo -e " ${YELLOW}source venv/Scripts/activate${NC}" || echo -e " ${YELLOW}source venv/bin/activate${NC}" + echo -e " ${YELLOW}python app.py${NC}" + echo -e "\n${CYAN}Terminal 2 - API:${NC}" + [ "${OS_TYPE}" = "windows-msys" ] && echo -e " ${YELLOW}source venv/Scripts/activate${NC}" || echo -e " ${YELLOW}source venv/bin/activate${NC}" + echo -e " ${YELLOW}cd api && uvicorn api:app --reload --host 0.0.0.0 --port 8000${NC}" + echo -e "\n${CYAN}API Docs:${NC} ${YELLOW}http://localhost:8000/docs${NC}" + echo "" +} + +main + +if [ "$FULL_SETUP" -eq 1 ]; then + setup_python_environment +fi From 578cc5dcadc385596227e0316b5e1fad9a2ce3c0 Mon Sep 17 00:00:00 2001 From: Manu Date: Fri, 9 Jan 2026 19:45:45 +0530 Subject: [PATCH 05/13] Reference image --- .gitignore | 1 + Dockerfile | 21 +++---- Dockerfile.api | 20 +++--- database/db_manager.py | 10 +-- docker-compose-prod.yml | 29 +++++++++ image_worker/worker.py | 11 ++-- 
mq/rmq_client.py | 5 ++ scripts/docker-postgres.yml | 53 ++++++++++++++++ scripts/postgres_setup.sh | 118 ++++++++++++++++++++++++++++++++++++ start-prod-env.sh | 5 +- 10 files changed, 236 insertions(+), 37 deletions(-) create mode 100644 scripts/docker-postgres.yml create mode 100644 scripts/postgres_setup.sh diff --git a/.gitignore b/.gitignore index 897c762..cf0db6d 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ build/ # Environment variables .env .env.local +.env.prod # Logs logs/ diff --git a/Dockerfile b/Dockerfile index 5426394..37475e8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,7 @@ WORKDIR /app # Install build dependencies in a single layer RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ + vim \ gcc \ g++ \ git \ @@ -18,10 +19,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # Copy only requirements first for better caching COPY requirements.txt . -# Use pip cache and install in parallel -RUN --mount=type=cache,target=/root/.cache/pip \ - python -m pip install --upgrade pip setuptools wheel && \ - pip install -r requirements.txt --user --no-warn-script-location +# Install to explicit location +RUN python -m pip install --no-cache-dir --prefix=/install --upgrade pip setuptools wheel && \ + pip install --no-cache-dir --no-deps --prefix=/install -r requirements.txt + +RUN pip install --prefix=/install -r requirements.txt --no-cache-dir # ============================================ # Final stage - minimal runtime image @@ -37,18 +39,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && rm -rf /var/lib/apt/lists/* # Copy installed packages from builder -COPY --from=builder /root/.local /root/.local - -ENV PATH=/root/.local/bin:$PATH +COPY --from=builder /install /usr/local # Create necessary directories RUN mkdir -p /app/data /app/logs /root/.cache/clip +RUN ls + # Copy application code (do this last for better caching) COPY . . 
-# Lightweight healthcheck -HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ - CMD python -c "import sys; sys.exit(0)" || exit 1 -CMD ["python", "app.py"] +CMD ["python", "app.py"] \ No newline at end of file diff --git a/Dockerfile.api b/Dockerfile.api index 01fcc66..d0b5c03 100644 --- a/Dockerfile.api +++ b/Dockerfile.api @@ -1,4 +1,4 @@ -FROM python:3.13-slim +FROM python:3.13-slim as builder WORKDIR /app @@ -7,9 +7,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf # Copy requirements and install dependencies COPY api/requirements.txt /app/api/requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - python -m pip install --upgrade pip && \ - pip install -r /app/api/requirements.txt +RUN python -m pip install --prefix=/install -r --upgrade pip && \ + pip install --prefix=/install -r /app/api/requirements.txt --no-cache-dir + +# ============================================ +# Final stage - minimal runtime image +# ============================================ +FROM python:3.13-slim + +WORKDIR /app + +# Copy installed packages from builder +COPY --from=builder /install /usr/local # Copy application code COPY api/ /app/api/ @@ -18,9 +27,6 @@ COPY utils/ /app/utils/ # Expose API port EXPOSE 8000 -# Healthcheck (check if uvicorn is responding) -HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ - CMD curl -f http://localhost:8000/ || exit 1 # Run the API CMD ["uvicorn", "api.api:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/database/db_manager.py b/database/db_manager.py index 6db7936..d77b70a 100644 --- a/database/db_manager.py +++ b/database/db_manager.py @@ -69,14 +69,6 @@ async def init_pool(self): db_port = int(os.getenv("POSTGRES_PORT") or os.getenv("DB_PORT", "5432")) # db_port = 5435 # TEMP OVERRIDE FOR TESTING - #print the db connection details for debugging - # logger.info("###################") - # logger.info(f"DB Host: {db_host}") - # 
logger.info(f"DB Port: {db_port}") - # logger.info(f"DB Name: {db_name}") - # logger.info(f"DB User: {db_user}") - # logger.info(f"DB db_password: {db_password}") - # logger.info("###################") if not all([db_user, db_password, db_name]): @@ -525,7 +517,7 @@ async def fetch_reference_images_by_id(self, reference_id): image_path = await self._fetch( """ SELECT image_path - FROM reference_images where id = $1; + FROM reference_images where reference_id = $1; """, reference_id, ) diff --git a/docker-compose-prod.yml b/docker-compose-prod.yml index 3e2e144..0f28bd6 100644 --- a/docker-compose-prod.yml +++ b/docker-compose-prod.yml @@ -65,6 +65,35 @@ services: networks: - plg-network + # =================================== + # API SERVICE + # =================================== + api: + build: + context: . + dockerfile: Dockerfile.api + container_name: plg-api + env_file: + - .env + volumes: + - ./data:/app/data + - ./logs:/app/logs + depends_on: + postgres: + condition: service_healthy + deploy: + resources: + limits: + cpus: '8.0' # Increased from 4.0 - allows up to 8 CPU cores + memory: 8G + reservations: + cpus: '2' # Increased from 1 - guarantees 2 cores minimum + memory: 2G + restart: unless-stopped + networks: + - plg-network + + # =================================== # VOLUMES # =================================== diff --git a/image_worker/worker.py b/image_worker/worker.py index 012d2e9..a54c630 100644 --- a/image_worker/worker.py +++ b/image_worker/worker.py @@ -340,9 +340,6 @@ async def check_assignment_reference_hash_match( best_comparison = comparison if best_match and best_comparison: - print("#"*70) - print("Best match found:",best_match["name"]) - print("#"*70) logger.info("Assignment reference match found") similarity = 1 - (best_score / 64.0) return ( @@ -445,10 +442,10 @@ async def check_clip_match( if not results: return None, 0.0, None - print("#"*70) - for ref_id, sim, meta in results: - print(f" Ref ID: {ref_id}, Similarity: {sim:.4f}, 
Meta: {meta}") - print("#"*70) + # print("#"*70) + # for ref_id, sim, meta in results: + # print(f" Ref ID: {ref_id}, Similarity: {sim:.4f}, Meta: {meta}") + # print("#"*70) matches = [ (ref_id, sim, meta) diff --git a/mq/rmq_client.py b/mq/rmq_client.py index 6571c0b..a0f0786 100644 --- a/mq/rmq_client.py +++ b/mq/rmq_client.py @@ -102,6 +102,11 @@ async def connect(self): self.DEAD_LETTER_QUEUE, durable=True ) logger.info(f"Dead Letter Queue declared: {self.DEAD_LETTER_QUEUE}") + self.submission_queue = await self.channel.declare_queue( + self.SUBMISSION_QUEUE, + durable=True + ) + logger.info(f"Submission queue created: {self.SUBMISSION_QUEUE}") try: # First try passive declaration to check if queue exists diff --git a/scripts/docker-postgres.yml b/scripts/docker-postgres.yml new file mode 100644 index 0000000..f6dac63 --- /dev/null +++ b/scripts/docker-postgres.yml @@ -0,0 +1,53 @@ +version: '3.8' + +services: + # =================================== + # POSTGRESQL - Database + # =================================== + postgres: + image: pgvector/pgvector:pg16 + container_name: plg-postgres + ports: + - "5432:5432" + environment: + POSTGRES_DB: plagiarism_db + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_INITDB_ARGS: "-E UTF8" + POSTGRES_MAX_CONNECTIONS: 20 + PGDATA: /var/lib/postgresql/data/pgdata + volumes: + - postgres_data:/var/lib/postgresql/data + - ./database/init.sql:/docker-entrypoint-initdb.d/init.sql + healthcheck: + test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + deploy: + resources: + limits: + cpus: '0.5' + memory: 512M + reservations: + cpus: '0.25' + memory: 256M + networks: + - plg-network + restart: always + + +# =================================== +# VOLUMES +# =================================== +volumes: + postgres_data: + driver: local + +# =================================== +# NETWORKS +# =================================== +networks: 
+ plg-network: + driver: bridge diff --git a/scripts/postgres_setup.sh b/scripts/postgres_setup.sh new file mode 100644 index 0000000..92396b9 --- /dev/null +++ b/scripts/postgres_setup.sh @@ -0,0 +1,119 @@ +COMPOSE_FILE="docker-postgres.yml" +COMPOSE_CMD="podman-compose" +CONTAINER_CMD="podman" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +GRAY='\033[0;37m' +NC='\033[0m' + + +# Load configuration from .env if it exists +if [ -f ".env" ]; then + set -a + source <(grep -v '^#' .env | grep -v '^$' | sed 's/\r$//') + set +a +fi + +# Configuration (with defaults) +POSTGRES_CONTAINER="plg-postgres" +POSTGRES_PORT="${POSTGRES_PORT:-5432}" +POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-postgres}" +POSTGRES_DB="${POSTGRES_DB:-plagiarism_db}" +POSTGRES_USER="${POSTGRES_USER:-postgres}" + +$COMPOSE_CMD -f $COMPOSE_FILE up -d + +wait_for_postgres() { + echo -e "${YELLOW}Waiting for PostgreSQL...${NC}" + + local max_attempts=30 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + + if $CONTAINER_CMD exec $POSTGRES_CONTAINER pg_isready -U $POSTGRES_USER &>/dev/null; then + echo -e "${GREEN}[OK] PostgreSQL ready${NC}" + return 0 + fi + + sleep 2 + done + + echo -e "${RED}ERROR: PostgreSQL timeout${NC}" + exit 1 +} + +initialize_database() { + if [ ! 
-f "database/init.sql" ]; then + echo -e "${YELLOW}WARNING: database/init.sql not found${NC}" + return + fi + + echo -e "${CYAN}Initializing database...${NC}" + + # Run init.sql + if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < database/init.sql 2>/dev/null; then + echo -e "${GREEN}[OK] Database schema initialized${NC}" + else + echo -e "${GRAY}Database schema already exists${NC}" + fi + + # Create migrations tracking table if it doesn't exist + $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " + CREATE TABLE IF NOT EXISTS schema_migrations ( + id SERIAL PRIMARY KEY, + migration_name VARCHAR(255) UNIQUE NOT NULL, + applied_at TIMESTAMP DEFAULT NOW() + ); + " 2>/dev/null + + # Run migration scripts + if [ -d "database/migrations" ]; then + local migration_count=0 + for migration_file in database/migrations/*.sql; do + if [ -f "$migration_file" ]; then + local migration_name=$(basename "$migration_file") + + # Check if migration already applied + local already_applied=$($CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -t -c " + SELECT COUNT(*) FROM schema_migrations WHERE migration_name = '$migration_name'; + " 2>/dev/null | tr -d '[:space:]') + + if [ "$already_applied" = "0" ]; then + echo -e "${CYAN}Applying migration: $migration_name${NC}" + + if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < "$migration_file" 2>/dev/null; then + # Record migration as applied + $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " + INSERT INTO schema_migrations (migration_name) VALUES ('$migration_name'); + " 2>/dev/null + echo -e "${GREEN}[OK] Applied: $migration_name${NC}" + migration_count=$((migration_count + 1)) + else + echo -e "${YELLOW}WARNING: Failed to apply $migration_name${NC}" + fi + fi + fi + done + + if [ $migration_count -eq 0 ]; then + echo -e "${GRAY}All migrations already applied${NC}" + else 
+ echo -e "${GREEN}[OK] Applied $migration_count migration(s)${NC}" + fi + fi +} + +main() { + wait_for_postgres + initialize_database + echo -e "\n${GRAY}Containers running in background. Press Ctrl+C to exit this script.${NC}" +} + +main +echo -e "${YELLOW}PostgreSQL done...${NC}" \ No newline at end of file diff --git a/start-prod-env.sh b/start-prod-env.sh index 8f92a20..b9b8e0c 100755 --- a/start-prod-env.sh +++ b/start-prod-env.sh @@ -4,12 +4,12 @@ set -e FULL_SETUP=0 -START_API=0 +START_API=1 COMPOSE_FILE="docker-compose-prod.yml" while [[ "$#" -gt 0 ]]; do case "$1" in - --full-setup) FULL_SETUP=1; shift ;; + --full-setup) FULL_SETUP=1; shift ;; --with-api) START_API=1; shift ;; --prod) COMPOSE_FILE="docker-compose-prod.yml"; shift ;; --dev) COMPOSE_FILE="docker-compose-dev.yml"; shift ;; @@ -363,7 +363,6 @@ main() { stop_existing_containers start_containers wait_for_postgres - wait_for_rabbitmq wait_for_api initialize_database show_summary From c1406eb8256c37ed0c3a9cd0be1e2a7596ba1266 Mon Sep 17 00:00:00 2001 From: Manu Date: Fri, 9 Jan 2026 19:48:42 +0530 Subject: [PATCH 06/13] Reference Image --- database/db_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/database/db_manager.py b/database/db_manager.py index d77b70a..fd7fb8d 100644 --- a/database/db_manager.py +++ b/database/db_manager.py @@ -514,7 +514,7 @@ async def fetch_reference_images_by_id(self, reference_id): raise RuntimeError("Database pool not initialized") try: - image_path = await self._fetch( + image_path = await self._fetchval( """ SELECT image_path FROM reference_images where reference_id = $1; """, reference_id, ) From 825065497cdea6f215024a9d12da20a2a34ec4a5 Mon Sep 17 00:00:00 2001 From: Manu Date: Tue, 17 Feb 2026 22:27:33 +0530 Subject: [PATCH 07/13] video media handling --- image_worker/image_validator.py | 38 +++++++++++++++++++++++++++++++ image_worker/worker.py | 40 +++++++++++++++++++++++++++++++-- mq/rmq_client.py | 9 ++++---- 3 files changed, 80 insertions(+), 7 
deletions(-) diff --git a/image_worker/image_validator.py b/image_worker/image_validator.py index 43a7b25..e4d1666 100644 --- a/image_worker/image_validator.py +++ b/image_worker/image_validator.py @@ -53,6 +53,26 @@ class ImageValidator: "vecteezy", ] + + IMAGE_EXTENSIONS = { + "jpg", + "jpeg", + "png", + "gif", + "webp", + "bmp", + "tiff", + "heic", + } + VIDEO_EXTENSIONS = { + "mp4", + "mov", + "webm", + "mkv", + "avi", + "mpeg", + "mpg", + } def __init__( self, min_variance_threshold: float = 5.0, @@ -71,6 +91,24 @@ def __init__( self.min_unique_colors = min_unique_colors self.max_solid_color_ratio = max_solid_color_ratio + + def detect_media_type(self, submission_url: str) -> str: + """Detect media type based on URL/extension.""" + if not submission_url: + return "image" + + url_without_query = submission_url.split("?", 1)[0].lower() + if "." in url_without_query: + ext = url_without_query.rsplit(".", 1)[-1] + if ext in self.IMAGE_EXTENSIONS: + return "image" + if ext in self.VIDEO_EXTENSIONS: + return "video" + + if "video" in url_without_query: + return "video" + return "image" + def check_stock_image_url(self, image_url: str) -> Tuple[bool, Optional[str]]: """ Check if URL is from a known stock image website. 
diff --git a/image_worker/worker.py b/image_worker/worker.py index a54c630..b5d67b1 100644 --- a/image_worker/worker.py +++ b/image_worker/worker.py @@ -356,7 +356,6 @@ async def check_assignment_reference_hash_match( logger.error(f"Hash check failed: {e}", exc_info=True) raise - async def check_db_reference_hash_match( self, hashes: dict ) -> Tuple[bool, Optional[str], Optional[float], Optional[str]]: @@ -550,10 +549,24 @@ async def process_submission(self, data: Dict[str, Any]) -> Optional[str]: try: extracted = self._validate_input(data) - submission_id, student_id, assign_id, image_url, db_record_id = extracted + submission_id, student_id, assign_id, submission_url, db_record_id = extracted logger.info(f"Processing submission: {submission_id}") + # Check for video URLs before attempting to download + media_type = self.image_validator.detect_media_type(submission_url) + if media_type == "video": + logger.warning( + f"Video URL rejected: submission={submission_id}, url={submission_url}" + ) + video_result = self._create_video_url_result( + submission_id, student_id, assign_id, submission_url + ) + processing_time_ms = int((time.time() - start_time) * 1000) + return json.dumps(video_result) + else: + image_url = submission_url + # Check for stock image URLs before downloading is_stock, stock_site = self.image_validator.check_stock_image_url(image_url) if is_stock and stock_site: @@ -1101,6 +1114,29 @@ def _create_stock_image_result( "plagiarism_source": f"stock_image_{stock_site}", "similar_sources": [{"source": stock_site, "url": image_url}], } + + def _create_video_url_result( + self, + submission_id: str, + student_id: str, + assign_id: str, + submission_url: str + ) -> dict: + """Create video URL detection result dictionary.""" + return { + "submission_id": submission_id, + "student_id": student_id, + "assignment_id": assign_id, + "image_url": submission_url, + "is_ai_generated": False, + "ai_detection_source": "None", + "ai_confidence": 0.0, + 
"is_plagiarized": False, + "similarity_score": 1.0, + "match_type": "original", + "plagiarism_source": None, + "similar_sources": None, + } async def _build_reference_result( self, diff --git a/mq/rmq_client.py b/mq/rmq_client.py index a0f0786..1e4f31a 100644 --- a/mq/rmq_client.py +++ b/mq/rmq_client.py @@ -102,11 +102,7 @@ async def connect(self): self.DEAD_LETTER_QUEUE, durable=True ) logger.info(f"Dead Letter Queue declared: {self.DEAD_LETTER_QUEUE}") - self.submission_queue = await self.channel.declare_queue( - self.SUBMISSION_QUEUE, - durable=True - ) - logger.info(f"Submission queue created: {self.SUBMISSION_QUEUE}") + try: # First try passive declaration to check if queue exists @@ -124,6 +120,8 @@ async def connect(self): ) logger.info(f"Submission queue created: {self.SUBMISSION_QUEUE}") + + try: # First try passive declaration to check if queue exists self.feedback_queue = await self.channel.declare_queue( @@ -165,6 +163,7 @@ async def publish_message(self, message_body): logger.info( f"Published submission {message_body.get('submission_id')} for user {message_body.get('student_id')}" ) + logger.info(f"Published message body: {message_body}") except asyncio.CancelledError as e: logger.warning("publish_message CancelledError") raise Exception("publish_message CancelledError") from e From e64cf1db1d4680a05f012dd0c7fd603594bfac0a Mon Sep 17 00:00:00 2001 From: Manu Date: Fri, 24 Apr 2026 20:05:01 +0530 Subject: [PATCH 08/13] added more submission types --- api/api.py | 55 ++++++-- database/db_manager.py | 54 ++++---- database/init.sql | 6 +- .../migrations/002_rename_and_add_fields.sql | 4 + image_worker/worker.py | 4 +- plag_checker/submissions_checker.py | 121 ++++++++---------- processors/__init__.py | 10 +- processors/image_processor.py | 10 +- processors/text_processor.py | 10 -- tests/conftest.py | 4 +- tests/test_db_manager.py | 4 +- tests/test_processors.py | 88 ++----------- tests/test_submission_checker.py | 81 +++++++++--- tests/test_worker.py | 
14 +- 14 files changed, 228 insertions(+), 237 deletions(-) create mode 100644 database/migrations/002_rename_and_add_fields.sql delete mode 100644 processors/text_processor.py diff --git a/api/api.py b/api/api.py index e8bc7ff..42b12ba 100644 --- a/api/api.py +++ b/api/api.py @@ -64,15 +64,42 @@ class SubmissionRequest(BaseModel): """Request model for submission creation""" student_id: str = Field(..., description="Student identifier", min_length=1) - image_url: str = Field(..., description="URL of the submitted image", min_length=1) + submission_type: str = Field(..., description="Type of submission: text, audio, video, or image") + submission_url: Optional[str] = Field( + None, description="URL of the submitted resource (required for image submissions)" + ) + submission_text: Optional[str] = Field( + None, description="Text content of the submission (required for text submissions)" + ) + submitted_at: Optional[str] = Field( + None, + description="Original submission timestamp in ISO format. 
If provided, this value is preserved in the processed result.", + ) assignment_id: Optional[str] = Field(None, description="Assignment identifier") - @validator("image_url") - def validate_url(cls, v): - """Validate that image_url is not empty""" - if not v or not v.strip(): - raise ValueError("image_url cannot be empty") - return v.strip() + @validator("submission_type") + def validate_submission_type(cls, v): + """Validate that submission_type is one of the supported values""" + allowed = {"text", "audio", "video", "image"} + if v not in allowed: + raise ValueError(f"submission_type must be one of {sorted(allowed)}") + return v + + @validator("submission_url", always=True) + def validate_submission_url(cls, v, values): + if values.get("submission_type") == "image": + if not v or not v.strip(): + raise ValueError("submission_url is required for image submissions") + return v.strip() + return v + + @validator("submission_text", always=True) + def validate_submission_text(cls, v, values): + if values.get("submission_type") == "text": + if not v or not v.strip(): + raise ValueError("submission_text is required for text submissions") + return v.strip() + return v @validator("student_id") def validate_student_id(cls, v): @@ -210,13 +237,13 @@ async def send_to_rabbitmq(message: dict, queue_name: str): ) async def create_submission(request: SubmissionRequest): """ - Submit an image for plagiarism detection + Submit a new user submission for plagiarism detection - Creates a new plagiarism check submission by sending the image URL - and metadata to the processing queue. + Creates a new plagiarism check submission by sending submission metadata + to the processing queue. 
Args: - request: Submission request containing student_id, image_url, and optional assignment_id + request: Submission request containing student_id, submission_type, submission_url, and optional assignment_id Returns: SubmissionResponse with submission details and unique ID @@ -248,9 +275,11 @@ async def create_submission(request: SubmissionRequest): payload = { "student_id": hashed_student_id, "submission_id": submission_id, - "img_url": request.image_url, + "submission_type": request.submission_type, + "submission_url": request.submission_url, + "submission_text": request.submission_text, + "submitted_at": request.submitted_at, "assign_id": assignment_id, - "submitted_at": datetime.datetime.utcnow().isoformat(), } # Send to RabbitMQ diff --git a/database/db_manager.py b/database/db_manager.py index fd7fb8d..a962f8c 100644 --- a/database/db_manager.py +++ b/database/db_manager.py @@ -27,7 +27,7 @@ class DatabaseManager: await db.init_pool() try: # Use database operations - await db.insert_submission_if_not_exists(data, image_url) + await db.insert_submission_if_not_exists(data, submission_url) finally: await db.close() """ @@ -165,7 +165,7 @@ def _normalize_vector(self, embedding_list: Optional[Any]) -> str: return str(normalized) async def insert_submission_if_not_exists( - self, submission_data: dict, image_url: str, status: int + self, submission_data: dict, submission_url: Optional[str], status: int ): """ Insert a new submission if it doesn't already exist. @@ -173,8 +173,8 @@ async def insert_submission_if_not_exists( Uses INSERT ... ON CONFLICT for atomic upsert operation to prevent race conditions. 
Args: - submission_data: Dict containing submission_id, student_id, assign_id, img_url - image_url: URL of the submitted image + submission_data: Dict containing submission_id, student_id, assign_id, submission_url, submission_type, submission_text + submission_url: URL of the submitted content status: Initial submission status Returns: @@ -193,8 +193,8 @@ async def insert_submission_if_not_exists( try: insert_sql = """ - INSERT INTO submissions (submission_id, student_id, assign_id, image_url, status) - VALUES ($1, $2, $3, $4, $5) + INSERT INTO submissions (submission_id, student_id, assign_id, submission_url, submission_type, submission_text, status) + VALUES ($1, $2, $3, $4, $5, $6, $7) ON CONFLICT (submission_id) DO NOTHING RETURNING id; """ @@ -206,7 +206,9 @@ async def insert_submission_if_not_exists( submission_id, submission_data.get("student_id"), submission_data.get("assign_id"), - submission_data.get("img_url") or image_url, + submission_data.get("submission_url") or submission_url, + submission_data.get("submission_type"), + submission_data.get("submission_text"), status, ) @@ -251,8 +253,10 @@ async def update_result(self, submission_id: str, result: dict, status_val: int) await self._execute( """ UPDATE submissions - SET result = $1, status = $2, updated_at = NOW() - WHERE submission_id = $3; + SET result = $1, + status = $2, + updated_at = NOW() + WHERE submission_id = $3; """, json.dumps(result), status_val, @@ -271,13 +275,13 @@ async def update_status( message_value: str, ): """ - Update submission status. - - Valid status values: RECEIVED, PROCESSING, PROCESSED, PENDING_FEEDBACK_PUSH, COMPLETED, FAILED + Update submission status and retry metadata. 
Args: submission_id: Unique submission identifier - status_value: New status value + status_value: New status code + retry_count_value: Retry counter value + message_value: Human-readable status message Raises: asyncpg.PostgresError: If update fails @@ -290,13 +294,15 @@ async def update_status( result = await self._execute( """ UPDATE submissions - SET status = $1, message = $2, retry_count = $3, updated_at = $4 - WHERE submission_id = $5; + SET status = $1, + retry_count = $2, + message = $3, + updated_at = NOW() + WHERE submission_id = $4; """, status_value, - message_value, retry_count_value, - datetime.utcnow(), + message_value, submission_id, ) if result == "UPDATE 0": @@ -568,7 +574,7 @@ async def fetch_peer_submissions( # This prevents flagging the original submitter as a plagiarist records = await self._fetch( """ - SELECT id, student_id, assign_id,image_url, phash, dhash, ahash, created_at + SELECT id, student_id, assign_id, submission_url, phash, dhash, ahash, created_at FROM submissions WHERE student_id != $1 AND created_at > $2 @@ -583,7 +589,7 @@ async def fetch_peer_submissions( else: records = await self._fetch( """ - SELECT id, student_id, assign_id,image_url, phash, dhash, ahash, created_at + SELECT id, student_id, assign_id, submission_url, phash, dhash, ahash, created_at FROM submissions WHERE student_id != $1 AND phash IS NOT NULL @@ -615,7 +621,7 @@ async def fetch_self_submissions(self, current_student_id: str): try: records = await self._fetch( """ - SELECT id, created_at, phash, dhash, ahash, assign_id, image_url + SELECT id, created_at, phash, dhash, ahash, assign_id, submission_url FROM submissions WHERE student_id = $1 AND phash IS NOT NULL @@ -695,11 +701,11 @@ async def pgvector_search_peer_submissions( vec = self._normalize_vector(embedding_list) results = await self._fetch( """ - SELECT + SELECT submission_id, student_id, assign_id, - image_url, + submission_url, created_at, (clip_embedding <#> $1::vector) * -1 as similarity FROM 
submissions @@ -744,11 +750,11 @@ async def pgvector_search_self_submissions( vec = self._normalize_vector(embedding_list) results = await self._fetch( """ - SELECT + SELECT submission_id, student_id, assign_id, - image_url, + submission_url, created_at, (clip_embedding <#> $1::vector) * -1 as similarity FROM submissions diff --git a/database/init.sql b/database/init.sql index 265a61b..55997a5 100644 --- a/database/init.sql +++ b/database/init.sql @@ -1,7 +1,5 @@ -- UUID support CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; - --- pgvector extension for vector similarity search (only needed if USE_PGVECTOR=true) CREATE EXTENSION IF NOT EXISTS vector; -- Drop existing tables if doing a clean restore (commented by default) @@ -16,7 +14,9 @@ CREATE TABLE IF NOT EXISTS submissions ( submission_id VARCHAR(100) UNIQUE NOT NULL, assign_id VARCHAR(200), student_id VARCHAR(64) NOT NULL, -- SHA-256 hashed - image_url TEXT, + submission_url TEXT, + submission_type VARCHAR(20), + submission_text TEXT, status integer DEFAULT 0, retry_count integer DEFAULT 0, result jsonb, diff --git a/database/migrations/002_rename_and_add_fields.sql b/database/migrations/002_rename_and_add_fields.sql new file mode 100644 index 0000000..ac4e7dd --- /dev/null +++ b/database/migrations/002_rename_and_add_fields.sql @@ -0,0 +1,4 @@ +-- Migration: Rename image_url to submission_url and add submission_type, submission_text +ALTER TABLE submissions RENAME COLUMN image_url TO submission_url; +ALTER TABLE submissions ADD COLUMN submission_type VARCHAR(20); +ALTER TABLE submissions ADD COLUMN submission_text TEXT; \ No newline at end of file diff --git a/image_worker/worker.py b/image_worker/worker.py index b5d67b1..0e5aa90 100644 --- a/image_worker/worker.py +++ b/image_worker/worker.py @@ -228,7 +228,7 @@ def _validate_input(self, data: Dict[str, Any]) -> Tuple[str, str, str, str, str Raises: ValidationError: If required fields are missing or invalid """ - required_fields = ["submission_id", 
"student_id", "img_url", "db_record_id"] + required_fields = ["submission_id", "student_id", "submission_url", "db_record_id"] for field in required_fields: if field not in data or not data[field]: raise ValidationError( @@ -239,7 +239,7 @@ def _validate_input(self, data: Dict[str, Any]) -> Tuple[str, str, str, str, str submission_id = data["submission_id"] student_id = data["student_id"] assign_id = data.get("assign_id", "N/A") - image_url = data["img_url"] + image_url = data["submission_url"] db_record_id = data["db_record_id"] return submission_id, student_id, assign_id, image_url, db_record_id diff --git a/plag_checker/submissions_checker.py b/plag_checker/submissions_checker.py index 4fb46a8..daef45e 100644 --- a/plag_checker/submissions_checker.py +++ b/plag_checker/submissions_checker.py @@ -6,7 +6,6 @@ from mq.mq_client import MQClient from database.db_manager import DatabaseManager from processors.image_processor import ImageProcessor -from processors.text_processor import TextProcessor from typing import TYPE_CHECKING @@ -92,7 +91,6 @@ def __init__( self._owns_image_worker = image_worker is None self.image_processor = None # Will be set after image_worker initialization - self.text_processor = TextProcessor() self.STARTUP_RETRY_DELAY = startup_retry_delay self.MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3")) @@ -159,7 +157,13 @@ async def process_submission(self, submission): redelivered = getattr(submission, "redelivered", False) delivery_count = 0 - if hasattr(submission, "headers") and submission.headers: + from collections.abc import Mapping + + if ( + hasattr(submission, "headers") + and isinstance(submission.headers, Mapping) + and submission.headers + ): delivery_count = submission.headers.get("x-delivery-count", 0) if redelivered and delivery_count == 0: @@ -186,28 +190,29 @@ async def process_submission(self, submission): f"Failed to send poison message to DLQ for submission {submission_id}" ) - await submission.nack( - submission, requeue=False, 
submission_id=submission_id - ) + await submission.nack(requeue=False, submission_id=submission_id) message_acked = True return - image_url = data.get("img_url") or data.get("image_url") + submission_url = data.get("submission_url") + submission_type = data.get("submission_type") record_id = await self.db.insert_submission_if_not_exists( - data, image_url or "", status + data, submission_url or "", status ) - if not image_url: - logger.error(f"No image URL in submission {submission_id}") + if submission_type != "image": + logger.info( + f"Skipping plagiarism processing for non-image submission {submission_id}" + ) + elif not submission_url: + logger.error(f"No submission_url in image submission {submission_id}") await self.db.update_status( submission_id, SubmissionStatus.FAILED, 0, - "No image URL in submission", - ) - await submission.nack( - submission, requeue=False, submission_id=submission_id + "No submission_url in image submission", ) + await submission.nack(requeue=False, submission_id=submission_id) message_acked = True return @@ -217,41 +222,33 @@ async def process_submission(self, submission): data["db_record_id"] = str(record_id) # Convert UUID to string - # Get processor and validate initialization - try: - processor = self.get_processor(data) - except RuntimeError as init_error: - # Processor not initialized - this is a system error, should retry - logger.error(f"System not ready: {init_error}") - await self.db.update_status( - submission_id, - SubmissionStatus.FAILED, - 0, - f"System initialization error: {str(init_error)}", - ) - await submission.nack( - submission, requeue=True, submission_id=submission_id - ) # Retry - system might be initializing - message_acked = True - return - except ValueError as validation_error: - # Invalid message format - should not retry - logger.error( - f"Invalid message format for submission {submission_id}: {validation_error}" - ) - await self.db.update_status( - submission_id, - SubmissionStatus.FAILED, - 0, - 
f"Invalid message format: {str(validation_error)}", - ) - await submission.nack( - submission, requeue=False, submission_id=submission_id - ) # Don't retry invalid format - message_acked = True - return - - result_text = await processor.process(data) + submission_type = data.get("submission_type") + if submission_type == "image": + try: + processor = self.get_processor(data) + except RuntimeError as init_error: + logger.error(f"System not ready: {init_error}") + await self.db.update_status( + submission_id, + SubmissionStatus.FAILED, + 0, + f"System initialization error: {str(init_error)}", + ) + await submission.nack(requeue=True, submission_id=submission_id) + message_acked = True + return + result_text = await processor.process(data) + else: + result_text = { + "similar_sources": [], + "similarity_score": 0.0, + "is_plagiarized": False, + "match_type": "original", + "is_ai_generated": False, + "ai_detection_source": "", + "ai_confidence": 0.0, + "plagiarism_source": "", + } try: if isinstance(result_text, (str, bytes, bytearray)): @@ -308,9 +305,7 @@ async def process_submission(self, submission): except json.JSONDecodeError as e: logger.error(f"Invalid JSON in message: {e}") - await submission.nack( - submission, requeue=False, submission_id=submission_id - ) + await submission.nack(requeue=False, submission_id=submission_id) message_acked = True except RuntimeError as e: @@ -341,9 +336,7 @@ async def process_submission(self, submission): rc + 1, f"Retry {rc + 1}/{self.MAX_RETRIES}: {str(e)[:200]}", ) - await submission.nack( - submission, requeue=True, submission_id=submission_id - ) + await submission.nack(requeue=True, submission_id=submission_id) message_acked = True else: await self.db.update_status( @@ -362,9 +355,7 @@ async def process_submission(self, submission): f"Failed to send max-retry message to DLQ for submission {submission_id}" ) - await submission.nack( - submission, requeue=False, submission_id=submission_id - ) + await 
submission.nack(requeue=False, submission_id=submission_id) message_acked = True logger.warning( f"Message discarded after {rc} retries and sent to DLQ" @@ -392,18 +383,18 @@ def get_processor(self, data: dict): RuntimeError: If processor is not initialized ValueError: If data format is invalid """ - if data.get("img_url"): + submission_type = data.get("submission_type") + if submission_type == "image": if self.image_processor is None: raise RuntimeError( "ImageProcessor not initialized - initialize() must be called before processing messages" ) return self.image_processor - elif data.get("text"): - return self.text_processor - else: - raise ValueError( - "Invalid submission format: missing both 'img_url' and 'text' fields" - ) + if submission_type is None: + raise ValueError("Invalid submission format: missing submission_type") + raise ValueError( + f"Unsupported submission_type for processing: {data.get('submission_type')}" + ) async def start_consumer(self): await self.client.start_consumer(self.process_submission) diff --git a/processors/__init__.py b/processors/__init__.py index fa8cdbb..960feed 100644 --- a/processors/__init__.py +++ b/processors/__init__.py @@ -1,17 +1,13 @@ -from .text_processor import TextProcessor from .image_processor import ImageProcessor async def get_processor( - data: dict, db_manager=None, image_processor=None, text_processor=None + data: dict, db_manager=None, image_processor=None ): """ Returns appropriate processor based on content type. 
- Priority: if 'img_url' present → ImageProcessor - otherwise → TextProcessor + Priority: if submission_type is 'image' → ImageProcessor """ - if data.get("img_url"): + if data.get("submission_type") == "image": return image_processor - elif data.get("text"): - return text_processor return None diff --git a/processors/image_processor.py b/processors/image_processor.py index 02f0035..e3dc36e 100644 --- a/processors/image_processor.py +++ b/processors/image_processor.py @@ -29,7 +29,7 @@ async def process(self, data: dict) -> dict: Args: data: dict containing: - - img_url or image_url: URL of the image to process + - submission_url: URL of the image to process - submission_id: unique submission identifier - student_id: hashed student identifier - assign_id: assignment identifier @@ -38,11 +38,11 @@ async def process(self, data: dict) -> dict: Returns: dict with plagiarism detection results or error message """ - image_url = data.get("img_url") or data.get("image_url") + submission_url = data.get("submission_url") - if not image_url: - logger.error("No image URL provided in submission data") - return {"error": "No image URL provided"} + if not submission_url: + logger.error("No submission_url provided in submission data") + return {"error": "No submission_url provided"} try: logger.info(f"Processing image submission: {data.get('submission_id')}") diff --git a/processors/text_processor.py b/processors/text_processor.py deleted file mode 100644 index b8a678a..0000000 --- a/processors/text_processor.py +++ /dev/null @@ -1,10 +0,0 @@ -from .base_processor import BaseProcessor - -class TextProcessor(BaseProcessor): - """Processor for text-based submissions.""" - - async def process(self, data: dict): - # Example: process text for plagiarism - text = data.get("payload", "") - # Dummy similarity result - return {"similarity_score": 78, "matched_sources": ["Source A", "Source B"]} diff --git a/tests/conftest.py b/tests/conftest.py index d41114e..26dfec9 100644 --- 
a/tests/conftest.py +++ b/tests/conftest.py @@ -112,7 +112,7 @@ def sample_submission_data(sample_hashes, sample_clip_embedding): "submission_id": "SUB-001", "student_id": "ST001", "assign_id": "A001", - "img_url": "https://example.com/image.jpg", + "submission_url": "https://example.com/image.jpg", "db_record_id": "123e4567-e89b-12d3-a456-426614174000", "hashes": sample_hashes, "clip_embedding": sample_clip_embedding, @@ -194,7 +194,7 @@ def mock_fetchrow_side_effect(*args, **kwargs): "submission_id": "SUB-001", "student_id": "ST001", "assign_id": "A001", - "img_url": "https://example.com/image.jpg", + "submission_url": "https://example.com/image.jpg", "is_plagiarized": False, "plag_source": None, "plag_confidence": None, diff --git a/tests/test_db_manager.py b/tests/test_db_manager.py index cedcc89..1f56453 100644 --- a/tests/test_db_manager.py +++ b/tests/test_db_manager.py @@ -178,7 +178,9 @@ async def test_insert_submission_success(self): submission_id="SUB-001", student_id="ST001", assign_id="A001", - img_url="http://example.com/image.jpg", + submission_url="http://example.com/image.jpg", + submission_type="image", + submission_text=None, hashes={"phash": "abc", "dhash": "def", "ahash": "ghi"}, clip_embedding=[0.1] * 512, ) diff --git a/tests/test_processors.py b/tests/test_processors.py index 49c9b00..4ba6271 100644 --- a/tests/test_processors.py +++ b/tests/test_processors.py @@ -1,5 +1,5 @@ """ -Unit tests for image_processor and text_processor. +Unit tests for ImageProcessor. 
""" import pytest @@ -7,7 +7,6 @@ import json from processors.image_processor import ImageProcessor -from processors.text_processor import TextProcessor class TestImageProcessor: @@ -60,15 +59,15 @@ async def test_process_no_image_url(self, image_processor): result = await image_processor.process(data) - assert result == {"error": "No image URL provided"} + assert result == {"error": "No submission_url provided"} @pytest.mark.asyncio - async def test_process_with_img_url(self, image_processor, mock_image_worker): - """Test processing submission with img_url.""" + async def test_process_with_submission_url(self, image_processor, mock_image_worker): + """Test processing submission with submission_url.""" data = { "submission_id": "SUB-001", "student_id": "ST001", - "img_url": "https://example.com/image.jpg", + "submission_url": "https://example.com/image.jpg", } expected_result = { @@ -89,7 +88,7 @@ async def test_process_with_image_url(self, image_processor, mock_image_worker): data = { "submission_id": "SUB-002", "student_id": "ST002", - "image_url": "https://example.com/photo.png", + "submission_url": "https://example.com/photo.png", } expected_result = { @@ -111,7 +110,7 @@ async def test_process_worker_returns_none( """Test when worker returns None.""" data = { "submission_id": "SUB-003", - "img_url": "https://example.com/image.jpg", + "submission_url": "https://example.com/image.jpg", } mock_image_worker.process_submission.return_value = None @@ -127,7 +126,7 @@ async def test_process_worker_returns_json_string( """Test when worker returns JSON string.""" data = { "submission_id": "SUB-004", - "img_url": "https://example.com/image.jpg", + "submission_url": "https://example.com/image.jpg", } expected_result = { @@ -148,7 +147,7 @@ async def test_process_worker_returns_invalid_json( """Test when worker returns invalid JSON string.""" data = { "submission_id": "SUB-005", - "img_url": "https://example.com/image.jpg", + "submission_url": 
"https://example.com/image.jpg", } mock_image_worker.process_submission.return_value = "not valid json {{" @@ -165,7 +164,7 @@ async def test_process_worker_raises_exception( """Test when worker raises exception.""" data = { "submission_id": "SUB-006", - "img_url": "https://example.com/image.jpg", + "submission_url": "https://example.com/image.jpg", } mock_image_worker.process_submission.side_effect = RuntimeError( @@ -186,7 +185,7 @@ async def test_process_multiple_submissions( submissions = [ { "submission_id": f"SUB-{i}", - "img_url": f"https://example.com/image{i}.jpg", + "submission_url": f"https://example.com/image{i}.jpg", } for i in range(3) ] @@ -205,68 +204,3 @@ async def test_process_multiple_submissions( assert results[1]["similarity_score"] == 0.6 assert results[2]["similarity_score"] == 0.7 - -class TestTextProcessor: - """Test cases for TextProcessor.""" - - @pytest.fixture - def text_processor(self): - """Create TextProcessor instance.""" - return TextProcessor() - - @pytest.mark.asyncio - async def test_process_text_submission(self, text_processor): - """Test processing text submission.""" - data = { - "submission_id": "TEXT-001", - "student_id": "ST001", - "payload": "This is some text to check for plagiarism.", - } - - result = await text_processor.process(data) - - assert "similarity_score" in result - assert "matched_sources" in result - assert isinstance(result["similarity_score"], (int, float)) - assert isinstance(result["matched_sources"], list) - - @pytest.mark.asyncio - async def test_process_empty_text(self, text_processor): - """Test processing empty text.""" - data = {"submission_id": "TEXT-002", "payload": ""} - - result = await text_processor.process(data) - - assert "similarity_score" in result - assert "matched_sources" in result - - @pytest.mark.asyncio - async def test_process_missing_payload(self, text_processor): - """Test processing submission without payload field.""" - data = {"submission_id": "TEXT-003"} - - result = await 
text_processor.process(data) - - # Should handle missing payload gracefully - assert "similarity_score" in result - assert "matched_sources" in result - - @pytest.mark.asyncio - async def test_process_returns_expected_format(self, text_processor): - """Test that result has expected format.""" - data = {"payload": "Sample text"} - - result = await text_processor.process(data) - - # Verify structure - assert isinstance(result, dict) - assert "similarity_score" in result - assert "matched_sources" in result - - # Verify types - assert isinstance(result["similarity_score"], (int, float)) - assert isinstance(result["matched_sources"], list) - - # Verify reasonable values (based on dummy implementation) - assert 0 <= result["similarity_score"] <= 100 - assert all(isinstance(source, str) for source in result["matched_sources"]) diff --git a/tests/test_submission_checker.py b/tests/test_submission_checker.py index 062400a..92fa300 100644 --- a/tests/test_submission_checker.py +++ b/tests/test_submission_checker.py @@ -163,7 +163,7 @@ async def test_initialize(self, submission_checker, mock_db_manager): async def test_get_processor_for_image(self, submission_checker): """Test getting processor for image submission.""" submission_checker.image_processor = MagicMock() - data = {"img_url": "https://example.com/image.jpg"} + data = {"submission_type": "image", "submission_url": "https://example.com/image.jpg"} processor = submission_checker.get_processor(data) @@ -172,16 +172,15 @@ async def test_get_processor_for_image(self, submission_checker): @pytest.mark.asyncio async def test_get_processor_for_text(self, submission_checker): """Test getting processor for text submission.""" - data = {"text": "Some text content"} + data = {"submission_type": "text", "submission_text": "Some text content"} - processor = submission_checker.get_processor(data) - - assert processor is submission_checker.text_processor + with pytest.raises(ValueError, match="Unsupported submission_type"): + 
submission_checker.get_processor(data) @pytest.mark.asyncio async def test_get_processor_invalid_format(self, submission_checker): """Test getting processor with invalid format raises error.""" - data = {"submission_id": "SUB-001"} # No img_url or text + data = {"submission_id": "SUB-001"} # No submission_type or submission_url with pytest.raises(ValueError, match="Invalid submission format"): submission_checker.get_processor(data) @@ -190,7 +189,7 @@ async def test_get_processor_invalid_format(self, submission_checker): async def test_get_processor_not_initialized(self, submission_checker): """Test getting processor when not initialized raises error.""" submission_checker.image_processor = None - data = {"img_url": "https://example.com/image.jpg"} + data = {"submission_type": "image", "submission_url": "https://example.com/image.jpg"} with pytest.raises(RuntimeError, match="not initialized"): submission_checker.get_processor(data) @@ -216,7 +215,8 @@ async def test_process_submission_success( "submission_id": "SUB-001", "student_id": "ST001", "assign_id": "A001", - "img_url": "https://example.com/image.jpg", + "submission_type": "image", + "submission_url": "https://example.com/image.jpg", } ).encode() mock_message.ack = AsyncMock() @@ -233,23 +233,57 @@ async def test_process_submission_success( submission_checker.client.publish_message.assert_called_once() @pytest.mark.asyncio - async def test_process_submission_no_image_url( + async def test_process_submission_non_image_skips_plagiarism( + self, submission_checker, mock_db_manager + ): + """Test non-image submissions skip plagiarism and are published as original.""" + submission_checker.image_processor = MagicMock() + + mock_message = MagicMock() + mock_message.body = json.dumps( + { + "submission_id": "SUB-006", + "student_id": "ST006", + "assign_id": "A006", + "submission_type": "text", + "submission_text": "Hello WHO ARE YOU?", + } + ).encode() + mock_message.ack = AsyncMock() + mock_message.redelivered = 
False + mock_message.headers = {"x-delivery-count": 0} + + await submission_checker.process_submission(mock_message) + + submission_checker.image_processor.process.assert_not_called() + mock_db_manager.update_result.assert_called_once() + submission_checker.client.publish_message.assert_called_once() + mock_message.ack.assert_called_once() + + @pytest.mark.asyncio + async def test_process_submission_text_submission_without_url( self, submission_checker, mock_db_manager ): - """Test processing submission without image URL.""" + """Test processing a text submission without submission_url.""" mock_message = MagicMock() mock_message.body = json.dumps( - {"submission_id": "SUB-002", "student_id": "ST002"} + { + "submission_id": "SUB-002", + "student_id": "ST002", + "assign_id": "A002", + "submission_type": "text", + "submission_text": "Hello", + } ).encode() + mock_message.ack = AsyncMock() mock_message.nack = AsyncMock() mock_message.redelivered = False mock_message.headers = {} await submission_checker.process_submission(mock_message) - mock_db_manager.update_status.assert_called() - - mock_message.nack.assert_called_once_with(requeue=False) + mock_db_manager.update_result.assert_called_once() + mock_message.ack.assert_called_once() @pytest.mark.asyncio async def test_process_submission_invalid_json(self, submission_checker): @@ -257,11 +291,13 @@ async def test_process_submission_invalid_json(self, submission_checker): mock_message = MagicMock() mock_message.body = b"invalid json {{" mock_message.nack = AsyncMock() + mock_message.redelivered = False + mock_message.headers = {} await submission_checker.process_submission(mock_message) # Should nack without requeue - mock_message.nack.assert_called_once_with(requeue=False) + mock_message.nack.assert_called_once_with(requeue=False, submission_id="unknown") @pytest.mark.asyncio async def test_process_submission_retry_logic( @@ -280,16 +316,18 @@ async def test_process_submission_retry_logic( mock_message.body = 
json.dumps( { "submission_id": "SUB-003", - "img_url": "https://example.com/image.jpg", + "submission_type": "image", + "submission_url": "https://example.com/image.jpg", } ).encode() mock_message.nack = AsyncMock() mock_message.redelivered = False + mock_message.headers = {} await submission_checker.process_submission(mock_message) # Should nack with requeue (retry) - mock_message.nack.assert_called_once_with(requeue=True) + mock_message.nack.assert_called_once_with(requeue=True, submission_id="SUB-003") # Should update status with retry count mock_db_manager.update_status.assert_called() @@ -311,7 +349,8 @@ async def test_process_submission_max_retries( mock_message.body = json.dumps( { "submission_id": "SUB-004", - "img_url": "https://example.com/image.jpg", + "submission_type": "image", + "submission_url": "https://example.com/image.jpg", } ).encode() mock_message.nack = AsyncMock() @@ -322,7 +361,7 @@ async def test_process_submission_max_retries( submission_checker.client.publish_to_dlq.assert_called_once() - mock_message.nack.assert_called_once_with(requeue=False) + mock_message.nack.assert_called_once_with(requeue=False, submission_id="SUB-004") @pytest.mark.asyncio async def test_process_submission_poison_message( @@ -333,7 +372,7 @@ async def test_process_submission_poison_message( mock_message = MagicMock() mock_message.body = json.dumps( - {"submission_id": "SUB-005", "img_url": "http://example.com"} + {"submission_id": "SUB-005", "submission_type": "image", "submission_url": "http://example.com"} ).encode() mock_message.headers = {"x-delivery-count": 5} mock_message.redelivered = True @@ -343,7 +382,7 @@ async def test_process_submission_poison_message( # Should detect poison message and publish to DLQ submission_checker.client.publish_to_dlq.assert_called_once() - mock_message.nack.assert_called_once_with(requeue=False) + mock_message.nack.assert_called_once_with(requeue=False, submission_id="SUB-005") @pytest.mark.asyncio async def 
test_close_owns_resources( diff --git a/tests/test_worker.py b/tests/test_worker.py index 431c105..b761fb5 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -45,7 +45,7 @@ def sample_submission_data(): "submission_id": "SUB-TEST-001", "student_id": "ST-TEST-001", "assign_id": "A-TEST-001", - "img_url": "https://example.com/test.png", + "submission_url": "https://example.com/test.png", "db_record_id": 1, } @@ -222,7 +222,7 @@ async def test_process_submission_with_peer_check(self, mocked_worker, mock_db): "submission_id": "SUB-002", "student_id": "ST002", "assign_id": "A001", - "img_url": "https://example.com/test.png", + "submission_url": "https://example.com/test.png", "db_record_id": 2, } @@ -251,7 +251,7 @@ async def test_process_submission_with_self_check(self, mocked_worker, mock_db): "submission_id": "SUB-003", "student_id": "ST003", "assign_id": "A001", - "img_url": "https://example.com/test.png", + "submission_url": "https://example.com/test.png", "db_record_id": 3, } @@ -274,7 +274,7 @@ async def test_hash_computation(self, mocked_worker): "submission_id": "SUB-004", "student_id": "ST004", "assign_id": "A001", - "img_url": "https://example.com/test.png", + "submission_url": "https://example.com/test.png", "db_record_id": 4, } @@ -298,7 +298,7 @@ async def test_clip_embedding_generation(self, mocked_worker): "submission_id": "SUB-005", "student_id": "ST005", "assign_id": "A001", - "img_url": "https://example.com/test.png", + "submission_url": "https://example.com/test.png", "db_record_id": 5, } @@ -321,7 +321,7 @@ async def test_ai_detection_called(self, mocked_worker): "submission_id": "SUB-006", "student_id": "ST006", "assign_id": "A001", - "img_url": "https://example.com/test.png", + "submission_url": "https://example.com/test.png", "db_record_id": 6, } @@ -344,7 +344,7 @@ async def test_image_validation_called(self, mocked_worker): "submission_id": "SUB-007", "student_id": "ST007", "assign_id": "A001", - "img_url": 
"https://example.com/test.png", + "submission_url": "https://example.com/test.png", "db_record_id": 7, } From ab5ca7530db692ea01fc9994231b12fa185e8c0e Mon Sep 17 00:00:00 2001 From: Manu Date: Mon, 27 Apr 2026 10:59:34 +0530 Subject: [PATCH 09/13] gcp authentication --- .env.example | 4 + config/config.py | 14 +++ image_worker/gcs_client.py | 231 +++++++++++++++++++++++++++++++++++++ image_worker/worker.py | 91 +++++++++------ requirements.txt | 4 + utils/exceptions.py | 6 + 6 files changed, 313 insertions(+), 37 deletions(-) create mode 100644 image_worker/gcs_client.py diff --git a/.env.example b/.env.example index 63da271..4842182 100644 --- a/.env.example +++ b/.env.example @@ -16,6 +16,10 @@ FEEDBACK_QUEUE=plagiarism_feedback # Dead Letter Queue (optional - leave empty to disable) DEAD_LETTER_QUEUE=plagiarism_failed_submissions +#GCP CONFIGURATION +GCP_ENABLED=true +GCP_KEY_PATH=/app/credentials/gcp_service_account.json + # POSTGRESQL CONFIGURATION POSTGRES_HOST=postgres POSTGRES_PORT=5432 diff --git a/config/config.py b/config/config.py index 63060ff..a805c93 100644 --- a/config/config.py +++ b/config/config.py @@ -215,6 +215,19 @@ class Config: case_sensitive = False +class GCPConfig(BaseSettings): + """GCP Cloud Storage configuration for authenticated image downloads.""" + + gcp_enabled: bool = Field(default=True, env="GCP_ENABLED") + gcp_key_path: str = Field( + default="", env="GCP_KEY_PATH", description="Path to GCP service account JSON key file" + ) + + class Config: + env_file = ".env" + case_sensitive = False + + class LoggingConfig(BaseSettings): """Logging configuration.""" @@ -239,6 +252,7 @@ class AppConfig(BaseSettings): detection: DetectionConfig = DetectionConfig() vector_search: VectorSearchConfig = VectorSearchConfig() image_processing: ImageProcessingConfig = ImageProcessingConfig() + gcp: GCPConfig = GCPConfig() logging: LoggingConfig = LoggingConfig() app_name: str = Field(default="MentorMe Plagiarism Detection", env="APP_NAME") diff --git 
a/image_worker/gcs_client.py b/image_worker/gcs_client.py new file mode 100644 index 0000000..96d2a03 --- /dev/null +++ b/image_worker/gcs_client.py @@ -0,0 +1,231 @@ +""" +GCP Cloud Storage (GCS) client helper functions for authenticated image downloads. + +Provides async-compatible methods to download images from GCS buckets using +service account authentication via JSON key files. +""" + +import logging +import asyncio +import json +from io import BytesIO +from pathlib import Path +from typing import Optional +from PIL import Image +from google.cloud import storage +from google.oauth2 import service_account + +logger = logging.getLogger(__name__) + + +def load_gcp_credentials(key_path: str) -> Optional[service_account.Credentials]: + """ + Load GCP service account credentials from JSON key file. + + Args: + key_path: Path to the GCP service account JSON key file + + Returns: + service_account.Credentials object, or None if loading fails + + Raises: + FileNotFoundError: If key file does not exist + ValueError: If key file is invalid JSON or missing required fields + """ + try: + key_file = Path(key_path) + if not key_file.exists(): + raise FileNotFoundError(f"GCP key file not found: {key_path}") + + with open(key_file, "r") as f: + key_data = json.load(f) + + credentials = service_account.Credentials.from_service_account_info(key_data) + logger.debug(f"GCP credentials loaded successfully from: {key_path}") + return credentials + + except FileNotFoundError as e: + logger.error(f"GCP key file error: {e}") + raise + except json.JSONDecodeError as e: + logger.error(f"Invalid JSON in GCP key file: {e}") + raise ValueError(f"Invalid JSON in GCP key file: {e}") + except Exception as e: + logger.error(f"Failed to load GCP credentials: {e}") + raise + + +def parse_gcs_url(gcs_url: str) -> tuple[str, str]: + """ + Parse a GCS URL into bucket name and blob path. 
+ + Args: + gcs_url: GCS URL in format gs://bucket-name/path/to/object + + Returns: + Tuple of (bucket_name, blob_path) + + Raises: + ValueError: If URL is not a valid GCS URL + """ + if not gcs_url.startswith("gs://"): + raise ValueError(f"Invalid GCS URL: {gcs_url}. Must start with 'gs://'") + + # Remove gs:// prefix + path = gcs_url[5:] + + # Split bucket name and blob path + parts = path.split("/", 1) + if len(parts) < 2: + raise ValueError(f"Invalid GCS URL: {gcs_url}. Must include bucket and blob path") + + bucket_name = parts[0] + blob_path = parts[1] + + if not bucket_name or not blob_path: + raise ValueError(f"Invalid GCS URL: {gcs_url}. Bucket and blob path cannot be empty") + + return bucket_name, blob_path + + +def create_gcs_client(credentials: service_account.Credentials) -> storage.Client: + """ + Create a GCP Storage client with provided credentials. + + Args: + credentials: GCP service account credentials + + Returns: + google.cloud.storage.Client instance + + Raises: + Exception: If client creation fails + """ + try: + client = storage.Client(credentials=credentials) + logger.debug("GCS client created successfully") + return client + except Exception as e: + logger.error(f"Failed to create GCS client: {e}") + raise + + +async def download_from_gcs( + gcs_url: str, + credentials: service_account.Credentials, + timeout: int = 30, +) -> Image.Image: + """ + Download an image from GCS bucket asynchronously. 
+ + Args: + gcs_url: GCS URL in format gs://bucket-name/path/to/image + credentials: GCP service account credentials + timeout: Download timeout in seconds + + Returns: + PIL Image object + + Raises: + ValueError: If URL is invalid or credentials missing + FileNotFoundError: If bucket or blob not found + Exception: For other GCS operation failures + """ + try: + bucket_name, blob_path = parse_gcs_url(gcs_url) + logger.debug( + f"Downloading from GCS: bucket={bucket_name}, blob={blob_path}, timeout={timeout}s" + ) + + # Run blocking GCS operations in executor to avoid blocking event loop + loop = asyncio.get_event_loop() + image = await loop.run_in_executor( + None, + _download_gcs_blob_sync, + gcs_url, + credentials, + bucket_name, + blob_path, + ) + + logger.info(f"Successfully downloaded image from GCS: {gcs_url}") + return image + + except Exception as e: + logger.error(f"GCS download failed: url={gcs_url}, error={e}", exc_info=True) + raise + + +def _download_gcs_blob_sync( + gcs_url: str, + credentials: service_account.Credentials, + bucket_name: str, + blob_path: str, +) -> Image.Image: + """ + Synchronous helper to download blob from GCS (runs in executor). 
+ + Args: + gcs_url: Full GCS URL for logging + credentials: GCP service account credentials + bucket_name: Name of the GCS bucket + blob_path: Path to the blob within the bucket + + Returns: + PIL Image object + + Raises: + FileNotFoundError: If bucket or blob not found + Exception: For other GCS operation failures + """ + try: + client = create_gcs_client(credentials) + bucket = client.bucket(bucket_name) + + # Check if bucket exists + if not bucket.exists(): + raise FileNotFoundError(f"GCS bucket not found: {bucket_name}") + + blob = bucket.blob(blob_path) + + # Check if blob exists + if not blob.exists(): + raise FileNotFoundError( + f"GCS blob not found: gs://{bucket_name}/{blob_path}" + ) + + # Download blob content + blob_content = blob.download_as_bytes() + + # Parse image + try: + image = Image.open(BytesIO(blob_content)) + logger.debug( + f"Image parsed successfully: format={image.format}, size={image.size}" + ) + return image + except Exception as img_error: + raise ValueError( + f"Invalid or corrupted image format from GCS: {str(img_error)}" + ) + + except FileNotFoundError: + raise + except Exception as e: + logger.error( + f"GCS blob download error: bucket={bucket_name}, blob={blob_path}, error={e}" + ) + raise + + +def is_gcs_url(url: str) -> bool: + """ + Check if a URL is a GCS URL. 
+ + Args: + url: URL to check + + Returns: + True if URL starts with 'gs://', False otherwise + """ + return url.startswith("gs://") if url else False diff --git a/image_worker/worker.py b/image_worker/worker.py index 0e5aa90..dffbb67 100644 --- a/image_worker/worker.py +++ b/image_worker/worker.py @@ -12,6 +12,7 @@ from urllib.parse import urlparse from dotenv import load_dotenv from image_worker.assigment_ref_images import get_reference_images +from image_worker.gcs_client import is_gcs_url, download_from_gcs, load_gcp_credentials from config.config import config from database.db_manager import DatabaseManager @@ -28,6 +29,7 @@ NetworkTimeoutError, InvalidImageURLError, InvalidImageFormatError, + GCSDownloadError, ) load_dotenv() @@ -87,6 +89,20 @@ def __init__(self, db_manager=None): ) self.vector_handler = None + self.gcp_credentials = None + + # Initialize GCP credentials if enabled + if config.gcp.gcp_enabled: + try: + self.gcp_credentials = load_gcp_credentials(config.gcp.gcp_key_path) + logger.info(f"GCP credentials loaded from: {config.gcp.gcp_key_path}") + except Exception as e: + logger.error(f"Failed to load GCP credentials: {e}") + if config.environment == "production": + raise GCSDownloadError( + f"GCP authentication failed: {e}", + details={"key_path": config.gcp.gcp_key_path}, + ) if not self.use_pgvector: self.vector_handler = FAISSHandler( @@ -123,8 +139,10 @@ async def download_image(self, image_url: str) -> Image.Image: """ Download image from URL with timeout and validation. + Supports both public HTTP/HTTPS URLs and authenticated GCS bucket URLs (gs://). 
+ Args: - image_url: HTTP/HTTPS URL of the image + image_url: HTTP/HTTPS URL or GCS URL (gs://bucket-name/path/to/image) Returns: PIL Image object @@ -134,7 +152,41 @@ async def download_image(self, image_url: str) -> Image.Image: NetworkTimeoutError: If download times out InvalidImageFormatError: If file is not a valid image ImageDownloadError: For other download failures + GCSDownloadError: For GCS authentication, bucket, blob, or download failures """ + # Check if URL is a GCS URL + if is_gcs_url(image_url): + if not self.gcp_credentials: + raise GCSDownloadError( + "GCP credentials not initialized. Set GCP_ENABLED=true and GCP_KEY_PATH in .env", + details={"url": image_url}, + ) + + logger.debug(f"Downloading from GCS: url={image_url}") + try: + image = await download_from_gcs( + image_url, + self.gcp_credentials, + timeout=self.DOWNLOAD_TIMEOUT, + ) + return image + except FileNotFoundError as e: + raise GCSDownloadError( + f"GCS bucket or blob not found: {str(e)}", + details={"url": image_url}, + ) + except ValueError as e: + raise GCSDownloadError( + f"Invalid or corrupted image from GCS: {str(e)}", + details={"url": image_url}, + ) + except Exception as e: + raise GCSDownloadError( + f"GCS download failed: {str(e)}", + details={"url": image_url, "error": str(e)}, + ) + + # Handle HTTP/HTTPS URLs max_retries = self.DOWNLOAD_RETRIES for attempt in range(max_retries): try: @@ -553,19 +605,7 @@ async def process_submission(self, data: Dict[str, Any]) -> Optional[str]: logger.info(f"Processing submission: {submission_id}") - # Check for video URLs before attempting to download - media_type = self.image_validator.detect_media_type(submission_url) - if media_type == "video": - logger.warning( - f"Video URL rejected: submission={submission_id}, url={submission_url}" - ) - video_result = self._create_video_url_result( - submission_id, student_id, assign_id, submission_url - ) - processing_time_ms = int((time.time() - start_time) * 1000) - return 
json.dumps(video_result) - else: - image_url = submission_url + image_url = submission_url # Check for stock image URLs before downloading is_stock, stock_site = self.image_validator.check_stock_image_url(image_url) @@ -1114,29 +1154,6 @@ def _create_stock_image_result( "plagiarism_source": f"stock_image_{stock_site}", "similar_sources": [{"source": stock_site, "url": image_url}], } - - def _create_video_url_result( - self, - submission_id: str, - student_id: str, - assign_id: str, - submission_url: str - ) -> dict: - """Create video URL detection result dictionary.""" - return { - "submission_id": submission_id, - "student_id": student_id, - "assignment_id": assign_id, - "image_url": submission_url, - "is_ai_generated": False, - "ai_detection_source": "None", - "ai_confidence": 0.0, - "is_plagiarized": False, - "similarity_score": 1.0, - "match_type": "original", - "plagiarism_source": None, - "similar_sources": None, - } async def _build_reference_result( self, diff --git a/requirements.txt b/requirements.txt index 4e590f1..a415d36 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,10 @@ aiohappyeyeballs==2.6.1 aiosignal==1.4.0 requests==2.32.3 +# GCP Cloud Storage +google-cloud-storage==2.10.0 +google-auth==2.27.0 + # Configuration Management (Pydantic v2 compatible) pydantic==1.10.24 python-dotenv==1.2.1 diff --git a/utils/exceptions.py b/utils/exceptions.py index df0a713..a0b4406 100644 --- a/utils/exceptions.py +++ b/utils/exceptions.py @@ -48,6 +48,12 @@ class InvalidImageFormatError(ImageProcessingError): pass +class GCSDownloadError(ImageDownloadError): + """Failed to download image from GCS bucket (includes auth, bucket, and download errors).""" + + pass + + class ImageTooLargeError(ImageProcessingError): """Image exceeds maximum allowed size.""" From 70147aa8fdd39eca7ec31250adafd1f9f1fb9bfd Mon Sep 17 00:00:00 2001 From: Manu Date: Fri, 1 May 2026 11:50:56 +0000 Subject: [PATCH 10/13] postgres db --- docker-postgres.yml | 45 
+++++++++++++++++ pg_setup.sh | 118 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 docker-postgres.yml create mode 100755 pg_setup.sh diff --git a/docker-postgres.yml b/docker-postgres.yml new file mode 100644 index 0000000..28f0cab --- /dev/null +++ b/docker-postgres.yml @@ -0,0 +1,45 @@ +version: '3.8' + +services: + # =================================== + # POSTGRESQL - Database + # =================================== + postgres: + image: docker.io/pgvector/pgvector:pg16 + container_name: plg-postgresdb + ports: + - "5432:5432" + environment: + POSTGRES_DB: plagiarism_db + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_INITDB_ARGS: "-E UTF8" + POSTGRES_MAX_CONNECTIONS: 20 + PGDATA: /var/lib/postgresql/data/pgdata + volumes: + - postgres_data:/var/lib/postgresql/data + - ./database/init.sql:/docker-entrypoint-initdb.d/init.sql + healthcheck: + test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + deploy: + resources: + limits: + cpus: '0.5' + memory: 512M + reservations: + cpus: '0.25' + memory: 256M + restart: always + + +# =================================== +# VOLUMES +# =================================== +volumes: + postgres_data: + driver: local + diff --git a/pg_setup.sh b/pg_setup.sh new file mode 100755 index 0000000..7172f5a --- /dev/null +++ b/pg_setup.sh @@ -0,0 +1,118 @@ +COMPOSE_FILE="docker-postgres.yml" +COMPOSE_CMD="podman-compose" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +GRAY='\033[0;37m' +NC='\033[0m' + + +# Load configuration from .env if it exists +if [ -f ".env" ]; then + set -a + source <(grep -v '^#' .env | grep -v '^$' | sed 's/\r$//') + set +a +fi + +# Configuration (with defaults) +POSTGRES_CONTAINER="plg-postgres" +POSTGRES_PORT="${POSTGRES_PORT:-5432}" +POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-postgres}" 
+POSTGRES_DB="${POSTGRES_DB:-plagiarism_db}" +POSTGRES_USER="${POSTGRES_USER:-postgres}" + +$COMPOSE_CMD -f $COMPOSE_FILE up -d + +wait_for_postgres() { + echo -e "${YELLOW}Waiting for PostgreSQL...${NC}" + + local max_attempts=30 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + + if $CONTAINER_CMD exec $POSTGRES_CONTAINER pg_isready -U $POSTGRES_USER &>/dev/null; then + echo -e "${GREEN}[OK] PostgreSQL ready${NC}" + return 0 + fi + + sleep 2 + done + + echo -e "${RED}ERROR: PostgreSQL timeout${NC}" + exit 1 +} + +initialize_database() { + if [ ! -f "database/init.sql" ]; then + echo -e "${YELLOW}WARNING: database/init.sql not found${NC}" + return + fi + + echo -e "${CYAN}Initializing database...${NC}" + + # Run init.sql + if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < database/init.sql 2>/dev/null; then + echo -e "${GREEN}[OK] Database schema initialized${NC}" + else + echo -e "${GRAY}Database schema already exists${NC}" + fi + + # Create migrations tracking table if it doesn't exist + $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " + CREATE TABLE IF NOT EXISTS schema_migrations ( + id SERIAL PRIMARY KEY, + migration_name VARCHAR(255) UNIQUE NOT NULL, + applied_at TIMESTAMP DEFAULT NOW() + ); + " 2>/dev/null + + # Run migration scripts + if [ -d "database/migrations" ]; then + local migration_count=0 + for migration_file in database/migrations/*.sql; do + if [ -f "$migration_file" ]; then + local migration_name=$(basename "$migration_file") + + # Check if migration already applied + local already_applied=$($CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -t -c " + SELECT COUNT(*) FROM schema_migrations WHERE migration_name = '$migration_name'; + " 2>/dev/null | tr -d '[:space:]') + + if [ "$already_applied" = "0" ]; then + echo -e "${CYAN}Applying migration: $migration_name${NC}" + + if $CONTAINER_CMD exec -i 
$POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < "$migration_file" 2>/dev/null; then + # Record migration as applied + $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " + INSERT INTO schema_migrations (migration_name) VALUES ('$migration_name'); + " 2>/dev/null + echo -e "${GREEN}[OK] Applied: $migration_name${NC}" + migration_count=$((migration_count + 1)) + else + echo -e "${YELLOW}WARNING: Failed to apply $migration_name${NC}" + fi + fi + fi + done + + if [ $migration_count -eq 0 ]; then + echo -e "${GRAY}All migrations already applied${NC}" + else + echo -e "${GREEN}[OK] Applied $migration_count migration(s)${NC}" + fi + fi +} + +main() { + wait_for_postgres + initialize_database + echo -e "\n${GRAY}Containers running in background. Press Ctrl+C to exit this script.${NC}" +} + +main +echo "Setup Complete" From ed31598b1135691da203ba88aa79d8ffb1a9af85 Mon Sep 17 00:00:00 2001 From: Manu Date: Sat, 2 May 2026 15:16:29 +0530 Subject: [PATCH 11/13] GCS image download --- .gitignore | 3 + api/api.py | 53 +++++---- app.py | 42 +++++--- image_worker/gcs_client.py | 70 ++++++++---- image_worker/worker.py | 161 +++++++--------------------- mq/rmq_client.py | 10 +- plag_checker/submissions_checker.py | 42 ++++---- processors/image_processor.py | 4 +- requirements.txt | 3 + tests/test_rmq_client.py | 2 + tests/test_submission_checker.py | 12 +-- tests/test_worker.py | 33 ++++++ 12 files changed, 219 insertions(+), 216 deletions(-) diff --git a/.gitignore b/.gitignore index cf0db6d..cdbb287 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ + # Credentials + app/credentials/* + scratch* # Python diff --git a/api/api.py b/api/api.py index 42b12ba..35e8d50 100644 --- a/api/api.py +++ b/api/api.py @@ -8,29 +8,36 @@ from fastapi import FastAPI, HTTPException, status from fastapi.responses import JSONResponse from pydantic import BaseModel, Field, validator -import aio_pika -import json -import uuid -from dotenv import 
load_dotenv - -from dotenv import load_dotenv -import os - -sys.path.insert(0, str(Path(__file__).parent.parent)) -from utils.security import safe_hash_student_id - -# load_dotenv() - -# Configure logging -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" -) -logger = logging.getLogger(__name__) - -load_dotenv() - -# RabbitMQ Configuration -RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "localhost") +import aio_pika +import json +import uuid +from dotenv import load_dotenv +import os + +sys.path.insert(0, str(Path(__file__).parent.parent)) +from utils.security import safe_hash_student_id + +load_dotenv() + +DEFAULT_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + + +def get_log_format() -> str: + log_format = os.getenv("LOG_FORMAT", DEFAULT_LOG_FORMAT) + if log_format.lower() == "json": + return DEFAULT_LOG_FORMAT + return log_format + + +# Configure logging +logging.basicConfig( + level=getattr(logging, os.getenv("LOG_LEVEL", "INFO").upper(), logging.INFO), + format=get_log_format(), +) +logger = logging.getLogger(__name__) + +# RabbitMQ Configuration +RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "localhost") RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", "5672") RABBITMQ_VHOST = os.getenv("RABBITMQ_VHOST", "/") RABBITMQ_USER = os.getenv("RABBITMQ_USER", "admin") diff --git a/app.py b/app.py index 344e037..4b9cb7d 100644 --- a/app.py +++ b/app.py @@ -1,20 +1,32 @@ import asyncio import logging -import signal -import sys -import os -from mq.rmq_client import RabbitMQClient -from plag_checker.submissions_checker import SubmissionChecker -from dotenv import load_dotenv - -__version__ = "1.0.0" - -# Configure logging -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" -) -logger = logging.getLogger(__name__) -load_dotenv() +import signal +import sys +import os +from dotenv import load_dotenv +from mq.rmq_client import RabbitMQClient +from 
plag_checker.submissions_checker import SubmissionChecker + +__version__ = "1.0.0" + +load_dotenv() + +DEFAULT_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + + +def get_log_format() -> str: + log_format = os.getenv("LOG_FORMAT", DEFAULT_LOG_FORMAT) + if log_format.lower() == "json": + return DEFAULT_LOG_FORMAT + return log_format + + +# Configure logging +logging.basicConfig( + level=getattr(logging, os.getenv("LOG_LEVEL", "INFO").upper(), logging.INFO), + format=get_log_format(), +) +logger = logging.getLogger(__name__) def validate_configuration(): diff --git a/image_worker/gcs_client.py b/image_worker/gcs_client.py index 96d2a03..29bda27 100644 --- a/image_worker/gcs_client.py +++ b/image_worker/gcs_client.py @@ -11,13 +11,35 @@ from io import BytesIO from pathlib import Path from typing import Optional +from urllib.parse import urlparse from PIL import Image from google.cloud import storage from google.oauth2 import service_account +from google.api_core.exceptions import NotFound logger = logging.getLogger(__name__) +def resolve_gcp_key_path(key_path: str) -> Path: + """Resolve the configured GCP key path for container and local runs.""" + configured_path = Path(key_path).expanduser() + + if configured_path.exists(): + return configured_path + + project_root = Path(__file__).resolve().parent.parent + + if configured_path.is_absolute(): + try: + local_path = project_root / "app" / configured_path.relative_to("/app") + except ValueError: + local_path = configured_path + else: + local_path = project_root / configured_path + + return local_path + + def load_gcp_credentials(key_path: str) -> Optional[service_account.Credentials]: """ Load GCP service account credentials from JSON key file. 
@@ -33,15 +55,15 @@ def load_gcp_credentials(key_path: str) -> Optional[service_account.Credentials] ValueError: If key file is invalid JSON or missing required fields """ try: - key_file = Path(key_path) + key_file = resolve_gcp_key_path(key_path) if not key_file.exists(): - raise FileNotFoundError(f"GCP key file not found: {key_path}") + raise FileNotFoundError(f"GCP key file not found: {key_file}") with open(key_file, "r") as f: key_data = json.load(f) credentials = service_account.Credentials.from_service_account_info(key_data) - logger.debug(f"GCP credentials loaded successfully from: {key_path}") + logger.debug(f"GCP credentials loaded successfully from: {key_file}") return credentials except FileNotFoundError as e: @@ -68,11 +90,17 @@ def parse_gcs_url(gcs_url: str) -> tuple[str, str]: Raises: ValueError: If URL is not a valid GCS URL """ - if not gcs_url.startswith("gs://"): - raise ValueError(f"Invalid GCS URL: {gcs_url}. Must start with 'gs://'") - - # Remove gs:// prefix - path = gcs_url[5:] + if gcs_url.startswith("gs://"): + path = gcs_url[5:] + elif gcs_url.startswith("https://storage.googleapis.com/"): + # support public and authenticated GCS object URLs + parsed = urlparse(gcs_url) + path = parsed.path.lstrip("/") + elif gcs_url.startswith("http://storage.googleapis.com/"): + parsed = urlparse(gcs_url) + path = parsed.path.lstrip("/") + else: + raise ValueError(f"Invalid GCS URL: {gcs_url}. Must start with 'gs://' or 'https://storage.googleapis.com/'") # Split bucket name and blob path parts = path.split("/", 1) @@ -146,6 +174,7 @@ async def download_from_gcs( credentials, bucket_name, blob_path, + timeout, ) logger.info(f"Successfully downloaded image from GCS: {gcs_url}") @@ -161,6 +190,7 @@ def _download_gcs_blob_sync( credentials: service_account.Credentials, bucket_name: str, blob_path: str, + timeout: int, ) -> Image.Image: """ Synchronous helper to download blob from GCS (runs in executor). 
@@ -170,6 +200,7 @@ def _download_gcs_blob_sync( credentials: GCP service account credentials bucket_name: Name of the GCS bucket blob_path: Path to the blob within the bucket + timeout: Download timeout in seconds Returns: PIL Image object @@ -181,22 +212,15 @@ def _download_gcs_blob_sync( try: client = create_gcs_client(credentials) bucket = client.bucket(bucket_name) - - # Check if bucket exists - if not bucket.exists(): - raise FileNotFoundError(f"GCS bucket not found: {bucket_name}") - blob = bucket.blob(blob_path) - # Check if blob exists - if not blob.exists(): + try: + blob_content = blob.download_as_bytes(timeout=timeout) + except NotFound: raise FileNotFoundError( f"GCS blob not found: gs://{bucket_name}/{blob_path}" ) - # Download blob content - blob_content = blob.download_as_bytes() - # Parse image try: image = Image.open(BytesIO(blob_content)) @@ -226,6 +250,12 @@ def is_gcs_url(url: str) -> bool: url: URL to check Returns: - True if URL starts with 'gs://', False otherwise + True if URL is a GCS URL using 'gs://' or 'storage.googleapis.com', False otherwise """ - return url.startswith("gs://") if url else False + if not url: + return False + return ( + url.startswith("gs://") + or url.startswith("https://storage.googleapis.com/") + or url.startswith("http://storage.googleapis.com/") + ) diff --git a/image_worker/worker.py b/image_worker/worker.py index dffbb67..477def2 100644 --- a/image_worker/worker.py +++ b/image_worker/worker.py @@ -1,15 +1,11 @@ import json from datetime import datetime import logging -import aiohttp -import ssl from PIL import Image -from io import BytesIO import time import numpy as np import asyncio from typing import Dict, Optional, Tuple, Any -from urllib.parse import urlparse from dotenv import load_dotenv from image_worker.assigment_ref_images import get_reference_images from image_worker.gcs_client import is_gcs_url, download_from_gcs, load_gcp_credentials @@ -25,10 +21,7 @@ from utils.exceptions import ( 
WorkerNotInitializedError, ValidationError, - ImageDownloadError, - NetworkTimeoutError, InvalidImageURLError, - InvalidImageFormatError, GCSDownloadError, ) @@ -61,7 +54,6 @@ def __init__(self, db_manager=None): config.image_processing.max_image_height, ) self.DOWNLOAD_TIMEOUT = config.image_processing.download_timeout - self.DOWNLOAD_RETRIES = config.image_processing.download_retries self.use_pgvector = config.vector_search.use_pgvector self.faiss_top_k = config.vector_search.faiss_top_k @@ -137,135 +129,56 @@ async def close(self): async def download_image(self, image_url: str) -> Image.Image: """ - Download image from URL with timeout and validation. + Download image from a Google Cloud Storage URL. - Supports both public HTTP/HTTPS URLs and authenticated GCS bucket URLs (gs://). + Supports gs:// and storage.googleapis.com URLs via configured GCP + credentials. Args: - image_url: HTTP/HTTPS URL or GCS URL (gs://bucket-name/path/to/image) + image_url: GCS URL (gs://bucket-name/path/to/image or + https://storage.googleapis.com/bucket-name/path/to/image) Returns: PIL Image object Raises: InvalidImageURLError: If URL is malformed or invalid - NetworkTimeoutError: If download times out - InvalidImageFormatError: If file is not a valid image - ImageDownloadError: For other download failures GCSDownloadError: For GCS authentication, bucket, blob, or download failures """ - # Check if URL is a GCS URL - if is_gcs_url(image_url): - if not self.gcp_credentials: - raise GCSDownloadError( - "GCP credentials not initialized. 
Set GCP_ENABLED=true and GCP_KEY_PATH in .env", - details={"url": image_url}, - ) - - logger.debug(f"Downloading from GCS: url={image_url}") - try: - image = await download_from_gcs( - image_url, - self.gcp_credentials, - timeout=self.DOWNLOAD_TIMEOUT, - ) - return image - except FileNotFoundError as e: - raise GCSDownloadError( - f"GCS bucket or blob not found: {str(e)}", - details={"url": image_url}, - ) - except ValueError as e: - raise GCSDownloadError( - f"Invalid or corrupted image from GCS: {str(e)}", - details={"url": image_url}, - ) - except Exception as e: - raise GCSDownloadError( - f"GCS download failed: {str(e)}", - details={"url": image_url, "error": str(e)}, - ) - - # Handle HTTP/HTTPS URLs - max_retries = self.DOWNLOAD_RETRIES - for attempt in range(max_retries): - try: - parsed = urlparse(image_url) - if not parsed.scheme or not parsed.netloc: - raise InvalidImageURLError( - f"Invalid URL: {image_url}", - details={"url": image_url, "parsed": str(parsed)}, - ) - - logger.debug( - f"Downloading image: url={image_url[:80]}..., attempt={attempt + 1}/{max_retries}" - ) - - # Configure SSL context based on config - if config.image_processing.disable_ssl_verify: - ssl_context = ssl.create_default_context() - ssl_context.check_hostname = False - ssl_context.verify_mode = ssl.CERT_NONE - logger.debug("SSL verification disabled for image download") - connector = aiohttp.TCPConnector(ssl=ssl_context) - else: - connector = aiohttp.TCPConnector() - - async with aiohttp.ClientSession(connector=connector) as session: - async with session.get( - image_url, - timeout=aiohttp.ClientTimeout(total=self.DOWNLOAD_TIMEOUT), - ) as response: - response.raise_for_status() - content = await response.read() - - try: - image = Image.open(BytesIO(content)) - return image - except ValidationError: - raise - except Exception as img_error: - raise InvalidImageFormatError( - f"Invalid or corrupted image format: {str(img_error)}", - details={"url": image_url, "error": 
str(img_error)}, - ) - - except asyncio.TimeoutError: - logger.warning( - f"Download timeout: attempt={attempt + 1}/{max_retries}, " - f"timeout={self.DOWNLOAD_TIMEOUT}s" - ) - if attempt == max_retries - 1: - raise NetworkTimeoutError( - f"Download timed out after {max_retries} attempts", - details={ - "url": image_url, - "timeout": self.DOWNLOAD_TIMEOUT, - "retries": max_retries, - }, - ) - await asyncio.sleep(1) + if not is_gcs_url(image_url): + raise InvalidImageURLError( + f"Invalid GCS URL: {image_url}", + details={"url": image_url}, + ) - except aiohttp.ClientError as client_error: - logger.warning( - f"Download failed: attempt={attempt + 1}/{max_retries}, " - f"error={str(client_error)}" - ) - if attempt == max_retries - 1: - raise ImageDownloadError( - f"Failed to download image: {str(client_error)}", - details={ - "url": image_url, - "error": str(client_error), - "retries": max_retries, - }, - ) - await asyncio.sleep(1) + if not self.gcp_credentials: + raise GCSDownloadError( + "GCP credentials not initialized. 
Set GCP_ENABLED=true and GCP_KEY_PATH in .env", + details={"url": image_url}, + ) - raise ImageDownloadError( - f"Failed to download image after {max_retries} attempts", - details={"url": image_url, "retries": max_retries}, - ) + logger.debug(f"Downloading image from GCS: url={image_url}") + try: + return await download_from_gcs( + image_url, + self.gcp_credentials, + timeout=self.DOWNLOAD_TIMEOUT, + ) + except FileNotFoundError as e: + raise GCSDownloadError( + f"GCS bucket or blob not found: {str(e)}", + details={"url": image_url}, + ) + except ValueError as e: + raise GCSDownloadError( + f"Invalid or corrupted image from GCS: {str(e)}", + details={"url": image_url}, + ) + except Exception as e: + raise GCSDownloadError( + f"GCS download failed: {str(e)}", + details={"url": image_url, "error": str(e)}, + ) def _validate_input(self, data: Dict[str, Any]) -> Tuple[str, str, str, str, str]: """ diff --git a/mq/rmq_client.py b/mq/rmq_client.py index 1e4f31a..b3d73bd 100644 --- a/mq/rmq_client.py +++ b/mq/rmq_client.py @@ -157,13 +157,17 @@ async def publish_message(self, message_body): """Publish to feedback queue and handle failures.""" try: await self.channel.default_exchange.publish( - aio_pika.Message(body=json.dumps(message_body).encode()), + aio_pika.Message( + body=json.dumps(message_body).encode(), + delivery_mode=aio_pika.DeliveryMode.PERSISTENT, + ), routing_key=self.FEEDBACK_QUEUE, ) logger.info( f"Published submission {message_body.get('submission_id')} for user {message_body.get('student_id')}" ) - logger.info(f"Published message body: {message_body}") + logger.info(f"Published message body:") + logger.info(json.dumps(message_body, indent=2)) except asyncio.CancelledError as e: logger.warning("publish_message CancelledError") raise Exception("publish_message CancelledError") from e @@ -237,7 +241,7 @@ async def publish_to_dlq(self, message_body, reason: str = "Max retries exceeded async def start_consumer(self, callback): """Start consuming messages from 
the submission queue.""" await self.connect() - await self.submission_queue.consume(lambda msg: callback(msg)) + await self.submission_queue.consume(lambda msg: callback(msg), no_ack=False) async def close(self): """Close RabbitMQ connection and cancel pending tasks.""" diff --git a/plag_checker/submissions_checker.py b/plag_checker/submissions_checker.py index daef45e..9f04ee8 100644 --- a/plag_checker/submissions_checker.py +++ b/plag_checker/submissions_checker.py @@ -32,16 +32,14 @@ async def __aenter__(self): async def __aexit__(self, exc_type, exc_val, exc_tb): """Ensure message is acknowledged exactly once on exit.""" if not self.acked: - # Default behavior: reject without requeue on unhandled errors - logger.warning( - "Message not explicitly acknowledged, rejecting (requeue=False)" - ) + # Default behavior: requeue unhandled messages to avoid message loss. + logger.warning("Message not explicitly acknowledged, requeueing") try: - await self.message.reject(requeue=False) + await self.message.nack(requeue=True) self.acked = True - self.action = "reject(cleanup)" + self.action = "nack(cleanup, requeue=True)" except Exception as e: - logger.critical(f"CRITICAL: Failed to reject message in cleanup: {e}") + logger.critical(f"CRITICAL: Failed to nack message in cleanup: {e}") # DO NOT set self.acked = True here - let finally block handle it return False # Don't suppress exceptions @@ -123,9 +121,9 @@ async def initialize(self): async def process_submission(self, submission): if self._shutdown: - logger.warning("Shutdown in progress, rejecting message") + logger.warning("Shutdown in progress, requeueing message") try: - await submission.nack(requeue=False) + await submission.nack(requeue=True) except Exception as e: logger.error(f"Failed to nack message during shutdown: {e}") return @@ -187,8 +185,11 @@ async def process_submission(self, submission): ) if not dlq_success: logger.error( - f"Failed to send poison message to DLQ for submission {submission_id}" + 
f"Failed to send poison message to DLQ for submission {submission_id}; requeueing" ) + await submission.nack(requeue=True, submission_id=submission_id) + message_acked = True + return await submission.nack(requeue=False, submission_id=submission_id) message_acked = True @@ -267,15 +268,7 @@ async def process_submission(self, submission): logger.warning( f"Invalid result for submission {submission_id}: {parse_error}" ) - await self.db.update_status( - submission_id or "unknown", - SubmissionStatus.FAILED, - int(retry_count or 0), - f"Invalid processing result: {str(parse_error)}", - ) - await submission.nack(requeue=False) - message_acked = True - return + raise RuntimeError(f"Invalid processing result: {parse_error}") from parse_error data["similar_sources"] = result_text.get("similar_sources") data["similarity_score"] = result_text.get("similarity_score") @@ -315,7 +308,7 @@ async def process_submission(self, submission): f"rejecting message (shutdown={self._shutdown})" ) await submission.nack( - submission, requeue=False, submission_id=submission_id + requeue=True, submission_id=submission_id ) message_acked = True else: @@ -352,8 +345,11 @@ async def process_submission(self, submission): ) if not dlq_success: logger.error( - f"Failed to send max-retry message to DLQ for submission {submission_id}" + f"Failed to send max-retry message to DLQ for submission {submission_id}; requeueing" ) + await submission.nack(requeue=True, submission_id=submission_id) + message_acked = True + return await submission.nack(requeue=False, submission_id=submission_id) message_acked = True @@ -370,9 +366,9 @@ async def process_submission(self, submission): # Final safety net: ensure message is acknowledged if not message_acked: logger.warning( - f"Emergency reject for un-acknowledged message {submission_id}" + f"Emergency requeue for un-acknowledged message {submission_id}" ) - await submission.reject() + await submission.nack(requeue=True, submission_id=submission_id) message_acked 
= True def get_processor(self, data: dict): diff --git a/processors/image_processor.py b/processors/image_processor.py index e3dc36e..89fc6fc 100644 --- a/processors/image_processor.py +++ b/processors/image_processor.py @@ -65,8 +65,8 @@ async def process(self, data: dict) -> dict: ) return {"error": "Invalid JSON response from worker"} - logger.info("Result payload ") - logger.info(json.dumps(result, indent=2)) + logger.debug("Result payload ") + logger.debug(json.dumps(result, indent=2)) logger.info( f"Successfully processed submission: {data.get('submission_id')}" diff --git a/requirements.txt b/requirements.txt index a415d36..fcbfe5a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -82,3 +82,6 @@ urllib3==2.6.0 wcwidth==0.2.14 wheel==0.45.1 yarl==1.22.0 + + +google-cloud-storage \ No newline at end of file diff --git a/tests/test_rmq_client.py b/tests/test_rmq_client.py index b97f8e5..78a36f4 100644 --- a/tests/test_rmq_client.py +++ b/tests/test_rmq_client.py @@ -154,6 +154,7 @@ async def test_publish_message_success(self, rmq_client): call_args = mock_aio_pika.Message.call_args assert call_args[1]["body"] == json.dumps(message_body).encode() + assert call_args[1]["delivery_mode"] == mock_aio_pika.DeliveryMode.PERSISTENT publish_call_args = mock_exchange.publish.call_args assert publish_call_args[1]["routing_key"] == "test_feedback" @@ -248,6 +249,7 @@ async def test_start_consumer(self, rmq_client): # Verify consumer was started mock_queue.consume.assert_called_once() + assert mock_queue.consume.call_args[1]["no_ack"] is False @pytest.mark.asyncio async def test_close(self, rmq_client): diff --git a/tests/test_submission_checker.py b/tests/test_submission_checker.py index 92fa300..6176cd0 100644 --- a/tests/test_submission_checker.py +++ b/tests/test_submission_checker.py @@ -55,13 +55,13 @@ async def test_reject_message(self, mock_message): mock_message.reject.assert_called_once_with(requeue=False) @pytest.mark.asyncio - async def 
test_auto_reject_on_exit(self, mock_message): - """Test that message is auto-rejected if not explicitly acked.""" + async def test_auto_requeue_on_exit(self, mock_message): + """Test that message is auto-requeued if not explicitly acked.""" async with MessageAckManager(mock_message): pass # Don't ack - # Should auto-reject with requeue=False - mock_message.reject.assert_called_once_with(requeue=False) + # Should auto-nack with requeue=True + mock_message.nack.assert_called_once_with(requeue=True) @pytest.mark.asyncio async def test_no_double_ack(self, mock_message): @@ -427,5 +427,5 @@ async def test_shutdown_flag(self, submission_checker): await submission_checker.process_submission(mock_message) - # Should reject message immediately - mock_message.nack.assert_called_once_with(requeue=False) + # Should requeue message immediately + mock_message.nack.assert_called_once_with(requeue=True) diff --git a/tests/test_worker.py b/tests/test_worker.py index b761fb5..37de665 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -187,6 +187,39 @@ async def test_download_image_invalid_url(self, mocked_worker): with pytest.raises(Exception): # Should raise InvalidImageURLError or similar await worker.download_image("not-a-url") + @pytest.mark.asyncio + async def test_download_image_storage_google_public_url(self, mocked_worker): + """Test public storage.googleapis.com image download via HTTPS fallback.""" + worker = mocked_worker + worker.gcp_credentials = None + + pil_image = await worker.download_image( + "https://storage.googleapis.com/assignment_submission/submissions/SUB-IMSUB.png" + ) + + assert pil_image is not None + assert isinstance(pil_image, Image.Image) + + @pytest.mark.asyncio + async def test_download_image_storage_google_authenticated_url(self, mocked_worker): + """Test authenticated storage.googleapis.com URL downloads via GCS helper when credentials are available.""" + worker = mocked_worker + worker.gcp_credentials = MagicMock() + + sample_image = 
Image.open(BytesIO(worker._test_mocks["session"].get().read.return_value)) + + with patch( + "image_worker.worker.download_from_gcs", + AsyncMock(return_value=sample_image), + ) as mock_download: + pil_image = await worker.download_image( + "https://storage.googleapis.com/assignment_submission/submissions/SUB-IMSUB.png" + ) + + assert pil_image is not None + assert isinstance(pil_image, Image.Image) + mock_download.assert_awaited_once() + class TestWorkerSubmissionProcessing: """Test submission processing workflow.""" From 955a655e8ec9a4278495bfa95276ae45bdd5cf9f Mon Sep 17 00:00:00 2001 From: Manu Date: Sun, 10 May 2026 23:40:38 +0530 Subject: [PATCH 12/13] migration scripts --- mq/rmq_client.py | 53 ++++------- scripts/restart_app.sh | 36 ++++++++ scripts/run_db_migrations.sh | 168 ++++++++++++++++++++++++++++++++++ scripts/setup_vm.sh | 171 +++++++++++++++++++++++++++++++++++ tests/test_rmq_client.py | 20 +++- 5 files changed, 409 insertions(+), 39 deletions(-) create mode 100755 scripts/restart_app.sh create mode 100755 scripts/run_db_migrations.sh create mode 100755 scripts/setup_vm.sh diff --git a/mq/rmq_client.py b/mq/rmq_client.py index b3d73bd..7848ca0 100644 --- a/mq/rmq_client.py +++ b/mq/rmq_client.py @@ -91,10 +91,8 @@ async def connect(self): ) self.channel = await self.connection.channel() - # Set prefetch count to 1 to avoid multiple slow CLIP inferences in parallel - # This prevents heartbeat timeouts from multiple long-running tasks - prefetch = int(os.getenv("RABBITMQ_PREFETCH_COUNT", "1")) - await self.channel.set_qos(prefetch_count=prefetch) + # Keep consumer concurrency bounded for slow CLIP inference workloads. 
+ await self.channel.set_qos(prefetch_count=self.PREFETCH_COUNT) # Declare Dead Letter Queue first if configured if self.DEAD_LETTER_QUEUE: @@ -103,40 +101,17 @@ async def connect(self): ) logger.info(f"Dead Letter Queue declared: {self.DEAD_LETTER_QUEUE}") + self.submission_queue = await self.channel.declare_queue( + self.SUBMISSION_QUEUE, + durable=True, + ) + logger.info(f"Submission queue declared: {self.SUBMISSION_QUEUE}") - try: - # First try passive declaration to check if queue exists - self.submission_queue = await self.channel.declare_queue( - self.SUBMISSION_QUEUE, - durable=True, - passive=True # Only check, don't create - ) - logger.info(f"Submission queue already exists: {self.SUBMISSION_QUEUE}") - except Exception: - # Queue doesn't exist, create it - self.submission_queue = await self.channel.declare_queue( - self.SUBMISSION_QUEUE, - durable=True - ) - logger.info(f"Submission queue created: {self.SUBMISSION_QUEUE}") - - - - try: - # First try passive declaration to check if queue exists - self.feedback_queue = await self.channel.declare_queue( - self.FEEDBACK_QUEUE, - durable=True, - passive=True # Only check, don't create - ) - logger.info(f"Feedback queue already exists: {self.FEEDBACK_QUEUE}") - except Exception: - # Queue doesn't exist, create it - self.feedback_queue = await self.channel.declare_queue( - self.FEEDBACK_QUEUE, - durable=True - ) - logger.info(f"Feedback queue created: {self.FEEDBACK_QUEUE}") + self.feedback_queue = await self.channel.declare_queue( + self.FEEDBACK_QUEUE, + durable=True, + ) + logger.info(f"Feedback queue declared: {self.FEEDBACK_QUEUE}") logger.info( @@ -145,6 +120,10 @@ async def connect(self): return except Exception as e: logger.error(f"RabbitMQ connection failed: {e}") + if self.connection: + await self.connection.close() + self.connection = None + self.channel = None attempt += 1 if attempt > self.STARTUP_RETRY_LIMIT: logger.error( diff --git a/scripts/restart_app.sh b/scripts/restart_app.sh new file mode 
100755 index 0000000..f24867d --- /dev/null +++ b/scripts/restart_app.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +APP_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +BRANCH="${BRANCH:-plg_integration}" +APP_SERVICE="${APP_SERVICE:-plg_app.service}" +POSTGRES_SERVICE="${POSTGRES_SERVICE:-plg_postgres.service}" +SKIP_GIT_PULL="${SKIP_GIT_PULL:-0}" + +log() { + printf '\n[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" +} + +cd "$APP_DIR" + +if [ "$SKIP_GIT_PULL" != "1" ]; then + log "Pulling latest code for $BRANCH" + git fetch origin "$BRANCH" + git checkout "$BRANCH" + git pull --ff-only origin "$BRANCH" +else + log "Skipping git pull because SKIP_GIT_PULL=$SKIP_GIT_PULL" +fi + +log "Ensuring PostgreSQL service is running" +systemctl --user start "$POSTGRES_SERVICE" + +log "Applying database migrations" +./scripts/run_db_migrations.sh + +log "Restarting application service" +systemctl --user restart "$APP_SERVICE" +systemctl --user --no-pager --full status "$APP_SERVICE" + +log "Restart complete" diff --git a/scripts/run_db_migrations.sh b/scripts/run_db_migrations.sh new file mode 100755 index 0000000..78970ec --- /dev/null +++ b/scripts/run_db_migrations.sh @@ -0,0 +1,168 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +APP_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +COMPOSE_FILE="${COMPOSE_FILE:-docker-postgres.yml}" +POSTGRES_CONTAINER="${POSTGRES_CONTAINER:-plg-postgresdb}" +INIT_SQL="${INIT_SQL:-database/init.sql}" +MIGRATIONS_DIR="${MIGRATIONS_DIR:-database/migrations}" +BASELINE_EXISTING_MIGRATIONS="${BASELINE_EXISTING_MIGRATIONS:-0}" +INIT_CHECK_SQL="${INIT_CHECK_SQL:-SELECT to_regclass('public.submissions') IS NOT NULL;}" + +cd "$APP_DIR" + +if [ -f ".env" ]; then + set -a + # shellcheck disable=SC1091 + . 
./.env + set +a +else + echo "ERROR: .env is required in $APP_DIR" >&2 + exit 1 +fi + +: "${POSTGRES_DB:?POSTGRES_DB must be set in .env}" +: "${POSTGRES_USER:?POSTGRES_USER must be set in .env}" + +if command -v podman >/dev/null 2>&1; then + CONTAINER_CMD="${CONTAINER_CMD:-podman}" +elif command -v docker >/dev/null 2>&1; then + CONTAINER_CMD="${CONTAINER_CMD:-docker}" +else + echo "ERROR: podman or docker is required" >&2 + exit 1 +fi + +if [ -x "$APP_DIR/venv/bin/podman-compose" ]; then + COMPOSE_CMD="${COMPOSE_CMD:-$APP_DIR/venv/bin/podman-compose}" +elif command -v podman-compose >/dev/null 2>&1; then + COMPOSE_CMD="${COMPOSE_CMD:-podman-compose}" +elif command -v docker-compose >/dev/null 2>&1; then + COMPOSE_CMD="${COMPOSE_CMD:-docker-compose}" +elif docker compose version >/dev/null 2>&1; then + COMPOSE_CMD="${COMPOSE_CMD:-docker compose}" +else + COMPOSE_CMD="" +fi + +find_postgres_container() { + if "$CONTAINER_CMD" container exists "$POSTGRES_CONTAINER" >/dev/null 2>&1; then + echo "$POSTGRES_CONTAINER" + return + fi + + for candidate in plg-postgresdb plg-postgres; do + if "$CONTAINER_CMD" container exists "$candidate" >/dev/null 2>&1; then + echo "$candidate" + return + fi + done + + echo "$POSTGRES_CONTAINER" +} + +start_postgres_if_needed() { + if [ -n "$COMPOSE_CMD" ] && [ -f "$COMPOSE_FILE" ]; then + $COMPOSE_CMD -f "$COMPOSE_FILE" up -d postgres + else + "$CONTAINER_CMD" start "$(find_postgres_container)" >/dev/null + fi +} + +wait_for_postgres() { + local container="$1" + local attempt=1 + + while [ "$attempt" -le 60 ]; do + if "$CONTAINER_CMD" exec "$container" pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB" >/dev/null 2>&1; then + return 0 + fi + sleep 2 + attempt=$((attempt + 1)) + done + + echo "ERROR: PostgreSQL did not become ready in time" >&2 + exit 1 +} + +psql_exec() { + local container="$1" + shift + "$CONTAINER_CMD" exec "$container" psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -d "$POSTGRES_DB" "$@" +} + +psql_file() { + local 
container="$1" + local file="$2" + "$CONTAINER_CMD" exec -i "$container" psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -d "$POSTGRES_DB" <"$file" +} + +record_migration() { + local container="$1" + local migration_name="$2" + + psql_exec "$container" -c "INSERT INTO schema_migrations (migration_name) VALUES ('$migration_name') ON CONFLICT (migration_name) DO NOTHING;" +} + +main() { + start_postgres_if_needed + + local container + container="$(find_postgres_container)" + + wait_for_postgres "$container" + + local initialized_from_init=0 + local init_check_result + init_check_result="$( + psql_exec "$container" -tAc "$INIT_CHECK_SQL" + )" + + if [ "$init_check_result" != "t" ] && [ -f "$INIT_SQL" ]; then + echo "Applying $INIT_SQL" + psql_file "$container" "$INIT_SQL" + initialized_from_init=1 + fi + + psql_exec "$container" -c " +CREATE TABLE IF NOT EXISTS schema_migrations ( + id SERIAL PRIMARY KEY, + migration_name VARCHAR(255) UNIQUE NOT NULL, + applied_at TIMESTAMP DEFAULT NOW() +);" + + if [ -d "$MIGRATIONS_DIR" ]; then + for migration_file in "$MIGRATIONS_DIR"/*.sql; do + [ -f "$migration_file" ] || continue + + migration_name="$(basename "$migration_file")" + + if [ "$initialized_from_init" = "1" ]; then + echo "Recording migration already included by $INIT_SQL: $migration_name" + record_migration "$container" "$migration_name" + continue + fi + + if [ "$BASELINE_EXISTING_MIGRATIONS" = "1" ]; then + echo "Recording existing migration without applying: $migration_name" + record_migration "$container" "$migration_name" + continue + fi + + already_applied="$( + psql_exec "$container" -tAc "SELECT COUNT(*) FROM schema_migrations WHERE migration_name = '$migration_name';" + )" + + if [ "$already_applied" = "0" ]; then + echo "Applying migration: $migration_name" + psql_file "$container" "$migration_file" + record_migration "$container" "$migration_name" + else + echo "Skipping already-applied migration: $migration_name" + fi + done + fi +} + +main "$@" diff --git 
a/scripts/setup_vm.sh b/scripts/setup_vm.sh new file mode 100755 index 0000000..1f1db57 --- /dev/null +++ b/scripts/setup_vm.sh @@ -0,0 +1,171 @@ +#!/usr/bin/env bash +set -euo pipefail + +APP_USER="${APP_USER:-$(id -un)}" +APP_HOME="${APP_HOME:-$HOME}" +APP_DIR="${APP_DIR:-$APP_HOME/tap_plg}" +REPO_URL="${REPO_URL:-https://github.com/theapprenticeproject/tap_plg.git}" +BRANCH="${BRANCH:-plg_integration}" +PYTHON_BIN="${PYTHON_BIN:-python3}" +POSTGRES_SERVICE="${POSTGRES_SERVICE:-plg_postgres.service}" +APP_SERVICE="${APP_SERVICE:-plg_app.service}" +DOWNLOAD_CLIP_MODEL="${DOWNLOAD_CLIP_MODEL:-1}" +ENV_FILE="${ENV_FILE:-}" + +log() { + printf '\n[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" +} + +run_sudo() { + if [ "$(id -u)" -eq 0 ]; then + "$@" + else + sudo "$@" + fi +} + +clone_or_update_repo() { + if [ -d "$APP_DIR/.git" ]; then + log "Updating existing repository in $APP_DIR" + git -C "$APP_DIR" fetch origin "$BRANCH" + git -C "$APP_DIR" checkout "$BRANCH" + git -C "$APP_DIR" pull --ff-only origin "$BRANCH" + else + log "Cloning $REPO_URL into $APP_DIR" + mkdir -p "$(dirname "$APP_DIR")" + git clone --branch "$BRANCH" "$REPO_URL" "$APP_DIR" + fi +} + +install_env_file() { + cd "$APP_DIR" + + if [ -f ".env" ]; then + log ".env already exists" + return + fi + + if [ -n "$ENV_FILE" ]; then + if [ ! -f "$ENV_FILE" ]; then + echo "ERROR: ENV_FILE does not exist: $ENV_FILE" >&2 + exit 1 + fi + + log "Creating .env from $ENV_FILE" + cp "$ENV_FILE" .env + return + fi + + echo "ERROR: .env is required in $APP_DIR, or pass ENV_FILE=/path/to/envfile when running this script." 
>&2 + exit 1 +} + +setup_python_environment() { + cd "$APP_DIR" + + log "Creating/updating Python virtual environment" + "$PYTHON_BIN" -m venv venv + ./venv/bin/python -m pip install podman-compose + + log "Starting production environment" + ./start-prod-env.sh --with-api + + if [ "$DOWNLOAD_CLIP_MODEL" = "1" ]; then + log "Downloading CLIP model if it is not already present" + ./venv/bin/python scripts/download_clip_model.py --model ViT-L-14 + else + log "Skipping CLIP model download because DOWNLOAD_CLIP_MODEL=$DOWNLOAD_CLIP_MODEL" + fi +} + +install_system_packages() { + log "Installing OS packages" + run_sudo apt update + run_sudo apt install -y podman vim git python3.10-venv + + log "Configuring systemd delegation for rootless Podman" + run_sudo mkdir -p /etc/systemd/system/user@.service.d + run_sudo tee /etc/systemd/system/user@.service.d/delegate.conf >/dev/null <<'EOF' +[Service] +Delegate=cpu cpuset io memory pids +EOF + run_sudo systemctl daemon-reload +} + +install_user_services() { + cd "$APP_DIR" + + local systemd_dir="$HOME/.config/systemd/user" + mkdir -p "$systemd_dir" + + log "Writing user systemd services" + cat >"$systemd_dir/$POSTGRES_SERVICE" <"$systemd_dir/$APP_SERVICE" < Date: Thu, 14 May 2026 16:49:41 +0530 Subject: [PATCH 13/13] requirements --- requirements.txt | 203 ++++++++++++++++++++--------------- scripts/restart_app.sh | 20 +++- scripts/run_db_migrations.sh | 12 ++- 3 files changed, 139 insertions(+), 96 deletions(-) diff --git a/requirements.txt b/requirements.txt index fcbfe5a..d860689 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,87 +1,116 @@ -# Message Queue - RabbitMQ (latest stable) -aio-pika==9.5.7 -aiormq==6.9.2 - -# Database - PostgreSQL (async) -asyncpg==0.30.0 -pgvector==0.4.1 - -# HTTP Client -aiohttp==3.13.2 -aiohappyeyeballs==2.6.1 -aiosignal==1.4.0 -requests==2.32.3 - -# GCP Cloud Storage -google-cloud-storage==2.10.0 -google-auth==2.27.0 - -# Configuration Management (Pydantic v2 compatible) 
-pydantic==1.10.24 -python-dotenv==1.2.1 - -# Image Processing -pillow==12.0.0 -ImageHash==4.3.2 - -# Machine Learning - CLIP & FAISS -torch==2.9.0 -torchvision==0.24.0 -open_clip_torch==3.2.0 -faiss-cpu==1.12.0 -numpy==2.3.4 -timm==1.0.22 -safetensors==0.7.0 -huggingface_hub==1.2.1 -filelock==3.20.0 -fsspec==2025.12.0 -ftfy==6.3.1 -regex==2025.11.3 -scipy==1.16.3 -mpmath==1.3.0 -sympy==1.14.0 -networkx==3.6 -PyWavelets==1.9.0 - -# Web Framework - FastAPI -fastapi==0.115.0 -uvicorn==0.34.0 -starlette==0.38.6 -anyio==4.12.0 -h11==0.16.0 -httptools==0.7.1 -uvloop==0.22.1 -watchfiles==1.1.1 -websockets==15.0.1 -click==8.3.1 -typer-slim==0.20.0 -shellingham==1.5.4 -httpcore==1.0.9 -httpx==0.28.1 -hf-xet==1.2.0 - -# Utilities -tqdm==4.67.1 -podman-compose==1.5.0 -annotated-doc==0.0.4 -attrs==25.4.0 -certifi==2025.11.12 -charset-normalizer==3.4.4 -frozenlist==1.8.0 -idna==3.11 -Jinja2==3.1.6 -MarkupSafe==3.0.3 -multidict==6.7.0 -packaging==25.0 -pamqp==3.3.0 -propcache==0.4.1 -PyYAML==6.0.3 -setuptools==80.9.0 -typing_extensions==4.15.0 -urllib3==2.6.0 -wcwidth==0.2.14 -wheel==0.45.1 -yarl==1.22.0 - - -google-cloud-storage \ No newline at end of file +# Message Queue - RabbitMQ +aio-pika==9.5.7 +aiormq==6.9.2 +pamqp==3.3.0 + +# Database - PostgreSQL +asyncpg==0.30.0 +pgvector==0.4.1 + +# HTTP Client +aiohttp==3.13.2 +aiohappyeyeballs==2.6.1 +aiosignal==1.4.0 +async-timeout==5.0.1 +requests==2.32.3 +httpcore==1.0.9 +httpx==0.28.1 + +# GCP Cloud Storage +google-api-core==2.30.3 +google-auth==2.52.0 +google-cloud-core==2.6.0 +google-cloud-storage==3.10.1 +google-crc32c==1.8.0 +google-resumable-media==2.9.0 +googleapis-common-protos==1.75.0 +proto-plus==1.28.0 +protobuf==7.34.1 +pyasn1==0.6.3 +pyasn1_modules==0.4.2 + +# Configuration Management +pydantic==1.10.24 +python-dotenv==1.2.1 + +# Image Processing +pillow==12.0.0 +ImageHash==4.3.2 +PyWavelets==1.8.0 + +# Machine Learning - CLIP & FAISS +torch==2.9.0 +torchvision==0.24.0 +open_clip_torch==3.2.0 +faiss-cpu==1.12.0 
+numpy==2.2.6 +timm==1.0.22 +safetensors==0.7.0 +huggingface_hub==1.2.1 +hf-xet==1.2.0 +filelock==3.20.0 +fsspec==2025.12.0 +ftfy==6.3.1 +regex==2025.11.3 +scipy==1.15.3 +mpmath==1.3.0 +sympy==1.14.0 +networkx==3.4.2 + +# NVIDIA CUDA runtime dependencies installed with torch +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +triton==3.5.0 + +# Web Framework - FastAPI +fastapi==0.115.0 +uvicorn==0.34.0 +starlette==0.38.6 +anyio==4.12.0 +h11==0.16.0 +httptools==0.7.1 +uvloop==0.22.1 +watchfiles==1.1.1 +websockets==15.0.1 +click==8.3.1 +typer-slim==0.20.0 +shellingham==1.5.4 + +# Utilities +annotated-doc==0.0.4 +attrs==25.4.0 +certifi==2025.11.12 +cffi==2.0.0 +charset-normalizer==3.4.4 +cryptography==48.0.0 +exceptiongroup==1.3.1 +frozenlist==1.8.0 +idna==3.11 +Jinja2==3.1.6 +MarkupSafe==3.0.3 +multidict==6.7.0 +packaging==25.0 +podman-compose==1.5.0 +propcache==0.4.1 +pycparser==3.0 +PyYAML==6.0.3 +setuptools==80.9.0 +tqdm==4.67.1 +typing_extensions==4.15.0 +urllib3==2.6.0 +wcwidth==0.2.14 +wheel==0.45.1 +yarl==1.22.0 diff --git a/scripts/restart_app.sh b/scripts/restart_app.sh index f24867d..c95bfde 100755 --- a/scripts/restart_app.sh +++ b/scripts/restart_app.sh @@ -1,7 +1,9 @@ -#!/usr/bin/env bash -set -euo pipefail +# SKIP_GIT_PULL=1 sh scripts/restart_app.sh -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +#!/bin/sh +set -eu + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" APP_DIR="$(cd "$SCRIPT_DIR/.." 
&& pwd)" BRANCH="${BRANCH:-plg_integration}" APP_SERVICE="${APP_SERVICE:-plg_app.service}" @@ -12,6 +14,15 @@ log() { printf '\n[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" } +start_postgres_service_if_present() { + if systemctl --user cat "$POSTGRES_SERVICE" >/dev/null 2>&1; then + log "Ensuring PostgreSQL service is running" + systemctl --user start "$POSTGRES_SERVICE" + else + log "PostgreSQL service $POSTGRES_SERVICE not found; migrations will start PostgreSQL if needed" + fi +} + cd "$APP_DIR" if [ "$SKIP_GIT_PULL" != "1" ]; then @@ -23,8 +34,7 @@ else log "Skipping git pull because SKIP_GIT_PULL=$SKIP_GIT_PULL" fi -log "Ensuring PostgreSQL service is running" -systemctl --user start "$POSTGRES_SERVICE" +start_postgres_service_if_present log "Applying database migrations" ./scripts/run_db_migrations.sh diff --git a/scripts/run_db_migrations.sh b/scripts/run_db_migrations.sh index 78970ec..4ed5891 100755 --- a/scripts/run_db_migrations.sh +++ b/scripts/run_db_migrations.sh @@ -1,7 +1,7 @@ -#!/usr/bin/env bash -set -euo pipefail +#!/bin/sh +set -eu -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" APP_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" COMPOSE_FILE="${COMPOSE_FILE:-docker-postgres.yml}" POSTGRES_CONTAINER="${POSTGRES_CONTAINER:-plg-postgresdb}" @@ -13,9 +13,13 @@ INIT_CHECK_SQL="${INIT_CHECK_SQL:-SELECT to_regclass('public.submissions') IS NO cd "$APP_DIR" if [ -f ".env" ]; then + ENV_FILE="$(mktemp)" + trap 'rm -f "$ENV_FILE"' EXIT + sed 's/\r$//' .env >"$ENV_FILE" + set -a # shellcheck disable=SC1091 - . ./.env + . "$ENV_FILE" set +a else echo "ERROR: .env is required in $APP_DIR" >&2