theapprenticeproject · manua-glitch · Dec 17, 2025 · Jan 2, 2026 · Jan 2, 2026 · Jan 2, 2026
diff --git a/.env.example b/.env.example
@@ -16,6 +16,10 @@ FEEDBACK_QUEUE=plagiarism_feedback
 # Dead Letter Queue (optional - leave empty to disable)
 DEAD_LETTER_QUEUE=plagiarism_failed_submissions
 
+# GCP CONFIGURATION
+GCP_ENABLED=true
+GCP_KEY_PATH=/app/credentials/gcp_service_account.json
+
 # POSTGRESQL CONFIGURATION
 POSTGRES_HOST=postgres
 POSTGRES_PORT=5432

diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,8 @@
+# Credentials
+app/credentials/*
+
+scratch*
+
 # Python
 __pycache__/
 *.py[cod]
@@ -14,6 +19,7 @@ build/
 # Environment variables
 .env
 .env.local
+.env.prod
 
 # Logs
 logs/

diff --git a/Dockerfile b/Dockerfile
@@ -5,6 +5,7 @@ WORKDIR /app
 # Install build dependencies in a single layer
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
+    vim \
     gcc \
     g++ \
     git \
@@ -18,10 +19,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 # Copy only requirements first for better caching
 COPY requirements.txt .
 
-# Use pip cache and install in parallel
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python -m pip install --upgrade pip setuptools wheel && \
-    pip install -r requirements.txt --user --no-warn-script-location
+# Install into /install so the final stage can copy it into /usr/local
+RUN python -m pip install --no-cache-dir --prefix=/install --upgrade pip setuptools wheel && \
+    pip install --no-cache-dir --no-deps --prefix=/install -r requirements.txt
+
+RUN pip install --prefix=/install -r requirements.txt --no-cache-dir
 
 # ============================================
 # Final stage - minimal runtime image
@@ -37,18 +39,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy installed packages from builder
-COPY --from=builder /root/.local /root/.local
-
-ENV PATH=/root/.local/bin:$PATH
+COPY --from=builder /install /usr/local
 
 # Create necessary directories
 RUN mkdir -p /app/data /app/logs /root/.cache/clip
 
+RUN ls
+
 # Copy application code (do this last for better caching)
 COPY . .
 
-# Lightweight healthcheck
-HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
-    CMD python -c "import sys; sys.exit(0)" || exit 1
 
-CMD ["python", "app.py"]
+CMD ["python", "app.py"]
diff --git a/Dockerfile.api b/Dockerfile.api
@@ -1,4 +1,4 @@
-FROM python:3.13-slim
+FROM python:3.13-slim AS builder
 
 WORKDIR /app
 
@@ -7,9 +7,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf
 
 # Copy requirements and install dependencies
 COPY api/requirements.txt /app/api/requirements.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python -m pip install --upgrade pip && \
-    pip install -r /app/api/requirements.txt
+RUN python -m pip install --no-cache-dir --prefix=/install --upgrade pip && \
+    pip install --prefix=/install -r /app/api/requirements.txt --no-cache-dir
+
+# ============================================
+# Final stage - minimal runtime image
+# ============================================
+FROM python:3.13-slim
+
+WORKDIR /app
+
+# Copy installed packages from builder
+COPY --from=builder /install /usr/local
 
 # Copy application code
 COPY api/ /app/api/
@@ -18,9 +27,6 @@ COPY utils/ /app/utils/
 # Expose API port
 EXPOSE 8000
 
-# Healthcheck (check if uvicorn is responding)
-HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
-    CMD curl -f http://localhost:8000/ || exit 1
 
 # Run the API
 CMD ["uvicorn", "api.api:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/api/api.py b/api/api.py
@@ -8,31 +8,49 @@
 from fastapi import FastAPI, HTTPException, status
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel, Field, validator
-import aio_pika
-import json
-import uuid
-
-from dotenv import load_dotenv
-import os
-
-sys.path.insert(0, str(Path(__file__).parent.parent))
-from utils.security import safe_hash_student_id
-
-# load_dotenv()
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-)
-logger = logging.getLogger(__name__)
-
-# RabbitMQ Configuration
-RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "localhost")
+import aio_pika
+import json
+import uuid
+from dotenv import load_dotenv
+import os
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from utils.security import safe_hash_student_id
+
+load_dotenv()
+
+DEFAULT_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+
+def get_log_format() -> str:
+    """Return LOG_FORMAT from the environment; "json" or unset yields the default."""
+    configured = os.getenv("LOG_FORMAT", DEFAULT_LOG_FORMAT)
+    # The result is passed to logging.basicConfig(format=...), so "json" is replaced.
+    return DEFAULT_LOG_FORMAT if configured.lower() == "json" else configured
+
+
+# Configure logging
+logging.basicConfig(
+    level=getattr(logging, os.getenv("LOG_LEVEL", "INFO").upper(), logging.INFO),
+    format=get_log_format(),
+)
+logger = logging.getLogger(__name__)
+
+# RabbitMQ Configuration
+RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "localhost")
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", "5672")
 RABBITMQ_VHOST = os.getenv("RABBITMQ_VHOST", "/")
 RABBITMQ_USER = os.getenv("RABBITMQ_USER", "admin")
 RABBITMQ_PASS = os.getenv("RABBITMQ_PASS", "admin123")
 
+# Debug aid (disabled): log the RabbitMQ connection settings, excluding the password
+# logger.info("###################")
+# logger.info(f"RABBITMQ_HOST={RABBITMQ_HOST}")
+# logger.info(f"RABBITMQ_PORT={RABBITMQ_PORT}")
+# logger.info(f"RABBITMQ_VHOST={RABBITMQ_VHOST}")
+# logger.info(f"RABBITMQ_USER={RABBITMQ_USER}")
+# logger.info("###################")
+
 SUBMISSION_QUEUE = os.getenv("SUBMISSION_QUEUE", "plagiarism_submissions")
 FEEDBACK_QUEUE = os.getenv("FEEDBACK_QUEUE", "plagiarism_feedback")
 
@@ -53,15 +71,42 @@ class SubmissionRequest(BaseModel):
     """Request model for submission creation"""
 
     student_id: str = Field(..., description="Student identifier", min_length=1)
-    image_url: str = Field(..., description="URL of the submitted image", min_length=1)
+    submission_type: str = Field(..., description="Type of submission: text, audio, video, or image")
+    submission_url: Optional[str] = Field(
+        None, description="URL of the submitted resource (required for image submissions)"
+    )
+    submission_text: Optional[str] = Field(
+        None, description="Text content of the submission (required for text submissions)"
+    )
+    submitted_at: Optional[str] = Field(
+        None,
+        description="Original submission timestamp in ISO format. If provided, this value is preserved in the processed result.",
+    )
     assignment_id: Optional[str] = Field(None, description="Assignment identifier")
 
-    @validator("image_url")
-    def validate_url(cls, v):
-        """Validate that image_url is not empty"""
-        if not v or not v.strip():
-            raise ValueError("image_url cannot be empty")
-        return v.strip()
+    @validator("submission_type")
+    def validate_submission_type(cls, v):
+        """Reject any submission_type outside the supported set"""
+        supported = {"text", "audio", "video", "image"}
+        if v in supported:
+            return v
+        raise ValueError(f"submission_type must be one of {sorted(supported)}")
+
+    @validator("submission_url", always=True)
+    def validate_submission_url(cls, v, values):
+        """Require a non-blank URL for image submissions; other types pass through"""
+        required = values.get("submission_type") == "image"
+        if required and not (v and v.strip()):
+            raise ValueError("submission_url is required for image submissions")
+        return v.strip() if required else v
+
+    @validator("submission_text", always=True)
+    def validate_submission_text(cls, v, values):
+        """Require non-blank text for text submissions; other types pass through"""
+        required = values.get("submission_type") == "text"
+        if required and not (v and v.strip()):
+            raise ValueError("submission_text is required for text submissions")
+        return v.strip() if required else v
 
     @validator("student_id")
     def validate_student_id(cls, v):
@@ -199,13 +244,13 @@ async def send_to_rabbitmq(message: dict, queue_name: str):
 )
 async def create_submission(request: SubmissionRequest):
     """
-    Submit an image for plagiarism detection
+    Submit a new user submission for plagiarism detection
 
-    Creates a new plagiarism check submission by sending the image URL
-    and metadata to the processing queue.
+    Creates a new plagiarism check submission by sending submission metadata
+    to the processing queue.
 
     Args:
-        request: Submission request containing student_id, image_url, and optional assignment_id
+        request: Submission request containing student_id, submission_type, submission_url or submission_text, and optional submitted_at and assignment_id
 
     Returns:
         SubmissionResponse with submission details and unique ID
@@ -237,9 +282,11 @@ async def create_submission(request: SubmissionRequest):
         payload = {
             "student_id": hashed_student_id,
             "submission_id": submission_id,
-            "img_url": request.image_url,
+            "submission_type": request.submission_type,
+            "submission_url": request.submission_url,
+            "submission_text": request.submission_text,
+            "submitted_at": request.submitted_at,
             "assign_id": assignment_id,
-            "submitted_at": datetime.datetime.utcnow().isoformat(),
         }
 
         # Send to RabbitMQ

diff --git a/app.py b/app.py
@@ -1,20 +1,32 @@
 import asyncio
 import logging
-import signal
-import sys
-import os
-from mq.rmq_client import RabbitMQClient
-from plag_checker.submissions_checker import SubmissionChecker
-from dotenv import load_dotenv
-
-__version__ = "1.0.0"
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-)
-logger = logging.getLogger(__name__)
-load_dotenv()
+import signal
+import sys
+import os
+from dotenv import load_dotenv
+from mq.rmq_client import RabbitMQClient
+from plag_checker.submissions_checker import SubmissionChecker
+
+__version__ = "1.0.0"
+
+load_dotenv()
+
+DEFAULT_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+
+def get_log_format() -> str:
+    """Resolve the logging format string from LOG_FORMAT, defaulting when unset or "json"."""
+    requested = os.getenv("LOG_FORMAT", DEFAULT_LOG_FORMAT)
+    wants_json = requested.lower() == "json"
+    return DEFAULT_LOG_FORMAT if wants_json else requested
+
+
+# Configure logging
+logging.basicConfig(
+    level=getattr(logging, os.getenv("LOG_LEVEL", "INFO").upper(), logging.INFO),
+    format=get_log_format(),
+)
+logger = logging.getLogger(__name__)
 
 
 def validate_configuration():
@@ -30,6 +42,10 @@ def validate_configuration():
     ]
 
     missing = [var for var in required_env_vars if not os.getenv(var)]
+    # Debug aid (disabled): log each required env var and its value
+    # for var in required_env_vars:
+    #     logger.info("###################")
+    #     logger.info(f"{var}={os.getenv(var)}")
     if missing:
         logger.error(f"Missing required environment variables: {missing}")
         raise ValueError(f"Missing required environment variables: {missing}")

diff --git a/config/config.py b/config/config.py
@@ -93,7 +93,7 @@ class DetectionConfig(BaseSettings):
 
     exact_dup_threshold: float = Field(default=0.95, env="EXACT_DUPLICATE_THRESHOLD")
     near_dup_threshold: float = Field(default=0.90, env="NEAR_DUPLICATE_THRESHOLD")
-    semantic_threshold: float = Field(default=0.80, env="SEMANTIC_MATCH_THRESHOLD")
+    semantic_threshold: float = Field(default=0.70, env="SEMANTIC_MATCH_THRESHOLD")
 
     # Hash matching thresholds (Hamming distance, 0-64 bits)
     hash_threshold: int = Field(default=8, env="HASH_MATCH_THRESHOLD")
@@ -215,6 +215,19 @@ class Config:
         case_sensitive = False
 
 
+class GCPConfig(BaseSettings):
+    """Settings for authenticated image downloads from GCP Cloud Storage."""
+
+    # Read from the GCP_ENABLED environment variable; defaults to True.
+    gcp_enabled: bool = Field(env="GCP_ENABLED", default=True)
+    # Read from GCP_KEY_PATH; an empty string when the variable is unset.
+    gcp_key_path: str = Field(env="GCP_KEY_PATH", default="", description="Path to GCP service account JSON key file")
+
+    class Config:
+        case_sensitive = False
+        env_file = ".env"
+
+
 class LoggingConfig(BaseSettings):
     """Logging configuration."""
 
@@ -239,6 +252,7 @@ class AppConfig(BaseSettings):
     detection: DetectionConfig = DetectionConfig()
     vector_search: VectorSearchConfig = VectorSearchConfig()
     image_processing: ImageProcessingConfig = ImageProcessingConfig()
+    gcp: GCPConfig = GCPConfig()
     logging: LoggingConfig = LoggingConfig()
 
     app_name: str = Field(default="MentorMe Plagiarism Detection", env="APP_NAME")