diff --git a/backend/app/alembic/versions/0e7dd198b7c7_convert_status_to_enum_type.py b/backend/app/alembic/versions/0e7dd198b7c7_convert_status_to_enum_type.py
new file mode 100644
index 0000000000..a214181072
--- /dev/null
+++ b/backend/app/alembic/versions/0e7dd198b7c7_convert_status_to_enum_type.py
@@ -0,0 +1,111 @@
+"""convert_status_to_enum_type
+
+Convert ingestions.status column from VARCHAR to PostgreSQL ENUM type.
+
+This migration:
+1. Creates extractionstatus ENUM type with all status values
+2. Converts existing VARCHAR status column to use the ENUM type
+3. Maintains data integrity by mapping existing values to ENUM
+
+Revision ID: 0e7dd198b7c7
+Revises: 20038a3ab258
+Create Date: 2025-10-30 13:25:21.537208
+
+"""
+from alembic import op
+import sqlalchemy as sa
+import sqlmodel.sql.sqltypes
+
+
+# revision identifiers, used by Alembic.
+revision = '0e7dd198b7c7'
+down_revision = '20038a3ab258'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Convert status column to PostgreSQL ENUM type."""
+    # Create extractionstatus ENUM type
+    op.execute("""
+        CREATE TYPE extractionstatus AS ENUM (
+            'UPLOADED',
+            'OCR_IN_PROGRESS',
+            'OCR_COMPLETE',
+            'OCR_FAILED',
+            'SEGMENTATION_PROCESSING',
+            'SEGMENTATION_COMPLETE',
+            'TAGGING_PROCESSING',
+            'DRAFT',
+            'IN_REVIEW',
+            'APPROVED',
+            'REJECTED',
+            'FAILED'
+        )
+    """)
+
+    # Update existing 'OCR_PROCESSING' values to 'OCR_IN_PROGRESS' if any exist
+    op.execute("""
+        UPDATE ingestions
+        SET status = 'OCR_IN_PROGRESS'
+        WHERE status = 'OCR_PROCESSING'
+    """)
+
+    # Step 1: Drop the existing default value
+    op.execute("""
+        ALTER TABLE ingestions
+        ALTER COLUMN status DROP DEFAULT
+    """)
+
+    # Step 2: Convert status column to use ENUM type
+    op.execute("""
+        ALTER TABLE ingestions
+        ALTER COLUMN status TYPE extractionstatus
+        USING status::text::extractionstatus
+    """)
+
+    # Step 3: Re-add the default value as ENUM type
+    op.execute("""
+        ALTER TABLE ingestions
+        ALTER COLUMN status SET DEFAULT 'UPLOADED'::extractionstatus
+    """)
+
+
+def downgrade():
+    """Convert status column back to VARCHAR."""
+    # Step 1: Drop the ENUM default
+    op.execute("""
+        ALTER TABLE ingestions
+        ALTER COLUMN status DROP DEFAULT
+    """)
+
+    # Step 2: Convert status column back to VARCHAR
+    op.execute("""
+        ALTER TABLE ingestions
+        ALTER COLUMN status TYPE VARCHAR
+        USING status::text
+    """)
+
+    # Step 3: Re-add the VARCHAR default
+    op.execute("""
+        ALTER TABLE ingestions
+        ALTER COLUMN status SET DEFAULT 'UPLOADED'
+    """)
+
+    # Step 4: Drop the ENUM type
+    op.execute("DROP TYPE extractionstatus")
+
+    # Step 5: Revert OCR_IN_PROGRESS back to OCR_PROCESSING if any exist
+    op.execute("""
+        UPDATE ingestions
+        SET status = 'OCR_PROCESSING'
+        WHERE status = 'OCR_IN_PROGRESS'
+    """)
+
+    # Step 6: Fold OCR_FAILED into FAILED; the pre-migration ExtractionStatus
+    # enum has no OCR_FAILED member, so older code does not recognize it
+    op.execute("""
+        UPDATE ingestions
+        SET status = 'FAILED'
+        WHERE status = 'OCR_FAILED'
+    """)
diff --git a/backend/app/models.py b/backend/app/models.py
index a1eb463e8f..5ebe527c01 100644
--- a/backend/app/models.py
+++ b/backend/app/models.py
@@ -86,8 +86,9 @@ class ExtractionStatus(str, Enum):
     """Extraction pipeline status enum."""
 
     UPLOADED = "UPLOADED"
-    OCR_PROCESSING = "OCR_PROCESSING"
+    OCR_IN_PROGRESS = "OCR_IN_PROGRESS"
     OCR_COMPLETE = "OCR_COMPLETE"
+    OCR_FAILED = "OCR_FAILED"
     SEGMENTATION_PROCESSING = "SEGMENTATION_PROCESSING"
     SEGMENTATION_COMPLETE = "SEGMENTATION_COMPLETE"
     TAGGING_PROCESSING = "TAGGING_PROCESSING"
diff --git a/backend/app/services/ocr.py b/backend/app/services/ocr.py
index 4246c1b300..97f36283aa 100644
--- a/backend/app/services/ocr.py
+++ b/backend/app/services/ocr.py
@@ -6,7 +6,7 @@
 
 import uuid
 from datetime import datetime
-from typing import Any
+from typing import Any, Literal
 
 import httpx
 from pydantic import BaseModel, Field
@@ -61,6 +61,20 @@ class BoundingBox(BaseModel):
     height: float = Field(..., description="Height of the bounding box")
 
 
+class TableStructure(BaseModel):
+    """Table layout extracted by Mistral OCR.
+
+    Represents the structure of a table including dimensions and cell contents.
+    """
+
+    rows: int = Field(..., description="Number of rows in the table", gt=0)
+    columns: int = Field(..., description="Number of columns in the table", gt=0)
+    cells: list[dict[str, Any]] = Field(
+        default_factory=list,
+        description="Cell data with row, col, text, and bbox information",
+    )
+
+
 class ContentBlock(BaseModel):
     """A content block extracted from a PDF page.
 
@@ -68,15 +82,14 @@ class ContentBlock(BaseModel):
     """
 
     block_id: str = Field(..., description="Unique identifier for this content block")
-    block_type: str = Field(
-        ...,
-        description="Type of content: text, equation, table, image, header, paragraph, list",
-    )
+    block_type: Literal[
+        "text", "header", "paragraph", "list", "table", "equation", "image"
+    ] = Field(..., description="Type of content block")
     text: str = Field(..., description="Extracted text content")
     bbox: BoundingBox = Field(..., description="Bounding box coordinates")
     confidence: float = Field(..., ge=0.0, le=1.0, description="OCR confidence score")
     latex: str | None = Field(None, description="LaTeX representation for equations")
-    table_structure: dict[str, Any] | None = Field(
+    table_structure: TableStructure | None = Field(
         None, description="Table structure metadata (rows, columns, cells)"
     )
     image_description: str | None = Field(
@@ -153,7 +166,9 @@ def __init__(self, api_key: str, base_url: str = "https://api.mistral.ai/v1"):
             timeout=httpx.Timeout(60.0),
         )
 
-    def _map_block_type(self, mistral_type: str) -> str:
+    def _map_block_type(
+        self, mistral_type: str
+    ) -> Literal["text", "header", "paragraph", "list", "table", "equation", "image"]:
         """Map Mistral's block type to semantic types for segmentation.
 
         Args:
@@ -162,7 +177,12 @@ def _map_block_type(self, mistral_type: str) -> str:
         Returns:
             Semantic block type (e.g., "header", "paragraph")
         """
-        mapping = {
+        mapping: dict[
+            str,
+            Literal[
+                "text", "header", "paragraph", "list", "table", "equation", "image"
+            ],
+        ] = {
             "heading": "header",
             "text": "paragraph",
             "equation": "equation",
@@ -281,6 +301,15 @@ async def extract_text(self, pdf_bytes: bytes) -> OCRResult:
 
                     # If no type provided, default to "text" (fallback/unknown type)
                     # If type is provided, map to semantic type
+                    block_type: Literal[
+                        "text",
+                        "header",
+                        "paragraph",
+                        "list",
+                        "table",
+                        "equation",
+                        "image",
+                    ]
                     if mistral_type is None:
                         block_type = "text"  # Default fallback
                     else:
@@ -322,11 +351,17 @@ async def extract_text(self, pdf_bytes: bytes) -> OCRResult:
                             ),
                             confidence=0.95,
                             latex=None,
-                            table_structure={
-                                "rows": table_data.get("rows"),
-                                "columns": table_data.get("columns"),
-                                "cells": table_data.get("cells", []),
-                            },
+                            # rows/columns must be > 0 (TableStructure); if the OCR
+                            # payload omits either, store None instead of raising.
+                            table_structure=(
+                                TableStructure(
+                                    rows=table_data["rows"],
+                                    columns=table_data["columns"],
+                                    cells=table_data.get("cells", []),
+                                )
+                                if table_data.get("rows") and table_data.get("columns")
+                                else None
+                            ),
                             image_description=None,
                             markdown_content=None,
                             hierarchy_level=None,
diff --git a/backend/app/tasks/extraction.py b/backend/app/tasks/extraction.py
index d3f8566873..3f54550520 100644
--- a/backend/app/tasks/extraction.py
+++ b/backend/app/tasks/extraction.py
@@ -78,11 +78,11 @@ def process_ocr_task(self: Any, ingestion_id: str) -> dict[str, Any]:
                 logger.error(f"Ingestion {ingestion_id} not found in database")
                 raise ValueError(f"Ingestion {ingestion_id} not found")
 
-            # Update status to OCR_PROCESSING
-            ingestion.status = ExtractionStatus.OCR_PROCESSING
+            # Update status to OCR_IN_PROGRESS
+            ingestion.status = ExtractionStatus.OCR_IN_PROGRESS
             db.add(ingestion)
             db.commit()
-            logger.info(f"[{ingestion_id}] Status updated to OCR_PROCESSING")
+            logger.info(f"[{ingestion_id}] Status updated to OCR_IN_PROGRESS")
 
             # Download PDF from storage
             logger.info(
diff --git a/backend/tests/services/test_ocr.py b/backend/tests/services/test_ocr.py
index 964dd11521..a6ab90bf09 100644
--- a/backend/tests/services/test_ocr.py
+++ b/backend/tests/services/test_ocr.py
@@ -204,7 +204,7 @@ def mock_handler(request: httpx.Request) -> httpx.Response:
         )
         assert table_block is not None
         assert table_block.table_structure is not None
-        assert table_block.table_structure["rows"] == 2
+        assert table_block.table_structure.rows == 2
 
     @pytest.mark.asyncio
     async def test_extract_text_api_error_400(self):
@@ -625,17 +625,17 @@ def mock_handler(request: httpx.Request) -> httpx.Response:
         # Verify table structure with cell-level detail
         table_struct = table_block.table_structure
         assert table_struct is not None
-        assert table_struct["rows"] == 4
-        assert table_struct["columns"] == 2
-        assert len(table_struct["cells"]) == 4
+        assert table_struct.rows == 4
+        assert table_struct.columns == 2
+        assert len(table_struct.cells) == 4
 
         # Verify cell data with row/column positions
-        cell_a = table_struct["cells"][0]
+        cell_a = table_struct.cells[0]
         assert cell_a["row"] == 0
         assert cell_a["col"] == 0
         assert cell_a["text"] == "A."
 
-        cell_b = table_struct["cells"][2]
+        cell_b = table_struct.cells[2]
         assert cell_b["row"] == 1
         assert cell_b["col"] == 0
         assert cell_b["text"] == "B."
diff --git a/backend/tests/tasks/test_extraction.py b/backend/tests/tasks/test_extraction.py
index 988e73b3cc..82aaade44c 100644
--- a/backend/tests/tasks/test_extraction.py
+++ b/backend/tests/tasks/test_extraction.py
@@ -100,7 +100,7 @@ def test_process_ocr_task_success(
         from app.models import Ingestion
 
         mock_db.get.assert_called_once_with(Ingestion, mock_ingestion.id)
-        assert mock_db.commit.call_count == 2  # Status OCR_PROCESSING + OCR_COMPLETE
+        assert mock_db.commit.call_count == 2  # Status OCR_IN_PROGRESS + OCR_COMPLETE
 
         # Verify ingestion status was updated to OCR_COMPLETE
         assert mock_ingestion.status == ExtractionStatus.OCR_COMPLETE
@@ -234,7 +234,7 @@ def test_process_ocr_task_updates_status_to_processing(
         mock_ingestion,
         mock_ocr_result,
     ):
-        """Test task updates status to OCR_PROCESSING before starting OCR."""
+        """Test task updates status to OCR_IN_PROGRESS before starting OCR."""
         mock_settings.MISTRAL_API_KEY = "test-api-key"
 
         mock_db = MagicMock()
@@ -257,7 +257,7 @@ def track_status_change(*args, **kwargs):
 
         process_ocr_task(str(mock_ingestion.id))
 
-        # Verify status progression: OCR_PROCESSING -> OCR_COMPLETE
+        # Verify status progression: OCR_IN_PROGRESS -> OCR_COMPLETE
         assert len(status_changes) >= 2
-        assert ExtractionStatus.OCR_PROCESSING in status_changes
+        assert ExtractionStatus.OCR_IN_PROGRESS in status_changes
         assert status_changes[-1] == ExtractionStatus.OCR_COMPLETE
diff --git a/frontend/src/client/schemas.gen.ts b/frontend/src/client/schemas.gen.ts
index 8f2edf50ef..e917082ca1 100644
--- a/frontend/src/client/schemas.gen.ts
+++ b/frontend/src/client/schemas.gen.ts
@@ -71,7 +71,7 @@ export const Body_login_login_access_tokenSchema = {
 
 export const ExtractionStatusSchema = {
     type: 'string',
-    enum: ['UPLOADED', 'OCR_PROCESSING', 'OCR_COMPLETE', 'SEGMENTATION_PROCESSING', 'SEGMENTATION_COMPLETE', 'TAGGING_PROCESSING', 'DRAFT', 'IN_REVIEW', 'APPROVED', 'REJECTED', 'FAILED'],
+    enum: ['UPLOADED', 'OCR_IN_PROGRESS', 'OCR_COMPLETE', 'OCR_FAILED', 'SEGMENTATION_PROCESSING', 'SEGMENTATION_COMPLETE', 'TAGGING_PROCESSING', 'DRAFT', 'IN_REVIEW', 'APPROVED', 'REJECTED', 'FAILED'],
     title: 'ExtractionStatus',
     description: 'Extraction pipeline status enum.'
 } as const;
diff --git a/frontend/src/client/types.gen.ts b/frontend/src/client/types.gen.ts
index 8aa2f690a7..0492cd11a1 100644
--- a/frontend/src/client/types.gen.ts
+++ b/frontend/src/client/types.gen.ts
@@ -19,7 +19,7 @@ export type Body_login_login_access_token = {
 /**
  * Extraction pipeline status enum.
  */
-export type ExtractionStatus = 'UPLOADED' | 'OCR_PROCESSING' | 'OCR_COMPLETE' | 'SEGMENTATION_PROCESSING' | 'SEGMENTATION_COMPLETE' | 'TAGGING_PROCESSING' | 'DRAFT' | 'IN_REVIEW' | 'APPROVED' | 'REJECTED' | 'FAILED';
+export type ExtractionStatus = 'UPLOADED' | 'OCR_IN_PROGRESS' | 'OCR_COMPLETE' | 'OCR_FAILED' | 'SEGMENTATION_PROCESSING' | 'SEGMENTATION_COMPLETE' | 'TAGGING_PROCESSING' | 'DRAFT' | 'IN_REVIEW' | 'APPROVED' | 'REJECTED' | 'FAILED';
 
 export type HTTPValidationError = {
     detail?: Array;