Skip to content

Commit c9b7b99

Browse files
authored
Merge pull request #8 from RWTH-TIME/issue-4/move-dtm-bow-to-nlp
Remove BOW and DTM logic
2 parents a14a927 + e36e7ac commit c9b7b99

15 files changed

Lines changed: 291 additions & 370 deletions

.github/workflows/ci.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,19 @@ jobs:
7878
--health-interval 5s
7979
--health-retries 5
8080
--health-timeout 5s
81+
postgres:
82+
image: postgres:15
83+
ports:
84+
- 5432:5432
85+
env:
86+
POSTGRES_USER: postgres
87+
POSTGRES_PASSWORD: postgres
88+
POSTGRES_DB: postgres
89+
options: >-
90+
--health-cmd="pg_isready -U postgres"
91+
--health-interval=5s
92+
--health-retries=10
93+
--health-timeout=5s
8194
steps:
8295
- uses: actions/checkout@v4
8396

cbc.yaml

Lines changed: 20 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
author: Paul Kalhorn
1+
author: Paul Kalhorn
22
description: Language preprocessing for .txt or .bib files
3-
docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing
3+
docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing
44
entrypoints:
55
preprocess_bib_file:
6-
description: Entrypoint for preprocessing a .bib file
6+
description: Entrypoint for preprocessing an attribute of a .bib file
77
envs:
88
BIB_DOWNLOAD_PATH: /tmp/input.bib
99
FILTER_STOPWORDS: true
@@ -27,30 +27,15 @@ entrypoints:
2727
description: The bib file, as well as one attribute selected for preprocessing
2828
type: file
2929
outputs:
30-
dtm_output:
30+
normalized_docs_output:
3131
config:
32-
dtm_output_BUCKET_NAME: null
33-
dtm_output_FILE_EXT: pkl
34-
dtm_output_FILE_NAME: null
35-
dtm_output_FILE_PATH: null
36-
dtm_output_S3_ACCESS_KEY: null
37-
dtm_output_S3_HOST: null
38-
dtm_output_S3_PORT: null
39-
dtm_output_S3_SECRET_KEY: null
40-
description: Numpy representation of document-term matrix as .pkl file
41-
type: file
42-
vocab_output:
43-
config:
44-
vocab_output_BUCKET_NAME: null
45-
vocab_output_FILE_EXT: pkl
46-
vocab_output_FILE_NAME: null
47-
vocab_output_FILE_PATH: null
48-
vocab_output_S3_ACCESS_KEY: null
49-
vocab_output_S3_HOST: null
50-
vocab_output_S3_PORT: null
51-
vocab_output_S3_SECRET_KEY: null
52-
description: Pkl file of a dictionary that maps all words to their index in the DTM
53-
type: file
32+
normalized_docs_DB_TABLE: null
33+
normalized_docs_PG_HOST: null
34+
normalized_docs_PG_PASS: null
35+
normalized_docs_PG_PORT: null
36+
normalized_docs_PG_USER: null
37+
description: Database Output, containing bib_id as well as the normalized text
38+
type: pg_table
5439
preprocess_txt_file:
5540
description: Entrypoint to preprocess a .txt file
5641
envs:
@@ -72,31 +57,16 @@ entrypoints:
7257
txt_file_S3_HOST: null
7358
txt_file_S3_PORT: null
7459
txt_file_S3_SECRET_KEY: null
75-
description: A .txt file
60+
description: A .txt file, each line will be treated as a document
7661
type: file
7762
outputs:
78-
dtm_output:
63+
normalized_docs_output:
7964
config:
80-
dtm_output_BUCKET_NAME: null
81-
dtm_output_FILE_EXT: pkl
82-
dtm_output_FILE_NAME: null
83-
dtm_output_FILE_PATH: null
84-
dtm_output_S3_ACCESS_KEY: null
85-
dtm_output_S3_HOST: null
86-
dtm_output_S3_PORT: null
87-
dtm_output_S3_SECRET_KEY: null
88-
description: Numpy representation of document-term matrix as .pkl file
89-
type: file
90-
vocab_output:
91-
config:
92-
vocab_output_BUCKET_NAME: null
93-
vocab_output_FILE_EXT: pkl
94-
vocab_output_FILE_NAME: null
95-
vocab_output_FILE_PATH: null
96-
vocab_output_S3_ACCESS_KEY: null
97-
vocab_output_S3_HOST: null
98-
vocab_output_S3_PORT: null
99-
vocab_output_S3_SECRET_KEY: null
100-
description: Pkl file of a dictionary that maps all words to their index in the DTM
101-
type: file
65+
normalized_docs_DB_TABLE: null
66+
normalized_docs_PG_HOST: null
67+
normalized_docs_PG_PASS: null
68+
normalized_docs_PG_PORT: null
69+
normalized_docs_PG_USER: null
70+
description: Database Output, containing bib_id as well as the normalized text
71+
type: pg_table
10272
name: Language-Preprocessing

docker-compose.yml

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@ services:
1313
ports:
1414
- "9000:9000"
1515
- "9001:9001"
16-
networks:
17-
- scystream-net
1816

19-
networks:
20-
scystream-net:
21-
driver: bridge
17+
postgres:
18+
image: postgres:13
19+
container_name: postgres
20+
environment:
21+
- POSTGRES_USER=postgres
22+
- POSTGRES_PASSWORD=postgres
23+
- POSTGRES_DB=postgres
24+
ports:
25+
- "5432:5432"
2226

2327
volumes:
2428
minio_data:
25-
search_query:

main.py

Lines changed: 41 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,21 @@
1-
import pickle
2-
import tempfile
31
import logging
2+
import pandas as pd
3+
from sqlalchemy import create_engine
44

5+
from typing import List
56
from scystream.sdk.core import entrypoint
67
from scystream.sdk.env.settings import (
78
EnvSettings,
89
InputSettings,
910
OutputSettings,
10-
FileSettings
11+
FileSettings,
12+
PostgresSettings
1113
)
1214
from scystream.sdk.file_handling.s3_manager import S3Operations
1315

1416
from preprocessing.core import Preprocessor
1517
from preprocessing.loader import TxtLoader, BibLoader
18+
from preprocessing.models import DocumentRecord, PreprocessedDocument
1619

1720
logging.basicConfig(
1821
level=logging.INFO,
@@ -21,16 +24,8 @@
2124
logger = logging.getLogger(__name__)
2225

2326

24-
class DTMFileOutput(FileSettings, OutputSettings):
25-
__identifier__ = "dtm_output"
26-
27-
FILE_EXT: str = "pkl"
28-
29-
30-
class VocabFileOutput(FileSettings, OutputSettings):
31-
__identifier__ = "vocab_output"
32-
33-
FILE_EXT: str = "pkl"
27+
class NormalizedDocsOutput(PostgresSettings, OutputSettings):
28+
__identifier__ = "normalized_docs"
3429

3530

3631
class TXTFileInput(FileSettings, InputSettings):
@@ -56,8 +51,7 @@ class PreprocessTXT(EnvSettings):
5651
TXT_DOWNLOAD_PATH: str = "/tmp/input.txt"
5752

5853
txt_input: TXTFileInput
59-
dtm_output: DTMFileOutput
60-
vocab_output: VocabFileOutput
54+
normalized_docs_output: NormalizedDocsOutput
6155

6256

6357
class PreprocessBIB(EnvSettings):
@@ -71,13 +65,37 @@ class PreprocessBIB(EnvSettings):
7165
BIB_DOWNLOAD_PATH: str = "/tmp/input.bib"
7266

7367
bib_input: BIBFileInput
74-
dtm_output: DTMFileOutput
75-
vocab_output: VocabFileOutput
68+
normalized_docs_output: NormalizedDocsOutput
69+
70+
71+
def _write_preprocessed_docs_to_postgres(
72+
preprocessed_ouput: List[PreprocessedDocument],
73+
settings: PostgresSettings
74+
):
75+
df = pd.DataFrame([
76+
{
77+
"doc_id": d.doc_id,
78+
"tokens": d.tokens
79+
}
80+
for d in preprocessed_ouput
81+
])
82+
83+
logger.info(f"Writing {len(df)} processed documents to DB table '{
84+
settings.DB_TABLE}'…")
85+
engine = create_engine(
86+
f"postgresql+psycopg2://{settings.PG_USER}:{settings.PG_PASS}"
87+
f"@{settings.PG_HOST}:{int(settings.PG_PORT)}/"
88+
)
89+
90+
df.to_sql(settings.DB_TABLE, engine, if_exists="replace", index=False)
91+
92+
logger.info(f"Successfully stored normalized documents into '{
93+
settings.DB_TABLE}'.")
7694

7795

78-
def _preprocess_and_store(texts, settings):
96+
def _preprocess_and_store(documents: List[DocumentRecord], settings):
7997
"""Shared preprocessing logic for TXT and BIB."""
80-
logger.info(f"Starting preprocessing with {len(texts)} documents")
98+
logger.info(f"Starting preprocessing with {len(documents)} documents")
8199

82100
pre = Preprocessor(
83101
language=settings.LANGUAGE,
@@ -88,27 +106,11 @@ def _preprocess_and_store(texts, settings):
88106
ngram_max=settings.NGRAM_MAX,
89107
)
90108

91-
pre.texts = texts
92-
pre.analyze_texts()
93-
94-
pre.generate_bag_of_words()
95-
96-
dtm, vocab = pre.generate_document_term_matrix()
97-
98-
with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \
99-
tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab:
100-
101-
pickle.dump(dtm, tmp_dtm)
102-
tmp_dtm.flush()
103-
104-
pickle.dump(vocab, tmp_vocab)
105-
tmp_vocab.flush()
106-
107-
logger.info("Uploading DTM to S3...")
108-
S3Operations.upload(settings.dtm_output, tmp_dtm.name)
109+
pre.documents = documents
110+
result = pre.generate_normalized_output()
109111

110-
logger.info("Uploading vocabulary to S3...")
111-
S3Operations.upload(settings.vocab_output, tmp_vocab.name)
112+
_write_preprocessed_docs_to_postgres(
113+
result, settings.normalized_docs_output)
112114

113115
logger.info("Preprocessing completed successfully.")
114116

0 commit comments

Comments
 (0)