Skip to content

Commit c9b7b99

Browse files
authored
Merge pull request #8 from RWTH-TIME/issue-4/move-dtm-bow-to-nlp
Remove BOW and DTM logic
2 parents a14a927 + e36e7ac commit c9b7b99

15 files changed

Lines changed: 291 additions & 370 deletions

.github/workflows/ci.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,19 @@ jobs:
7878
--health-interval 5s
7979
--health-retries 5
8080
--health-timeout 5s
81+
postgres:
82+
image: postgres:15
83+
ports:
84+
- 5432:5432
85+
env:
86+
POSTGRES_USER: postgres
87+
POSTGRES_PASSWORD: postgres
88+
POSTGRES_DB: postgres
89+
options: >-
90+
--health-cmd="pg_isready -U postgres"
91+
--health-interval=5s
92+
--health-retries=10
93+
--health-timeout=5s
8194
steps:
8295
- uses: actions/checkout@v4
8396

cbc.yaml

Lines changed: 20 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
author: Paul Kalhorn
1+
author: Paul Kalhorn
22
description: Language preprocessing for .txt or .bib files
3-
docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing
3+
docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing
44
entrypoints:
55
preprocess_bib_file:
6-
description: Entrypoint for preprocessing a .bib file
6+
description: Entrypoint for preprocessing an attribute of a .bib file
77
envs:
88
BIB_DOWNLOAD_PATH: /tmp/input.bib
99
FILTER_STOPWORDS: true
@@ -27,30 +27,15 @@ entrypoints:
2727
description: The bib file, as well as one attribute selected for preprocessing
2828
type: file
2929
outputs:
30-
dtm_output:
30+
normalized_docs_output:
3131
config:
32-
dtm_output_BUCKET_NAME: null
33-
dtm_output_FILE_EXT: pkl
34-
dtm_output_FILE_NAME: null
35-
dtm_output_FILE_PATH: null
36-
dtm_output_S3_ACCESS_KEY: null
37-
dtm_output_S3_HOST: null
38-
dtm_output_S3_PORT: null
39-
dtm_output_S3_SECRET_KEY: null
40-
description: Numpy representation of document-term matrix as .pkl file
41-
type: file
42-
vocab_output:
43-
config:
44-
vocab_output_BUCKET_NAME: null
45-
vocab_output_FILE_EXT: pkl
46-
vocab_output_FILE_NAME: null
47-
vocab_output_FILE_PATH: null
48-
vocab_output_S3_ACCESS_KEY: null
49-
vocab_output_S3_HOST: null
50-
vocab_output_S3_PORT: null
51-
vocab_output_S3_SECRET_KEY: null
52-
description: Pkl file of a dictionary that maps all words to their index in the DTM
53-
type: file
32+
normalized_docs_DB_TABLE: null
33+
normalized_docs_PG_HOST: null
34+
normalized_docs_PG_PASS: null
35+
normalized_docs_PG_PORT: null
36+
normalized_docs_PG_USER: null
37+
description: Database Output, containing bib_id as well as the normalized text
38+
type: pg_table
5439
preprocess_txt_file:
5540
description: Entrypoint to preprocess a .txt file
5641
envs:
@@ -72,31 +57,16 @@ entrypoints:
7257
txt_file_S3_HOST: null
7358
txt_file_S3_PORT: null
7459
txt_file_S3_SECRET_KEY: null
75-
description: A .txt file
60+
description: A .txt file, each line will be treated as a document
7661
type: file
7762
outputs:
78-
dtm_output:
63+
normalized_docs_output:
7964
config:
80-
dtm_output_BUCKET_NAME: null
81-
dtm_output_FILE_EXT: pkl
82-
dtm_output_FILE_NAME: null
83-
dtm_output_FILE_PATH: null
84-
dtm_output_S3_ACCESS_KEY: null
85-
dtm_output_S3_HOST: null
86-
dtm_output_S3_PORT: null
87-
dtm_output_S3_SECRET_KEY: null
88-
description: Numpy representation of document-term matrix as .pkl file
89-
type: file
90-
vocab_output:
91-
config:
92-
vocab_output_BUCKET_NAME: null
93-
vocab_output_FILE_EXT: pkl
94-
vocab_output_FILE_NAME: null
95-
vocab_output_FILE_PATH: null
96-
vocab_output_S3_ACCESS_KEY: null
97-
vocab_output_S3_HOST: null
98-
vocab_output_S3_PORT: null
99-
vocab_output_S3_SECRET_KEY: null
100-
description: Pkl file of a dictionary that maps all words to their index in the DTM
101-
type: file
65+
normalized_docs_DB_TABLE: null
66+
normalized_docs_PG_HOST: null
67+
normalized_docs_PG_PASS: null
68+
normalized_docs_PG_PORT: null
69+
normalized_docs_PG_USER: null
70+
description: Database Output, containing bib_id as well as the normalized text
71+
type: pg_table
10272
name: Language-Preprocessing

docker-compose.yml

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@ services:
1313
ports:
1414
- "9000:9000"
1515
- "9001:9001"
16-
networks:
17-
- scystream-net
1816

19-
networks:
20-
scystream-net:
21-
driver: bridge
17+
postgres:
18+
image: postgres:13
19+
container_name: postgres
20+
environment:
21+
- POSTGRES_USER=postgres
22+
- POSTGRES_PASSWORD=postgres
23+
- POSTGRES_DB=postgres
24+
ports:
25+
- "5432:5432"
2226

2327
volumes:
2428
minio_data:
25-
search_query:

main.py

Lines changed: 41 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,21 @@
1-
import pickle
2-
import tempfile
31
import logging
2+
import pandas as pd
3+
from sqlalchemy import create_engine
44

5+
from typing import List
56
from scystream.sdk.core import entrypoint
67
from scystream.sdk.env.settings import (
78
EnvSettings,
89
InputSettings,
910
OutputSettings,
10-
FileSettings
11+
FileSettings,
12+
PostgresSettings
1113
)
1214
from scystream.sdk.file_handling.s3_manager import S3Operations
1315

1416
from preprocessing.core import Preprocessor
1517
from preprocessing.loader import TxtLoader, BibLoader
18+
from preprocessing.models import DocumentRecord, PreprocessedDocument
1619

1720
logging.basicConfig(
1821
level=logging.INFO,
@@ -21,16 +24,8 @@
2124
logger = logging.getLogger(__name__)
2225

2326

24-
class DTMFileOutput(FileSettings, OutputSettings):
25-
__identifier__ = "dtm_output"
26-
27-
FILE_EXT: str = "pkl"
28-
29-
30-
class VocabFileOutput(FileSettings, OutputSettings):
31-
__identifier__ = "vocab_output"
32-
33-
FILE_EXT: str = "pkl"
27+
class NormalizedDocsOutput(PostgresSettings, OutputSettings):
28+
__identifier__ = "normalized_docs"
3429

3530

3631
class TXTFileInput(FileSettings, InputSettings):
@@ -56,8 +51,7 @@ class PreprocessTXT(EnvSettings):
5651
TXT_DOWNLOAD_PATH: str = "/tmp/input.txt"
5752

5853
txt_input: TXTFileInput
59-
dtm_output: DTMFileOutput
60-
vocab_output: VocabFileOutput
54+
normalized_docs_output: NormalizedDocsOutput
6155

6256

6357
class PreprocessBIB(EnvSettings):
@@ -71,13 +65,37 @@ class PreprocessBIB(EnvSettings):
7165
BIB_DOWNLOAD_PATH: str = "/tmp/input.bib"
7266

7367
bib_input: BIBFileInput
74-
dtm_output: DTMFileOutput
75-
vocab_output: VocabFileOutput
68+
normalized_docs_output: NormalizedDocsOutput
69+
70+
71+
def _write_preprocessed_docs_to_postgres(
72+
preprocessed_ouput: List[PreprocessedDocument],
73+
settings: PostgresSettings
74+
):
75+
df = pd.DataFrame([
76+
{
77+
"doc_id": d.doc_id,
78+
"tokens": d.tokens
79+
}
80+
for d in preprocessed_ouput
81+
])
82+
83+
logger.info(f"Writing {len(df)} processed documents to DB table '{
84+
settings.DB_TABLE}'…")
85+
engine = create_engine(
86+
f"postgresql+psycopg2://{settings.PG_USER}:{settings.PG_PASS}"
87+
f"@{settings.PG_HOST}:{int(settings.PG_PORT)}/"
88+
)
89+
90+
df.to_sql(settings.DB_TABLE, engine, if_exists="replace", index=False)
91+
92+
logger.info(f"Successfully stored normalized documents into '{
93+
settings.DB_TABLE}'.")
7694

7795

78-
def _preprocess_and_store(texts, settings):
96+
def _preprocess_and_store(documents: List[DocumentRecord], settings):
7997
"""Shared preprocessing logic for TXT and BIB."""
80-
logger.info(f"Starting preprocessing with {len(texts)} documents")
98+
logger.info(f"Starting preprocessing with {len(documents)} documents")
8199

82100
pre = Preprocessor(
83101
language=settings.LANGUAGE,
@@ -88,27 +106,11 @@ def _preprocess_and_store(texts, settings):
88106
ngram_max=settings.NGRAM_MAX,
89107
)
90108

91-
pre.texts = texts
92-
pre.analyze_texts()
93-
94-
pre.generate_bag_of_words()
95-
96-
dtm, vocab = pre.generate_document_term_matrix()
97-
98-
with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \
99-
tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab:
100-
101-
pickle.dump(dtm, tmp_dtm)
102-
tmp_dtm.flush()
103-
104-
pickle.dump(vocab, tmp_vocab)
105-
tmp_vocab.flush()
106-
107-
logger.info("Uploading DTM to S3...")
108-
S3Operations.upload(settings.dtm_output, tmp_dtm.name)
109+
pre.documents = documents
110+
result = pre.generate_normalized_output()
109111

110-
logger.info("Uploading vocabulary to S3...")
111-
S3Operations.upload(settings.vocab_output, tmp_vocab.name)
112+
_write_preprocessed_docs_to_postgres(
113+
result, settings.normalized_docs_output)
112114

113115
logger.info("Preprocessing completed successfully.")
114116

0 commit comments

Comments
 (0)