1414from preprocessing .core import Preprocessor
1515from preprocessing .loader import TxtLoader , BibLoader
1616
17-
1817logging .basicConfig (
1918 level = logging .INFO ,
2019 format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
@@ -54,6 +53,8 @@ class PreprocessTXT(EnvSettings):
5453 NGRAM_MIN : int = 2
5554 NGRAM_MAX : int = 3
5655
56+ TXT_DOWNLOAD_PATH : str = "/tmp/input.txt"
57+
5758 txt_input : TXTFileInput
5859 dtm_output : DTMFileOutput
5960 vocab_output : VocabFileOutput
@@ -67,6 +68,8 @@ class PreprocessBIB(EnvSettings):
6768 NGRAM_MIN : int = 2
6869 NGRAM_MAX : int = 3
6970
71+ BIB_DOWNLOAD_PATH : str = "/tmp/input.bib"
72+
7073 bib_input : BIBFileInput
7174 dtm_output : DTMFileOutput
7275 vocab_output : VocabFileOutput
@@ -113,20 +116,20 @@ def _preprocess_and_store(texts, settings):
@entrypoint(PreprocessTXT)
def preprocess_txt_file(settings):
    """Download the TXT input from S3 and run it through preprocessing.

    The object described by ``settings.txt_input`` is downloaded to
    ``settings.TXT_DOWNLOAD_PATH``, read back with ``TxtLoader.load``, and
    the loaded texts are handed to ``_preprocess_and_store``.

    Args:
        settings: Settings object for this entrypoint (presumably a
            ``PreprocessTXT`` instance supplied by ``entrypoint`` -- confirm).
            This function reads its ``txt_input`` and ``TXT_DOWNLOAD_PATH``
            attributes and forwards the whole object to
            ``_preprocess_and_store``.
    """
    # Read the path once so the download target and the loader source are
    # guaranteed to be the same file.
    download_path = settings.TXT_DOWNLOAD_PATH

    logger.info("Downloading TXT input from S3...")
    S3Operations.download(settings.txt_input, download_path)

    texts = TxtLoader.load(download_path)

    _preprocess_and_store(texts, settings)
121124
122125
@entrypoint(PreprocessBIB)
def preprocess_bib_file(settings):
    """Download the BIB input from S3 and run it through preprocessing.

    The object described by ``settings.bib_input`` is downloaded to
    ``settings.BIB_DOWNLOAD_PATH``, read back with ``BibLoader.load`` using
    the attribute named by ``settings.bib_input.SELECTED_ATTRIBUTE``, and the
    loaded texts are handed to ``_preprocess_and_store``.

    Args:
        settings: Settings object for this entrypoint (presumably a
            ``PreprocessBIB`` instance supplied by ``entrypoint`` -- confirm).
            This function reads its ``bib_input`` and ``BIB_DOWNLOAD_PATH``
            attributes and forwards the whole object to
            ``_preprocess_and_store``.
    """
    # Read the path once so the download target and the loader source are
    # guaranteed to be the same file.
    download_path = settings.BIB_DOWNLOAD_PATH

    logger.info("Downloading BIB input from S3...")
    S3Operations.download(settings.bib_input, download_path)

    texts = BibLoader.load(
        download_path,
        attribute=settings.bib_input.SELECTED_ATTRIBUTE,
    )
    _preprocess_and_store(texts, settings)
0 commit comments