Laser/preprocessor.py at main · SpaceNetLab/Laser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import os
from filter import clean_file
from extractor import *

def preprocess(input_pdf: str, output_txt: str):
    """
    Perform the following steps for a single paper:
      1. Extract text from input_pdf to output_txt
      2. Clean output_txt (overwrite the file)
    """
    extract_sections_smart(input_pdf, output_txt)
    clean_file(output_txt, output_txt)

def main():
    pdf_root = 'paper_pdf'   # Root directory containing all PDFs
    txt_root = 'paper_txt'   # Target directory for cleaned TXT output

    os.makedirs(txt_root, exist_ok=True)

    for dirpath, _, filenames in os.walk(pdf_root):
        for fname in filenames:
            if not fname.lower().endswith('.pdf'):
                continue

            input_pdf = os.path.join(dirpath, fname)
            base = os.path.splitext(fname)[0]
            output_txt = os.path.join(txt_root, f'{base}.txt')

            try:
                preprocess(input_pdf, output_txt)
            except Exception:
                # Skip the file if an error occurs
                pass

if __name__ == '__main__':
    main()