-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_funding_source.py
More file actions
94 lines (81 loc) · 3.22 KB
/
extract_funding_source.py
File metadata and controls
94 lines (81 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
import re
import PyPDF2
from tqdm import tqdm
# Agency names/acronyms and generic funding phrases. Each pattern is searched
# case-insensitively and every literal match is reported.
_FUNDING_PATTERNS = (
    r'\bNational Science Foundation\b',
    r'\bNSF\b',
    r'\bEuropean Research Council\b',
    r'\bERC\b',
    r'\bHorizon 2020\b',
    r'\bDARPA\b',
    r'\bDefense Advanced Research Projects Agency\b',
    r'\bNIH\b',
    r'\bNational Institutes of Health\b',
    r'\bONR\b',
    r'\bOffice of Naval Research\b',
    r'\bNASA\b',
    r'\bNational Aeronautics and Space Administration\b',
    r'\bDOE\b',
    r'\bDepartment of Energy\b',
    r'\bAFOSR\b',
    r'\bAir Force Office of Scientific Research\b',
    r'\bsupported by\b',
    r'\bfunded by\b',
    r'\bgrant from\b',
    r'\bcontract\b',
)

# A "sentence" here is a run of non-period characters that contains one of the
# funding keywords and ends at the next period.
_FUNDING_SENTENCE_RE = re.compile(
    r'[^.]*\b(?:supported by|funded by|grant from|contract)\b[^.]*\.',
    re.IGNORECASE,
)


def _find_funding_mentions(text):
    """Return the unique funding mentions found in *text*, in first-seen order."""
    mentions = []
    for pattern in _FUNDING_PATTERNS:
        mentions.extend(re.findall(pattern, text, re.IGNORECASE))
    mentions.extend(
        sentence.strip() for sentence in _FUNDING_SENTENCE_RE.findall(text)
    )
    # dict.fromkeys de-duplicates while keeping insertion order, so the result
    # is the same on every run (a plain set() would shuffle it).
    return list(dict.fromkeys(mentions))


def extract_funding_sources_from_pdf(pdf_path):
    """Extract funding-related mentions from the text of a PDF file.

    The text of every page is extracted with PyPDF2 and scanned for known
    agency names/acronyms, generic funding phrases, and whole sentences that
    contain a funding phrase.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``(funding_sources, error)`` tuple. On success ``funding_sources``
        is a list of unique matched strings (ordered by first occurrence in
        the search) and ``error`` is ``None``. If anything fails while opening
        or parsing the file, ``funding_sources`` is ``None`` and ``error`` is
        the exception message.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Join pages with a newline: without a separator the last word of
            # one page fuses with the first word of the next, which defeats
            # the \b word-boundary anchors in the patterns.
            text = "\n".join(page.extract_text() or "" for page in reader.pages)
        return _find_funding_mentions(text), None
    except Exception as e:
        # Deliberate catch-all: callers expect failures to be reported through
        # the second tuple element rather than raised.
        return None, str(e)
def process_pdfs_for_funding_sources(root_folder):
    """Scan every PDF under *root_folder* for funding mentions.

    Args:
        root_folder: Directory that is walked recursively for ``.pdf`` files
            (the extension check is case-insensitive).

    Returns:
        A ``(funding_data, error_log)`` tuple. ``funding_data`` is a list of
        dicts with the keys ``"PDF Name"``, ``"Folder"`` (path of the
        containing folder relative to *root_folder*) and ``"Funding Sources"``,
        one per PDF that yielded at least one mention. ``error_log`` is a list
        of ``"<path> | Error: <message>"`` strings, one per PDF whose
        extraction failed. PDFs with no mentions and no error appear in
        neither list.
    """
    funding_data = []
    error_log = []

    # Collect all PDFs first so the progress bar knows the total count.
    pdf_files = []
    for foldername, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.lower().endswith('.pdf'):
                pdf_files.append((foldername, filename))

    for foldername, filename in tqdm(pdf_files, desc="Processing PDFs", unit="file"):
        pdf_path = os.path.join(foldername, filename)
        funding_sources, error = extract_funding_sources_from_pdf(pdf_path)
        if funding_sources:
            funding_data.append({
                "PDF Name": filename,
                "Folder": os.path.relpath(foldername, root_folder),
                "Funding Sources": funding_sources,
            })
        elif error is not None:
            # Compare against None, not truthiness: an exception with an empty
            # message stringifies to "" and would otherwise be dropped
            # silently instead of being logged.
            error_log.append(f"{pdf_path} | Error: {error}")

    return funding_data, error_log
if __name__ == "__main__":
    # Script entry point: scan PDF_FOLDER, print a report of the funding
    # mentions found per PDF, and write any extraction errors to a log file
    # in the current working directory.
    PDF_FOLDER = r"C:\Users\mateo\..."  # Update as needed
    funding_results, errors = process_pdfs_for_funding_sources(PDF_FOLDER)

    for result in funding_results:
        print(f"PDF: {result['PDF Name']}")
        print(f"Folder: {result['Folder']}")
        print(f"Funding Sources: {', '.join(result['Funding Sources'])}")
        print("-" * 40)

    if errors:
        # Explicit UTF-8: the log lines contain file paths, and the platform
        # default encoding (e.g. a Windows code page) can raise
        # UnicodeEncodeError on non-ASCII path characters.
        with open("funding_extraction_errors.log", "w", encoding="utf-8") as f:
            f.write("\n".join(errors))
        print(f"Encountered {len(errors)} errors (see funding_extraction_errors.log)")