# checker.py
# Audits the per-year ICRA article PDF folders and writes a report of
# missing, duplicate, and image-only (needs OCR) files.
import os
from PyPDF2 import PdfReader
from tqdm import tqdm
# --- CONFIG ---
# Root folder that holds one "<year> ICRA Articles" subfolder per year.
central_dir = r"C:\Users\mateo..."  # Add path

# Highest article ID expected for each year; IDs 1..value are checked.
expected_counts = {
    1984: 76,
    1985: 168,
    1986: 343,
    1987: 331,
    1988: 330,
    1989: 282,
    1990: 332,
    1991: 416,
    1992: 429,
    1993: 456,
    1994: 534,
    1995: 541,
    1996: 594,
    1997: 581,
    1998: 600,
    1999: 528,
    2000: 647,
    2001: 686,
    2002: 697,
    2003: 723,
    2004: 905,
    2005: 771,
}
# --- SCRIPT ---
# Accumulates the report text across all years; each entry becomes one
# line of the report file.
report_lines = []
def pdf_has_text(filepath, max_pages: int = 3) -> bool:
    """Quick check whether a PDF has selectable text.

    Only the first ``max_pages`` pages are sampled, so this is a heuristic
    rather than a full scan of the document.

    Args:
        filepath: Path of the PDF file to inspect.
        max_pages: Upper bound on the number of leading pages examined.

    Returns:
        True as soon as one sampled page yields non-whitespace text.
        False when no sampled page does, or when the file cannot be
        opened or parsed at all.
    """
    try:
        reader = PdfReader(filepath)
        pages_to_check = min(len(reader.pages), max_pages)
        for i in range(pages_to_check):
            text = reader.pages[i].extract_text()
            # extract_text() may give back None or whitespace only for
            # image-only pages; neither counts as selectable text.
            if text and text.strip():
                return True
        return False
    except Exception:
        # Deliberate best effort: an unreadable or corrupt PDF is reported
        # the same way as an image-only one instead of aborting the run.
        return False  # treat errors as no text
# Audit each configured year: look for missing IDs, duplicate IDs, and PDFs
# without selectable text, and append the findings to report_lines.
for year, expected_max in expected_counts.items():
    year_folder = os.path.join(central_dir, f"{year} ICRA Articles")
    if not os.path.isdir(year_folder):
        report_lines.append(f"Year folder not found: {year}")
        continue

    # Only files named "<year>-<something>.pdf" take part in the audit.
    # os.listdir returns names in arbitrary order, so sort them to keep the
    # report identical from one run to the next.
    prefix = f"{year}-"
    files = sorted(
        f for f in os.listdir(year_folder)
        if f.endswith(".pdf") and f.startswith(prefix)
    )

    id_to_files = {}  # numeric ID -> every file name that carries it
    ocr_needed = []

    # progress bar per year
    for f in tqdm(files, desc=f"Processing {year}", unit="file"):
        # The ID is the text between the first "-" and the following ".".
        # The prefix filter above guarantees that the "-" is present.
        id_text = f.split("-", 1)[1].split(".", 1)[0]
        try:
            num = int(id_text)
        except ValueError:
            # Non-numeric ID: the file is ignored entirely (no OCR check).
            continue
        id_to_files.setdefault(num, []).append(f)

        # OCR check
        if not pdf_has_text(os.path.join(year_folder, f)):
            ocr_needed.append(f)

    # Missing IDs: every ID from 1 to expected_max with no matching file.
    missing = [
        f"{year}-{i}.pdf"
        for i in range(1, expected_max + 1)
        if i not in id_to_files
    ]

    # Duplicates: IDs that more than one file name maps to.
    duplicates = {
        num: names for num, names in id_to_files.items() if len(names) > 1
    }

    # Build report; years with nothing to report produce no section.
    if missing or duplicates or ocr_needed:
        report_lines.append(f"===== {year} =====")
        if missing:
            report_lines.append(
                f"Missing ({len(missing)}): {', '.join(missing)}"
            )
        for num, dup_files in duplicates.items():
            report_lines.append(
                f"Duplicate ID {year}-{num}: {', '.join(dup_files)}"
            )
        if ocr_needed:
            report_lines.append(
                f"Needs OCR ({len(ocr_needed)}): {', '.join(ocr_needed)}"
            )
# Write report: one finding per line, or a single all-clear message when
# nothing was collected.
report_path = os.path.join(central_dir, "missing_files_report.txt")
with open(report_path, "w", encoding="utf-8") as report_file:
    if report_lines:
        # End the file with a newline, as the all-clear branch below does.
        report_file.write("\n".join(report_lines) + "\n")
    else:
        report_file.write("No missing, duplicate, or image-only PDFs detected!\n")
print(f"Report written to {report_path}")