-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyze_layout.py
More file actions
195 lines (170 loc) · 6.66 KB
/
analyze_layout.py
File metadata and controls
195 lines (170 loc) · 6.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
"""
This code sample shows Prebuilt Layout operations with the Azure AI Document Intelligence client library.
The async versions of the samples require Python 3.8 or later.
To learn more, please visit the documentation - Quickstart: Document Intelligence (formerly Form Recognizer) SDKs
https://learn.microsoft.com/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api?pivots=programming-language-python
"""
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from PyPDF2 import PdfReader
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment
import os
import json
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# Set environment variables in .env file
# Required configuration (all read from the environment):
#   AZURE_ENDPOINT  - Document Intelligence resource endpoint URL
#   AZURE_KEY       - API key for that resource
#   PDF_FILE_PATH   - path of the local PDF to analyze
AZURE_ENDPOINT = os.getenv('AZURE_ENDPOINT')
AZURE_KEY = os.getenv('AZURE_KEY')
PDF_FILE_PATH = os.getenv('PDF_FILE_PATH')
# Fail fast if any setting is missing (os.getenv returns None in that case,
# which makes all() falsy).
if not all([AZURE_ENDPOINT, AZURE_KEY, PDF_FILE_PATH]):
    raise ValueError("Missing required environment variables. Please check your .env file.")
# Count pages and build pages string
reader = PdfReader(PDF_FILE_PATH)
num_pages = len(reader.pages)
pages_string = f"1-{num_pages}" #process all pages of PDF
print(f"PDF has {num_pages} pages; requesting pages='{pages_string}'")
#Submit single analyze request for all pages
with open(PDF_FILE_PATH, "rb") as f:
    pdf_bytes = f.read()
client = DocumentIntelligenceClient(
    endpoint=AZURE_ENDPOINT,
    credential=AzureKeyCredential(AZURE_KEY)
)
# Long-running operation: send the raw PDF bytes to the prebuilt layout
# model, asking for every page in a single call.
poller = client.begin_analyze_document(
    model_id="prebuilt-layout",
    body=AnalyzeDocumentRequest(bytes_source=pdf_bytes),
    pages=pages_string
)
result = poller.result()
# If service returns fewer pages than expected, fall back to per-page calls
if len(result.pages) < num_pages:
    print(f"Warning: got {len(result.pages)} pages back; falling back to per-page requests.")
    all_pages = []
    all_tables = []
    # Re-submit one analyze request per page and stitch the responses together.
    for p in range(1, num_pages + 1):
        sub_poller = client.begin_analyze_document(
            model_id="prebuilt-layout",
            body=AnalyzeDocumentRequest(bytes_source=pdf_bytes),
            pages=str(p)
        )
        resp = sub_poller.result()
        all_pages.extend(resp.pages)
        # NOTE(review): assumes resp.tables is always a list — confirm it is
        # never None for pages with no tables, or extend() will raise.
        all_tables.extend(resp.tables)
    # Overwrite the partial single-call result with the stitched per-page data.
    result.pages = all_pages
    result.tables = all_tables
# Extract lines and tables
all_lines = []
all_tables = []
# Flatten every text line on every page into a plain-dict record.
for page in result.pages:
    for idx, line in enumerate(page.lines):
        all_lines.append({
            "page_number": page.page_number,
            "line_number": idx + 1,  # 1-based position of the line within its page
            "content": line.content,
            # Polygon points stringified for JSON/Excel friendliness; empty
            # list when the line carries no polygon attribute.
            "bounding_box": [str(pt) for pt in getattr(line, "polygon", [])]
        })
# Summarize each detected table: its page, dimensions, and cell contents.
for ti, table in enumerate(result.tables):
    br = table.bounding_regions[0] if table.bounding_regions else None
    page_num = br.page_number if br else 1  # default to page 1 when no region info
    cells = []
    for cell in table.cells:
        cells.append({
            "row_index": cell.row_index,
            "column_index": cell.column_index,
            "content": cell.content,
            # Service marks header cells with kind == "columnHeader".
            "is_header": getattr(cell, "kind", "") == "columnHeader"
        })
    all_tables.append({
        "table_number": ti + 1,  # 1-based table numbering
        "page_number": page_num,
        "row_count": table.row_count,
        "column_count": table.column_count,
        "cells": cells
    })
print(f"Extracted {len(result.pages)} pages, {len(all_lines)} lines, {len(all_tables)} tables.")
#Saving to Excel File
def save_to_excel(data, filename="document_analysis.xlsx"):
    """Write extracted tables and page metadata to an Excel workbook.

    One sheet per table (``Table_1``, ``Table_2``, ...) plus a ``Pages``
    sheet of per-page metadata. Header rows are bolded and centered, and
    column widths are auto-sized (capped at 50).

    Args:
        data: Dict with "tables" (list of dicts holding "cells",
            "row_count", "column_count") and "pages" (list of dicts with
            "page_number", "width", "height", "unit", "angle").
        filename: Output .xlsx path (default "document_analysis.xlsx").
    """
    wb = Workbook()
    # remove default sheet so only our named sheets remain
    if "Sheet" in wb.sheetnames:
        wb.remove(wb["Sheet"])
    # tables → separate sheets
    for ti, table in enumerate(data["tables"]):
        ws = wb.create_sheet(f"Table_{ti+1}")
        # Index cells once by (row, column) instead of re-scanning the cell
        # list for every grid position (original was O(rows*cols*cells)).
        grid = {(c["row_index"], c["column_index"]): c["content"]
                for c in table["cells"]}
        # header row: row 0 contents, with a placeholder for missing columns
        headers = {c["column_index"]: c["content"] for c in table["cells"] if c["row_index"] == 0}
        if headers:
            ws.append([headers.get(i, f"Column_{i}") for i in range(table["column_count"])])
        # data rows: rows that are entirely empty are skipped
        for r in range(1, table["row_count"]):
            row = [grid.get((r, col), "") for col in range(table["column_count"])]
            if any(row):
                ws.append(row)
    # pages → one sheet of per-page metadata
    ws = wb.create_sheet("Pages")
    ws.append(["Page Number", "Width", "Height", "Unit", "Angle"])
    for page in data["pages"]:
        ws.append([
            page["page_number"],
            page["width"],
            page["height"],
            page["unit"],
            page["angle"]
        ])
    # formatting: auto-size columns and bold/center the first row of each sheet
    for sheet in wb.worksheets:
        for col in sheet.columns:
            max_len = max((len(str(cell.value)) for cell in col), default=0)
            sheet.column_dimensions[col[0].column_letter].width = min((max_len + 2) * 1.2, 50)
        for cell in sheet[1]:
            cell.font = Font(bold=True)
            cell.alignment = Alignment(horizontal="center")
    wb.save(filename)
    # Bug fix: original f-string had no placeholder and printed the literal
    # text instead of the actual output path.
    print(f"Excel saved to {filename}")
#Saving to Json File
def save_table_to_json(data, filename="tables_data.json"):
    """Flatten extracted tables into a list of row dicts and write JSON.

    The first row of each table (row_index 0) is treated as the header;
    every subsequent row becomes one dict mapping header text to cell
    content. Missing header cells fall back to "Heading" (column 0) or
    "Column_<i>" (other columns).

    Args:
        data: Dict with a "tables" list; each table dict holds "cells"
            (dicts with "row_index", "column_index", "content").
        filename: Output .json path (default "tables_data.json").
    """
    formatted = []
    for table in data["tables"]:
        # reorganize per row: {row_index: {column_index: content}}
        rows = {}
        for c in table["cells"]:
            rows.setdefault(c["row_index"], {})[c["column_index"]] = c["content"]
        header = rows.get(0, {})
        for r, cols in rows.items():
            if r == 0:
                continue  # the header row is not a data entry
            # First column is keyed by its header text (or "Heading" fallback).
            entry = {header.get(0, "Heading"): cols.get(0, "")}
            for ci, val in cols.items():
                if ci == 0:
                    continue
                entry[header.get(ci, f"Column_{ci}")] = val
            formatted.append(entry)
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(formatted, f, ensure_ascii=False, indent=2)
    # Bug fix: original f-string had no placeholder and printed the literal
    # text instead of the actual output path.
    print(f"JSON saved to {filename}")
# build result dict
# Serializable summary of the analysis: per-page metadata with its lines,
# plus the flattened line/table collections gathered above.
result_dict = {
    "pages": [
        {
            "page_number": page.page_number,
            # getattr defaults guard against attributes missing on the page object
            "angle": getattr(page, "angle", 0.0),
            "width": getattr(page, "width", 0.0),
            "height": getattr(page, "height", 0.0),
            "unit": getattr(page, "unit", "pixel"),
            "lines": [{"content": l.content,
                       "bounding_box": [str(pt) for pt in getattr(l, "polygon", [])]}
                      for l in page.lines]
        }
        for page in result.pages
    ],
    "all_lines": all_lines,
    "tables": all_tables,
    "total_tables": len(all_tables),
    "total_lines": len(all_lines)
}
# save outputs
save_to_excel(result_dict)
save_table_to_json(result_dict)