From 5cd99b4b41f8aa3f5d847a9eb9a201f5708cace1 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 7 Mar 2026 02:26:16 +0000
Subject: [PATCH 1/2] Initial plan
From 587c17920550192e1410bc43e199be92794fed54 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 7 Mar 2026 02:34:03 +0000
Subject: [PATCH 2/2] Fix all E501 line-too-long linting errors (max 120 chars)
- 04_html: Break long HTML string with backslash continuation
- 05_spreadsheets: Use parenthesized string concatenation for descriptions
- 08_markdown_txt: Break long lines in triple-quoted strings with backslash continuation
- generate_presentation: Use parenthesized string concatenation for slide content
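- Run ruff format for consistent style (quote normalization, operator spacing, wrapped literals)
- Apply ruff lint fixes beyond E501: remove unused imports (F401), sort imports (I001),
  drop f-string prefixes without placeholders (F541), rename ambiguous `l` variable (E741)
- pyproject.toml: add [tool.ruff] configuration (line-length 120; lint rules E, W, F, I)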
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
.../01_docling/01_basic_conversion.py | 6 +-
.../01_docling/02_pdf_advanced.py | 23 +-
advanced_methods/01_docling/03_chunking.py | 24 +-
.../01_docling/04_integrations.py | 6 +-
.../02_unstructured_io/01_auto_partition.py | 17 +-
.../02_unstructured_io/02_pdf_strategies.py | 19 +-
.../03_specific_partitioners.py | 10 +-
.../04_chunking_and_export.py | 29 +-
.../01_layout_extraction.py | 9 +-
.../02_prebuilt_models.py | 13 +-
.../03_table_and_figure_extraction.py | 6 +-
.../04_rag_pipeline_example.py | 11 +-
.../04_llamaparse/01_basic_parsing.py | 4 +-
.../02_llamaindex_integration.py | 4 +-
.../04_llamaparse/03_parsing_tiers.py | 2 +-
.../05_marker/01_basic_conversion.py | 8 +-
.../05_marker/02_output_formats.py | 5 +-
.../05_marker/03_specialized_converters.py | 8 +-
.../06_megaparse/01_basic_parsing.py | 3 +-
.../06_megaparse/02_vision_parsing.py | 10 +-
.../06_megaparse/03_rag_preparation.py | 27 +-
pyproject.toml | 9 +
.../01_pdf/01_pypdf_extraction.py | 2 +-
.../01_pdf/02_pdfplumber_extraction.py | 71 +-
.../01_pdf/03_pymupdf_extraction.py | 49 +-
.../01_pdf/04_table_extraction.py | 51 +-
.../01_pdf/05_ocr_extraction.py | 81 +-
.../01_pdf/06_comparison.py | 64 +-
.../01_pdf/sample_docs/generate_samples.py | 1341 +++++++----
.../02_docx/01_python_docx_extraction.py | 55 +-
.../02_docx/02_mammoth_extraction.py | 16 +-
.../02_docx/03_docx2txt_extraction.py | 13 +-
.../02_docx/sample_docs/generate_samples.py | 139 +-
.../03_pptx/01_python_pptx_extraction.py | 40 +-
.../03_pptx/02_slide_structured_extraction.py | 79 +-
.../03_pptx/sample_docs/generate_samples.py | 120 +-
.../04_html/01_beautifulsoup_extraction.py | 12 +-
.../04_html/02_html2text_extraction.py | 8 +-
.../04_html/03_trafilatura_extraction.py | 2 +-
.../04_html/sample_docs/generate_samples.py | 3 +-
.../05_spreadsheets/01_openpyxl_extraction.py | 26 +-
.../05_spreadsheets/02_pandas_extraction.py | 11 +-
.../05_spreadsheets/03_csv_extraction.py | 5 +-
.../sample_docs/generate_samples.py | 179 +-
.../06_images_ocr/01_tesseract_ocr.py | 29 +-
.../06_images_ocr/02_easyocr_extraction.py | 22 +-
.../07_email/01_email_parsing.py | 4 +-
.../02_structured_email_extraction.py | 78 +-
.../07_email/sample_docs/generate_samples.py | 6 +-
.../01_text_chunking_strategies.py | 32 +-
.../08_markdown_txt/02_markdown_parsing.py | 42 +-
.../08_markdown_txt/03_semantic_chunking.py | 62 +-
.../sample_docs/generate_samples.py | 244 +-
.../09_epub/01_ebooklib_extraction.py | 14 +-
.../09_epub/02_epub_to_text.py | 95 +-
.../09_epub/sample_docs/generate_samples.py | 19 +-
.../generate_presentation.py | 2144 +++++++++++------
unstructured_documents/shared/chunking.py | 14 +-
58 files changed, 3389 insertions(+), 2036 deletions(-)
diff --git a/advanced_methods/01_docling/01_basic_conversion.py b/advanced_methods/01_docling/01_basic_conversion.py
index f4ae90f..de88a7e 100644
--- a/advanced_methods/01_docling/01_basic_conversion.py
+++ b/advanced_methods/01_docling/01_basic_conversion.py
@@ -10,12 +10,13 @@
uv pip install docling
"""
-import sys
+
from pathlib import Path
# Reference sample docs from the documents folder
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
+
def convert_single_document():
"""Convert a single PDF to markdown using default settings."""
from docling.document_converter import DocumentConverter
@@ -29,7 +30,7 @@ def convert_single_document():
print("DOCLING: Basic PDF to Markdown")
print("=" * 60)
print(f"Status: {result.status}")
- print(f"\n--- Markdown Output ---\n")
+ print("\n--- Markdown Output ---\n")
print(result.document.export_to_markdown())
@@ -84,7 +85,6 @@ def export_formats():
print(text[:300])
# JSON (lossless serialization)
- import json
json_str = doc.model_dump_json(indent=2)
print(f"\n--- JSON ({len(json_str)} chars) ---")
print(json_str[:400] + "...")
diff --git a/advanced_methods/01_docling/02_pdf_advanced.py b/advanced_methods/01_docling/02_pdf_advanced.py
index 55b48fd..9bf33dd 100644
--- a/advanced_methods/01_docling/02_pdf_advanced.py
+++ b/advanced_methods/01_docling/02_pdf_advanced.py
@@ -15,7 +15,7 @@
uv pip install "docling[tesserocr]" # for Tesseract OCR
uv pip install "docling[easyocr]" # for EasyOCR
"""
-import sys
+
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -23,14 +23,16 @@
def table_extraction_modes():
"""Compare FAST vs ACCURATE table detection on a table-heavy PDF."""
- from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
- from docling.document_converter import PdfFormatOption
+ from docling.document_converter import DocumentConverter, PdfFormatOption
pdf_path = SAMPLES_DIR / "01_pdf" / "sample_docs" / "tables.pdf"
- for mode_name, mode in [("FAST", TableFormerMode.FAST), ("ACCURATE", TableFormerMode.ACCURATE)]:
+ for mode_name, mode in [
+ ("FAST", TableFormerMode.FAST),
+ ("ACCURATE", TableFormerMode.ACCURATE),
+ ]:
print(f"\n{'=' * 60}")
print(f"TABLE DETECTION MODE: {mode_name}")
print(f"{'=' * 60}")
@@ -41,9 +43,7 @@ def table_extraction_modes():
)
converter = DocumentConverter(
- format_options={
- InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
- }
+ format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert(str(pdf_path))
@@ -54,10 +54,9 @@ def table_extraction_modes():
def ocr_configuration():
"""Configure OCR settings for scanned documents."""
- from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
- from docling.document_converter import PdfFormatOption
+ from docling.document_converter import DocumentConverter, PdfFormatOption
pdf_path = SAMPLES_DIR / "01_pdf" / "sample_docs" / "simple_text.pdf"
@@ -70,11 +69,7 @@ def ocr_configuration():
# ocr_options=TesseractCliOcrOptions(lang=["eng"])
)
- converter = DocumentConverter(
- format_options={
- InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
- }
- )
+ converter = DocumentConverter(format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)})
print("=" * 60)
print("PDF WITH OCR ENABLED")
diff --git a/advanced_methods/01_docling/03_chunking.py b/advanced_methods/01_docling/03_chunking.py
index df6fa4e..210aa53 100644
--- a/advanced_methods/01_docling/03_chunking.py
+++ b/advanced_methods/01_docling/03_chunking.py
@@ -14,7 +14,7 @@
uv pip install docling docling-core
"""
-import sys
+
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -37,9 +37,9 @@ def hierarchical_chunking():
print("=" * 60)
for i, chunk in enumerate(chunks[:5]):
- print(f"\n--- Chunk {i+1} ---")
+ print(f"\n--- Chunk {i + 1} ---")
print(f"Text: {chunk.text[:200]}...")
- if hasattr(chunk, 'meta') and chunk.meta:
+ if hasattr(chunk, "meta") and chunk.meta:
print(f"Metadata: {chunk.meta}")
print()
@@ -66,9 +66,9 @@ def hybrid_chunking():
print("=" * 60)
for i, chunk in enumerate(chunks[:5]):
- print(f"\n--- Chunk {i+1} ---")
+ print(f"\n--- Chunk {i + 1} ---")
print(f"Text: {chunk.text[:200]}...")
- if hasattr(chunk, 'meta') and chunk.meta:
+ if hasattr(chunk, "meta") and chunk.meta:
print(f"Metadata: {chunk.meta}")
@@ -83,21 +83,23 @@ def compare_chunking_strategies():
hier_chunks = list(HierarchicalChunker().chunk(result.document))
- hybrid_chunks = list(HybridChunker(
- tokenizer="sentence-transformers/all-MiniLM-L6-v2",
- max_tokens=256,
- ).chunk(result.document))
+ hybrid_chunks = list(
+ HybridChunker(
+ tokenizer="sentence-transformers/all-MiniLM-L6-v2",
+ max_tokens=256,
+ ).chunk(result.document)
+ )
print("=" * 60)
print("CHUNKING STRATEGY COMPARISON")
print("=" * 60)
print(f"\nHierarchical: {len(hier_chunks)} chunks")
for i, c in enumerate(hier_chunks[:3]):
- print(f" [{i+1}] {len(c.text)} chars: {c.text[:80]}...")
+ print(f" [{i + 1}] {len(c.text)} chars: {c.text[:80]}...")
print(f"\nHybrid (256 tokens): {len(hybrid_chunks)} chunks")
for i, c in enumerate(hybrid_chunks[:3]):
- print(f" [{i+1}] {len(c.text)} chars: {c.text[:80]}...")
+ print(f" [{i + 1}] {len(c.text)} chars: {c.text[:80]}...")
if __name__ == "__main__":
diff --git a/advanced_methods/01_docling/04_integrations.py b/advanced_methods/01_docling/04_integrations.py
index 6e5c748..1ff6a3f 100644
--- a/advanced_methods/01_docling/04_integrations.py
+++ b/advanced_methods/01_docling/04_integrations.py
@@ -12,7 +12,7 @@
uv pip install llama-index-readers-docling # for LlamaIndex
uv pip install langchain-docling # for LangChain
"""
-import sys
+
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -48,7 +48,7 @@ def llamaindex_integration():
print("DOCLING + LLAMAINDEX")
print("=" * 60)
for i, doc in enumerate(documents):
- print(f"\nDocument {i+1}:")
+ print(f"\nDocument {i + 1}:")
print(f" Text length: {len(doc.text)}")
print(f" Preview: {doc.text[:200]}...")
@@ -85,7 +85,7 @@ def langchain_integration():
print("DOCLING + LANGCHAIN")
print("=" * 60)
for i, doc in enumerate(documents):
- print(f"\nDocument {i+1}:")
+ print(f"\nDocument {i + 1}:")
print(f" Content: {doc.page_content[:200]}...")
print(f" Metadata: {doc.metadata}")
diff --git a/advanced_methods/02_unstructured_io/01_auto_partition.py b/advanced_methods/02_unstructured_io/01_auto_partition.py
index 5573eda..93daf04 100644
--- a/advanced_methods/02_unstructured_io/01_auto_partition.py
+++ b/advanced_methods/02_unstructured_io/01_auto_partition.py
@@ -9,7 +9,7 @@
uv pip install "unstructured[all-docs]"
"""
-import sys
+
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -29,7 +29,7 @@ def auto_partition_pdf():
for el in elements:
print(f"\n[{type(el).__name__}]")
print(f" Text: {str(el)[:150]}")
- if hasattr(el, 'metadata'):
+ if hasattr(el, "metadata"):
if el.metadata.page_number:
print(f" Page: {el.metadata.page_number}")
@@ -44,7 +44,10 @@ def auto_partition_multiple():
("HTML", SAMPLES_DIR / "04_html" / "sample_docs" / "article_page.html"),
("PPTX", SAMPLES_DIR / "03_pptx" / "sample_docs" / "presentation.pptx"),
("Email", SAMPLES_DIR / "07_email" / "sample_docs" / "plain_text.eml"),
- ("Markdown", SAMPLES_DIR / "08_markdown_txt" / "sample_docs" / "technical_doc.md"),
+ (
+ "Markdown",
+ SAMPLES_DIR / "08_markdown_txt" / "sample_docs" / "technical_doc.md",
+ ),
("EPUB", SAMPLES_DIR / "09_epub" / "sample_docs" / "sample_book.epub"),
]
@@ -59,6 +62,7 @@ def auto_partition_multiple():
# Show element type distribution
from collections import Counter
+
type_counts = Counter(type(el).__name__ for el in elements)
for etype, count in type_counts.most_common():
print(f" {etype}: {count}")
@@ -97,6 +101,7 @@ def element_types_overview():
""")
from collections import Counter
+
type_counts = Counter(type(el).__name__ for el in elements)
print(f"Found in this document ({len(elements)} elements):")
for etype, count in type_counts.most_common():
@@ -110,4 +115,8 @@ def element_types_overview():
print("3. Element types overview")
choice = input("Enter 1/2/3 (default=1): ").strip() or "1"
- {"1": auto_partition_pdf, "2": auto_partition_multiple, "3": element_types_overview}[choice]()
+ {
+ "1": auto_partition_pdf,
+ "2": auto_partition_multiple,
+ "3": element_types_overview,
+ }[choice]()
diff --git a/advanced_methods/02_unstructured_io/02_pdf_strategies.py b/advanced_methods/02_unstructured_io/02_pdf_strategies.py
index a7ce819..2842a81 100644
--- a/advanced_methods/02_unstructured_io/02_pdf_strategies.py
+++ b/advanced_methods/02_unstructured_io/02_pdf_strategies.py
@@ -12,7 +12,7 @@
uv pip install "unstructured[pdf]"
"""
-import sys
+
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -20,9 +20,10 @@
def compare_strategies():
"""Compare fast, hi_res, and ocr_only on the same PDF."""
- from unstructured.partition.pdf import partition_pdf
- from collections import Counter
import time
+ from collections import Counter
+
+ from unstructured.partition.pdf import partition_pdf
pdf_path = str(SAMPLES_DIR / "01_pdf" / "sample_docs" / "tables.pdf")
@@ -51,7 +52,7 @@ def compare_strategies():
print(f" [{type(el).__name__}] {str(el)[:120]}")
except Exception as e:
print(f" Error: {e}")
- print(f" (hi_res requires: uv pip install \"unstructured[pdf]\" and model downloads)")
+ print(' (hi_res requires: uv pip install "unstructured[pdf]" and model downloads)')
def hi_res_with_options():
@@ -68,16 +69,16 @@ def hi_res_with_options():
elements = partition_pdf(
filename=pdf_path,
strategy="hi_res",
- infer_table_structure=True, # Extract table HTML
- include_page_breaks=True, # Insert PageBreak elements
- languages=["eng"], # OCR language hints
+ infer_table_structure=True, # Extract table HTML
+ include_page_breaks=True, # Insert PageBreak elements
+ languages=["eng"], # OCR language hints
)
for el in elements:
if type(el).__name__ == "Table":
- print(f"\n--- Table Found ---")
+ print("\n--- Table Found ---")
print(f"Text: {str(el)[:200]}")
- if hasattr(el.metadata, 'text_as_html') and el.metadata.text_as_html:
+ if hasattr(el.metadata, "text_as_html") and el.metadata.text_as_html:
print(f"HTML: {el.metadata.text_as_html[:300]}")
break
else:
diff --git a/advanced_methods/02_unstructured_io/03_specific_partitioners.py b/advanced_methods/02_unstructured_io/03_specific_partitioners.py
index 97366be..7d82b4e 100644
--- a/advanced_methods/02_unstructured_io/03_specific_partitioners.py
+++ b/advanced_methods/02_unstructured_io/03_specific_partitioners.py
@@ -17,9 +17,9 @@
uv pip install "unstructured[all-docs]"
"""
-import sys
-from pathlib import Path
+
from collections import Counter
+from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -82,11 +82,11 @@ def partition_email_demo():
# Show email-specific metadata
for el in elements[:1]:
meta = el.metadata
- if hasattr(meta, 'sent_from') and meta.sent_from:
+ if hasattr(meta, "sent_from") and meta.sent_from:
print(f" From: {meta.sent_from}")
- if hasattr(meta, 'sent_to') and meta.sent_to:
+ if hasattr(meta, "sent_to") and meta.sent_to:
print(f" To: {meta.sent_to}")
- if hasattr(meta, 'subject') and meta.subject:
+ if hasattr(meta, "subject") and meta.subject:
print(f" Subject: {meta.subject}")
diff --git a/advanced_methods/02_unstructured_io/04_chunking_and_export.py b/advanced_methods/02_unstructured_io/04_chunking_and_export.py
index 0250785..17db313 100644
--- a/advanced_methods/02_unstructured_io/04_chunking_and_export.py
+++ b/advanced_methods/02_unstructured_io/04_chunking_and_export.py
@@ -16,7 +16,7 @@
uv pip install "unstructured[all-docs]"
"""
-import sys
+
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -24,8 +24,8 @@
def chunk_by_title_demo():
"""Chunk elements by title for semantic RAG chunks."""
- from unstructured.partition.auto import partition
from unstructured.chunking.title import chunk_by_title
+ from unstructured.partition.auto import partition
pdf_path = str(SAMPLES_DIR / "01_pdf" / "sample_docs" / "mixed_content.pdf")
elements = partition(filename=pdf_path)
@@ -33,8 +33,8 @@ def chunk_by_title_demo():
# Chunk by title with size constraints
chunks = chunk_by_title(
elements,
- max_characters=1000, # Max chunk size
- new_after_n_chars=500, # Soft limit to start new chunk
+ max_characters=1000, # Max chunk size
+ new_after_n_chars=500, # Soft limit to start new chunk
combine_text_under_n_chars=200, # Merge small chunks
)
@@ -43,7 +43,7 @@ def chunk_by_title_demo():
print("=" * 60)
for i, chunk in enumerate(chunks[:5]):
- print(f"\n--- Chunk {i+1} [{type(chunk).__name__}] ---")
+ print(f"\n--- Chunk {i + 1} [{type(chunk).__name__}] ---")
text = str(chunk)
print(f" Length: {len(text)} chars")
print(f" Text: {text[:200]}...")
@@ -66,8 +66,10 @@ def export_formats_demo():
print(text[:300])
# 2. JSON
- from unstructured.staging.base import elements_to_json
import json
+
+ from unstructured.staging.base import elements_to_json
+
json_str = elements_to_json(elements)
print(f"\n--- JSON ({len(json_str)} chars) ---")
parsed = json.loads(json_str)
@@ -75,6 +77,7 @@ def export_formats_demo():
# 3. Dict list
from unstructured.staging.base import elements_to_dicts
+
dicts = elements_to_dicts(elements)
print(f"\n--- Dicts ({len(dicts)} items) ---")
if dicts:
@@ -84,9 +87,10 @@ def export_formats_demo():
# 4. DataFrame
try:
from unstructured.staging.base import convert_to_dataframe
+
df = convert_to_dataframe(elements)
print(f"\n--- DataFrame ({len(df)} rows) ---")
- print(df[['type', 'text']].head().to_string())
+ print(df[["type", "text"]].head().to_string())
except Exception as e:
print(f"\n--- DataFrame: {e} ---")
@@ -105,8 +109,15 @@ def metadata_exploration():
for el in elements[:5]:
print(f"\n[{type(el).__name__}] {str(el)[:80]}")
meta = el.metadata
- attrs = ['filename', 'file_directory', 'page_number', 'coordinates',
- 'text_as_html', 'languages', 'detection_class_prob']
+ attrs = [
+ "filename",
+ "file_directory",
+ "page_number",
+ "coordinates",
+ "text_as_html",
+ "languages",
+ "detection_class_prob",
+ ]
for attr in attrs:
val = getattr(meta, attr, None)
if val is not None:
diff --git a/advanced_methods/03_azure_doc_intelligence/01_layout_extraction.py b/advanced_methods/03_azure_doc_intelligence/01_layout_extraction.py
index 34bbd4e..1652061 100644
--- a/advanced_methods/03_azure_doc_intelligence/01_layout_extraction.py
+++ b/advanced_methods/03_azure_doc_intelligence/01_layout_extraction.py
@@ -18,8 +18,8 @@
Free tier: 500 pages/month
"""
+
import os
-import sys
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -28,9 +28,8 @@
def layout_extraction():
"""Extract layout from a PDF using prebuilt-layout model."""
try:
- from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
- from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
+ from azure.core.credentials import AzureKeyCredential
except ImportError:
print("Install: uv pip install azure-ai-documentintelligence")
_show_example_code()
@@ -81,7 +80,7 @@ def layout_extraction():
if result.tables:
print(f"\n--- Tables: {len(result.tables)} found ---")
for i, table in enumerate(result.tables):
- print(f"\n Table {i+1}: {table.row_count} rows x {table.column_count} cols")
+ print(f"\n Table {i + 1}: {table.row_count} rows x {table.column_count} cols")
for cell in table.cells[:6]:
print(f" [{cell.row_index},{cell.column_index}] = {cell.content}")
@@ -126,8 +125,8 @@ def _show_example_code():
def markdown_output():
"""Get document content as Markdown (Azure's built-in conversion)."""
try:
- from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
+ from azure.core.credentials import AzureKeyCredential
except ImportError:
print("Install: uv pip install azure-ai-documentintelligence")
return
diff --git a/advanced_methods/03_azure_doc_intelligence/02_prebuilt_models.py b/advanced_methods/03_azure_doc_intelligence/02_prebuilt_models.py
index cd2d6bd..77fd483 100644
--- a/advanced_methods/03_azure_doc_intelligence/02_prebuilt_models.py
+++ b/advanced_methods/03_azure_doc_intelligence/02_prebuilt_models.py
@@ -16,8 +16,8 @@
uv pip install azure-ai-documentintelligence
"""
+
import os
-import sys
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -26,8 +26,8 @@
def prebuilt_read():
"""Use prebuilt-read for pure text extraction (OCR optimized)."""
try:
- from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
+ from azure.core.credentials import AzureKeyCredential
except ImportError:
print("Install: uv pip install azure-ai-documentintelligence")
_show_setup_message("PREBUILT-READ MODEL")
@@ -56,7 +56,7 @@ def prebuilt_read():
print(result.content[:500])
if result.languages:
- print(f"\nDetected languages: {[l.locale for l in result.languages]}")
+ print(f"\nDetected languages: {[lang.locale for lang in result.languages]}")
def prebuilt_document():
@@ -181,4 +181,9 @@ def _show_read_example():
print("4. Model comparison overview")
choice = input("Enter 1/2/3/4 (default=4): ").strip() or "4"
- {"1": prebuilt_read, "2": prebuilt_document, "3": prebuilt_invoice, "4": model_comparison}[choice]()
+ {
+ "1": prebuilt_read,
+ "2": prebuilt_document,
+ "3": prebuilt_invoice,
+ "4": model_comparison,
+ }[choice]()
diff --git a/advanced_methods/03_azure_doc_intelligence/03_table_and_figure_extraction.py b/advanced_methods/03_azure_doc_intelligence/03_table_and_figure_extraction.py
index d678e28..96d38a6 100644
--- a/advanced_methods/03_azure_doc_intelligence/03_table_and_figure_extraction.py
+++ b/advanced_methods/03_azure_doc_intelligence/03_table_and_figure_extraction.py
@@ -7,8 +7,8 @@
uv pip install azure-ai-documentintelligence
"""
+
import os
-import sys
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -17,8 +17,8 @@
def table_extraction():
"""Extract tables with full structure from PDF."""
try:
- from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
+ from azure.core.credentials import AzureKeyCredential
except ImportError:
print("Install: uv pip install azure-ai-documentintelligence")
_show_table_example()
@@ -50,7 +50,7 @@ def table_extraction():
return
for i, table in enumerate(result.tables):
- print(f"\n--- Table {i+1} ---")
+ print(f"\n--- Table {i + 1} ---")
print(f" Rows: {table.row_count}, Columns: {table.column_count}")
print(f" Page: {table.bounding_regions[0].page_number if table.bounding_regions else 'N/A'}")
diff --git a/advanced_methods/03_azure_doc_intelligence/04_rag_pipeline_example.py b/advanced_methods/03_azure_doc_intelligence/04_rag_pipeline_example.py
index df7d511..7537dbc 100644
--- a/advanced_methods/03_azure_doc_intelligence/04_rag_pipeline_example.py
+++ b/advanced_methods/03_azure_doc_intelligence/04_rag_pipeline_example.py
@@ -14,8 +14,8 @@
uv pip install azure-ai-documentintelligence
"""
+
import os
-import sys
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -49,8 +49,8 @@ def rag_pipeline():
_show_pipeline_code()
return
- from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
+ from azure.core.credentials import AzureKeyCredential
client = DocumentIntelligenceClient(endpoint=endpoint, credential=AzureKeyCredential(key))
pdf_path = SAMPLES_DIR / "01_pdf" / "sample_docs" / "mixed_content.pdf"
@@ -76,7 +76,10 @@ def rag_pipeline():
if line.startswith("# ") or line.startswith("## "):
if current_chunk["text"].strip():
chunks.append(current_chunk.copy())
- current_chunk = {"text": line + "\n", "metadata": {"section": line.strip("# ")}}
+ current_chunk = {
+ "text": line + "\n",
+ "metadata": {"section": line.strip("# ")},
+ }
else:
current_chunk["text"] += line + "\n"
@@ -86,7 +89,7 @@ def rag_pipeline():
print(f"Step 3: Created {len(chunks)} semantic chunks")
for i, chunk in enumerate(chunks[:5]):
- print(f"\n Chunk {i+1} ({len(chunk['text'])} chars):")
+ print(f"\n Chunk {i + 1} ({len(chunk['text'])} chars):")
print(f" Section: {chunk['metadata']['section']}")
print(f" Preview: {chunk['text'][:150].strip()}...")
diff --git a/advanced_methods/04_llamaparse/01_basic_parsing.py b/advanced_methods/04_llamaparse/01_basic_parsing.py
index 0077642..1934e1d 100644
--- a/advanced_methods/04_llamaparse/01_basic_parsing.py
+++ b/advanced_methods/04_llamaparse/01_basic_parsing.py
@@ -19,8 +19,8 @@
# OR newer:
uv pip install llama-cloud>=1.0
"""
+
import os
-import sys
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -62,7 +62,7 @@ def basic_parse_pdf():
documents = parser.load_data(pdf_path)
for i, doc in enumerate(documents):
- print(f"\n--- Document {i+1} ---")
+ print(f"\n--- Document {i + 1} ---")
print(f"Text length: {len(doc.text)}")
print(f"Preview:\n{doc.text[:500]}")
diff --git a/advanced_methods/04_llamaparse/02_llamaindex_integration.py b/advanced_methods/04_llamaparse/02_llamaindex_integration.py
index a1adb84..c58f4f9 100644
--- a/advanced_methods/04_llamaparse/02_llamaindex_integration.py
+++ b/advanced_methods/04_llamaparse/02_llamaindex_integration.py
@@ -6,8 +6,8 @@
uv pip install llama-parse llama-index
"""
+
import os
-import sys
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -28,8 +28,8 @@ def llamaindex_rag_pipeline():
return
try:
+ from llama_index.core import VectorStoreIndex
from llama_parse import LlamaParse
- from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
except ImportError:
print("Install: uv pip install llama-parse llama-index")
_show_pipeline_code()
diff --git a/advanced_methods/04_llamaparse/03_parsing_tiers.py b/advanced_methods/04_llamaparse/03_parsing_tiers.py
index 6e373f7..7a3395f 100644
--- a/advanced_methods/04_llamaparse/03_parsing_tiers.py
+++ b/advanced_methods/04_llamaparse/03_parsing_tiers.py
@@ -12,8 +12,8 @@
uv pip install llama-parse
"""
+
import os
-import sys
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
diff --git a/advanced_methods/05_marker/01_basic_conversion.py b/advanced_methods/05_marker/01_basic_conversion.py
index 3988db1..45cd395 100644
--- a/advanced_methods/05_marker/01_basic_conversion.py
+++ b/advanced_methods/05_marker/01_basic_conversion.py
@@ -15,7 +15,7 @@
uv pip install marker-pdf
uv pip install marker-pdf[full] # for DOCX, PPTX, XLSX support
"""
-import sys
+
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -50,7 +50,7 @@ def basic_pdf_to_markdown():
print(f"\n--- Markdown Output ({len(text)} chars) ---")
print(text[:800])
- print(f"\n--- Metadata ---")
+ print("\n--- Metadata ---")
for key, val in metadata.items():
print(f" {key}: {val}")
@@ -61,9 +61,9 @@ def basic_pdf_to_markdown():
def convert_with_config():
"""Convert with custom configuration: output format, page range, etc."""
try:
+ from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
- from marker.config.parser import ConfigParser
from marker.output import text_from_rendered
except ImportError:
print("Install: uv pip install marker-pdf")
@@ -75,7 +75,7 @@ def convert_with_config():
# Custom configuration
config = {
"output_format": "markdown",
- "page_range": "0-2", # First 3 pages only
+ "page_range": "0-2", # First 3 pages only
"force_ocr": False,
"disable_image_extraction": True,
}
diff --git a/advanced_methods/05_marker/02_output_formats.py b/advanced_methods/05_marker/02_output_formats.py
index 2175836..3e37eda 100644
--- a/advanced_methods/05_marker/02_output_formats.py
+++ b/advanced_methods/05_marker/02_output_formats.py
@@ -10,8 +10,7 @@
uv pip install marker-pdf
"""
-import sys
-import json
+
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -46,9 +45,9 @@ def output_formats_overview():
# Try live demo if marker is installed
try:
+ from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
- from marker.config.parser import ConfigParser
from marker.output import text_from_rendered
pdf_path = str(SAMPLES_DIR / "01_pdf" / "sample_docs" / "simple_text.pdf")
diff --git a/advanced_methods/05_marker/03_specialized_converters.py b/advanced_methods/05_marker/03_specialized_converters.py
index 0233658..82fb8da 100644
--- a/advanced_methods/05_marker/03_specialized_converters.py
+++ b/advanced_methods/05_marker/03_specialized_converters.py
@@ -9,7 +9,7 @@
uv pip install marker-pdf
"""
-import sys
+
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -132,4 +132,8 @@ def _show_table_example():
print("3. Structured extraction")
choice = input("Enter 1/2/3 (default=1): ").strip() or "1"
- {"1": table_converter_demo, "2": ocr_converter_demo, "3": extraction_converter_demo}[choice]()
+ {
+ "1": table_converter_demo,
+ "2": ocr_converter_demo,
+ "3": extraction_converter_demo,
+ }[choice]()
diff --git a/advanced_methods/06_megaparse/01_basic_parsing.py b/advanced_methods/06_megaparse/01_basic_parsing.py
index 18489c7..b705ba3 100644
--- a/advanced_methods/06_megaparse/01_basic_parsing.py
+++ b/advanced_methods/06_megaparse/01_basic_parsing.py
@@ -16,8 +16,7 @@
Requirements: Python >= 3.11, poppler, tesseract-ocr
uv pip install megaparse
"""
-import os
-import sys
+
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
diff --git a/advanced_methods/06_megaparse/02_vision_parsing.py b/advanced_methods/06_megaparse/02_vision_parsing.py
index b263da1..0c9e0e6 100644
--- a/advanced_methods/06_megaparse/02_vision_parsing.py
+++ b/advanced_methods/06_megaparse/02_vision_parsing.py
@@ -17,8 +17,8 @@
# OR
uv pip install megaparse langchain-anthropic
"""
+
import os
-import sys
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -37,8 +37,8 @@ def vision_parse_openai():
return
try:
- from megaparse.parser.megaparse_vision import MegaParseVision
from langchain_openai import ChatOpenAI
+ from megaparse.parser.megaparse_vision import MegaParseVision
except ImportError:
print("Install: uv pip install megaparse langchain-openai")
return
@@ -142,4 +142,8 @@ def _show_vision_example():
print("3. Standard vs Vision comparison")
choice = input("Enter 1/2/3 (default=3): ").strip() or "3"
- {"1": vision_parse_openai, "2": vision_parse_anthropic, "3": compare_standard_vs_vision}[choice]()
+ {
+ "1": vision_parse_openai,
+ "2": vision_parse_anthropic,
+ "3": compare_standard_vs_vision,
+ }[choice]()
diff --git a/advanced_methods/06_megaparse/03_rag_preparation.py b/advanced_methods/06_megaparse/03_rag_preparation.py
index 8f48e57..a1ea80d 100644
--- a/advanced_methods/06_megaparse/03_rag_preparation.py
+++ b/advanced_methods/06_megaparse/03_rag_preparation.py
@@ -13,9 +13,9 @@
uv pip install megaparse
"""
-import os
-import sys
+
import re
+import sys
from pathlib import Path
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
@@ -42,18 +42,20 @@ def rag_preparation_pipeline():
# Step 3: Add metadata
enriched_chunks = []
for i, chunk in enumerate(chunks):
- enriched_chunks.append({
- "id": f"chunk_{i}",
- "text": chunk["text"],
- "metadata": {
- "heading": chunk.get("heading", ""),
- "char_count": len(chunk["text"]),
- "chunk_index": i,
- "source": "megaparse",
+ enriched_chunks.append(
+ {
+ "id": f"chunk_{i}",
+ "text": chunk["text"],
+ "metadata": {
+ "heading": chunk.get("heading", ""),
+ "char_count": len(chunk["text"]),
+ "chunk_index": i,
+ "source": "megaparse",
+ },
}
- })
+ )
- print(f"Step 3: Enriched with metadata")
+ print("Step 3: Enriched with metadata")
# Display results
print(f"\n{'=' * 60}")
@@ -117,6 +119,7 @@ def _get_parsed_content():
"""Get parsed content from MegaParse or use sample Markdown."""
try:
from megaparse import MegaParse
+
pdf_path = str(SAMPLES_DIR / "01_pdf" / "sample_docs" / "mixed_content.pdf")
megaparse = MegaParse()
return megaparse.load(pdf_path)
diff --git a/pyproject.toml b/pyproject.toml
index 6858d9f..f5f169f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,3 +58,12 @@ llamaparse = [
marker = [
"marker-pdf>=1.0",
]
+
+[tool.ruff]
+line-length = 120
+
+[tool.ruff.lint]
+select = ["E", "W", "F", "I"]
+
+[tool.ruff.format]
+line-ending = "auto"
diff --git a/unstructured_documents/01_pdf/01_pypdf_extraction.py b/unstructured_documents/01_pdf/01_pypdf_extraction.py
index 3580fb5..a1a0d68 100644
--- a/unstructured_documents/01_pdf/01_pypdf_extraction.py
+++ b/unstructured_documents/01_pdf/01_pypdf_extraction.py
@@ -125,7 +125,7 @@ def demo_chunking():
# Summary comparison
print("\n--- Chunking Strategy Comparison ---")
print(f" {'Strategy':<25s} {'Chunks':>8s} {'Avg Size':>10s} {'Min':>6s} {'Max':>6s}")
- print(f" {'-'*25} {'-'*8} {'-'*10} {'-'*6} {'-'*6}")
+ print(f" {'-' * 25} {'-' * 8} {'-' * 10} {'-' * 6} {'-' * 6}")
for name, chunks in [
("Character (500)", char_chunks),
("Sentence (5/chunk)", sent_chunks),
diff --git a/unstructured_documents/01_pdf/02_pdfplumber_extraction.py b/unstructured_documents/01_pdf/02_pdfplumber_extraction.py
index 85f1821..b969aea 100644
--- a/unstructured_documents/01_pdf/02_pdfplumber_extraction.py
+++ b/unstructured_documents/01_pdf/02_pdfplumber_extraction.py
@@ -29,6 +29,7 @@
# 1. Text extraction with better formatting
# ---------------------------------------------------------------------------
+
def extract_text_with_layout(pdf_path: Path) -> list[dict]:
"""
Extract text page by page, preserving layout spacing.
@@ -42,18 +43,23 @@ def extract_text_with_layout(pdf_path: Path) -> list[dict]:
for i, page in enumerate(pdf.pages):
text = page.extract_text() or ""
# extract_text with layout settings for better column handling
- text_layout = page.extract_text(
- layout=True, # Use layout-aware extraction
- x_density=7.25, # Horizontal character density
- y_density=13, # Vertical line density
- ) or ""
- results.append({
- "page": i + 1,
- "text_default": text,
- "text_layout": text_layout,
- "width": float(page.width),
- "height": float(page.height),
- })
+ text_layout = (
+ page.extract_text(
+ layout=True, # Use layout-aware extraction
+ x_density=7.25, # Horizontal character density
+ y_density=13, # Vertical line density
+ )
+ or ""
+ )
+ results.append(
+ {
+ "page": i + 1,
+ "text_default": text,
+ "text_layout": text_layout,
+ "width": float(page.width),
+ "height": float(page.height),
+ }
+ )
return results
@@ -90,6 +96,7 @@ def demo_text_extraction():
# 2. Character-level and word-level extraction
# ---------------------------------------------------------------------------
+
def demo_character_level():
"""Show character-level and word-level data available via pdfplumber."""
print("\n" + "=" * 70)
@@ -104,28 +111,30 @@ def demo_character_level():
print(f"\n Total characters on page 1: {len(chars)}")
print("\n First 5 characters with metadata:")
print(f" {'Char':<6s} {'x0':>8s} {'y0':>8s} {'x1':>8s} {'y1':>8s} {'Font':<30s} {'Size':>6s}")
- print(f" {'-'*6} {'-'*8} {'-'*8} {'-'*8} {'-'*8} {'-'*30} {'-'*6}")
+ print(f" {'-' * 6} {'-' * 8} {'-' * 8} {'-' * 8} {'-' * 8} {'-' * 30} {'-' * 6}")
for c in chars[:5]:
- print(f" {repr(c['text']):<6s} {c['x0']:>8.2f} {c['top']:>8.2f} "
- f"{c['x1']:>8.2f} {c['bottom']:>8.2f} "
- f"{c.get('fontname', 'N/A'):<30s} {c.get('size', 0):>6.1f}")
+ print(
+ f" {repr(c['text']):<6s} {c['x0']:>8.2f} {c['top']:>8.2f} "
+ f"{c['x1']:>8.2f} {c['bottom']:>8.2f} "
+ f"{c.get('fontname', 'N/A'):<30s} {c.get('size', 0):>6.1f}"
+ )
# Word-level data
words = page.extract_words()
print(f"\n Total words on page 1: {len(words)}")
print("\n First 10 words with bounding boxes:")
print(f" {'Word':<20s} {'x0':>8s} {'top':>8s} {'x1':>8s} {'bottom':>8s}")
- print(f" {'-'*20} {'-'*8} {'-'*8} {'-'*8} {'-'*8}")
+ print(f" {'-' * 20} {'-' * 8} {'-' * 8} {'-' * 8} {'-' * 8}")
for w in words[:10]:
text = w["text"][:20]
- print(f" {text:<20s} {w['x0']:>8.2f} {w['top']:>8.2f} "
- f"{w['x1']:>8.2f} {w['bottom']:>8.2f}")
+ print(f" {text:<20s} {w['x0']:>8.2f} {w['top']:>8.2f} {w['x1']:>8.2f} {w['bottom']:>8.2f}")
# ---------------------------------------------------------------------------
# 3. Table detection and extraction
# ---------------------------------------------------------------------------
+
def extract_tables_from_pdf(pdf_path: Path) -> list[dict]:
"""
Detect and extract all tables from a PDF.
@@ -138,13 +147,15 @@ def extract_tables_from_pdf(pdf_path: Path) -> list[dict]:
for i, page in enumerate(pdf.pages):
page_tables = page.extract_tables()
for j, table in enumerate(page_tables):
- tables.append({
- "page": i + 1,
- "table_index": j,
- "data": table, # list of rows, each row is a list of cells
- "num_rows": len(table),
- "num_cols": len(table[0]) if table else 0,
- })
+ tables.append(
+ {
+ "page": i + 1,
+ "table_index": j,
+ "data": table, # list of rows, each row is a list of cells
+ "num_rows": len(table),
+ "num_cols": len(table[0]) if table else 0,
+ }
+ )
return tables
@@ -203,13 +214,13 @@ def demo_table_settings():
# Show table finder debug info
table_finder = page.debug_tablefinder()
- print(f"\n Table finder debug:")
+ print("\n Table finder debug:")
print(f" Tables detected: {len(table_finder.tables)}")
for idx, tbl in enumerate(table_finder.tables):
bbox = tbl.bbox
- print(f" Table {idx + 1} bbox: "
- f"x0={bbox[0]:.1f}, top={bbox[1]:.1f}, "
- f"x1={bbox[2]:.1f}, bottom={bbox[3]:.1f}")
+ print(
+ f" Table {idx + 1} bbox: x0={bbox[0]:.1f}, top={bbox[1]:.1f}, x1={bbox[2]:.1f}, bottom={bbox[3]:.1f}"
+ )
if __name__ == "__main__":
diff --git a/unstructured_documents/01_pdf/03_pymupdf_extraction.py b/unstructured_documents/01_pdf/03_pymupdf_extraction.py
index c6e72a8..bd9d276 100644
--- a/unstructured_documents/01_pdf/03_pymupdf_extraction.py
+++ b/unstructured_documents/01_pdf/03_pymupdf_extraction.py
@@ -29,6 +29,7 @@
# 1. Basic text extraction
# ---------------------------------------------------------------------------
+
def extract_text_basic(pdf_path: Path) -> list[str]:
"""Extract plain text page by page using get_text()."""
doc = fitz.open(str(pdf_path))
@@ -62,6 +63,7 @@ def demo_basic_extraction():
# 2. Layout-preserved extraction using "blocks" mode
# ---------------------------------------------------------------------------
+
def extract_text_with_layout(pdf_path: Path) -> list[str]:
"""
Extract text preserving the original page layout.
@@ -100,6 +102,7 @@ def demo_layout_extraction():
# 3. Block-level extraction with bounding boxes
# ---------------------------------------------------------------------------
+
def extract_blocks(pdf_path: Path, page_num: int = 0) -> list[dict]:
"""
Extract text as blocks with position information.
@@ -146,15 +149,17 @@ def demo_block_extraction():
blocks = extract_blocks(SIMPLE_TEXT_PDF, page_num=0)
print(f"\n Total blocks on page 1: {len(blocks)}")
print(f"\n {'Block':>5s} {'Type':<6s} {'x0':>7s} {'y0':>7s} {'x1':>7s} {'y1':>7s} Text Preview")
- print(f" {'-'*5} {'-'*6} {'-'*7} {'-'*7} {'-'*7} {'-'*7} {'-'*30}")
+ print(f" {'-' * 5} {'-' * 6} {'-' * 7} {'-' * 7} {'-' * 7} {'-' * 7} {'-' * 30}")
for b in blocks[:10]:
text_preview = b["text"][:40].replace("\n", " ").strip()
if len(b["text"]) > 40:
text_preview += "..."
- print(f" {b['block_no']:>5d} {b['block_type']:<6s} "
- f"{b['x0']:>7.1f} {b['y0']:>7.1f} "
- f"{b['x1']:>7.1f} {b['y1']:>7.1f} {text_preview}")
+ print(
+ f" {b['block_no']:>5d} {b['block_type']:<6s} "
+ f"{b['x0']:>7.1f} {b['y0']:>7.1f} "
+ f"{b['x1']:>7.1f} {b['y1']:>7.1f} {text_preview}"
+ )
if len(blocks) > 10:
print(f"\n ... and {len(blocks) - 10} more blocks")
@@ -164,6 +169,7 @@ def demo_block_extraction():
# 4. Structured dict extraction (spans with font info)
# ---------------------------------------------------------------------------
+
def extract_structured_dict(pdf_path: Path, page_num: int = 0) -> dict:
"""
Extract page content as a structured dictionary with full font info.
@@ -198,14 +204,14 @@ def demo_structured_extraction():
for span in line["spans"]:
fonts_seen.add((span["font"], round(span["size"], 1)))
- print(f"\n Unique font/size combinations found:")
+ print("\n Unique font/size combinations found:")
for font, size in sorted(fonts_seen, key=lambda x: (-x[1], x[0])):
print(f" {font:<35s} size={size}")
# Show first few spans with full detail
- print(f"\n First 8 text spans with details:")
+ print("\n First 8 text spans with details:")
print(f" {'Text':<35s} {'Font':<25s} {'Size':>5s} {'Bold':>5s}")
- print(f" {'-'*35} {'-'*25} {'-'*5} {'-'*5}")
+ print(f" {'-' * 35} {'-' * 25} {'-' * 5} {'-' * 5}")
count = 0
for block in data["blocks"]:
if block["type"] != 0:
@@ -232,6 +238,7 @@ def demo_structured_extraction():
# 5. Fast batch extraction
# ---------------------------------------------------------------------------
+
def demo_batch_extraction():
"""Demonstrate fast batch extraction of all pages at once."""
print("\n" + "=" * 70)
@@ -262,12 +269,9 @@ def demo_batch_extraction():
texts_c = [page.get_text("rawdict") for page in doc3]
time_c = time.perf_counter() - start
- print(f"\n Page-by-page (text mode): {time_a*1000:>8.2f} ms "
- f"({sum(len(t) for t in texts_a):,} chars)")
- print(f" Page-by-page (sorted text): {time_b*1000:>8.2f} ms "
- f"({sum(len(t) for t in texts_b):,} chars)")
- print(f" Page-by-page (rawdict mode): {time_c*1000:>8.2f} ms "
- f"({len(texts_c)} page dicts)")
+ print(f"\n Page-by-page (text mode): {time_a * 1000:>8.2f} ms ({sum(len(t) for t in texts_a):,} chars)")
+ print(f" Page-by-page (sorted text): {time_b * 1000:>8.2f} ms ({sum(len(t) for t in texts_b):,} chars)")
+ print(f" Page-by-page (rawdict mode): {time_c * 1000:>8.2f} ms ({len(texts_c)} page dicts)")
doc.close()
doc2.close()
@@ -282,6 +286,7 @@ def demo_batch_extraction():
# 6. Practical: Detect headings by font size
# ---------------------------------------------------------------------------
+
def detect_headings(pdf_path: Path) -> list[dict]:
"""
Use font-size analysis to detect headings automatically.
@@ -315,13 +320,15 @@ def detect_headings(pdf_path: Path) -> list[dict]:
for line in block["lines"]:
for span in line["spans"]:
if span["size"] > body_size + 1 and span["text"].strip():
- headings.append({
- "page": page_num + 1,
- "text": span["text"].strip(),
- "font_size": round(span["size"], 1),
- "font": span["font"],
- "y_position": round(span["origin"][1], 1),
- })
+ headings.append(
+ {
+ "page": page_num + 1,
+ "text": span["text"].strip(),
+ "font_size": round(span["size"], 1),
+ "font": span["font"],
+ "y_position": round(span["origin"][1], 1),
+ }
+ )
doc.close()
return headings
@@ -336,7 +343,7 @@ def demo_heading_detection():
headings = detect_headings(SIMPLE_TEXT_PDF)
print(f"\n Headings detected: {len(headings)}")
print(f"\n {'Page':>4s} {'Size':>5s} Text")
- print(f" {'-'*4} {'-'*5} {'-'*50}")
+ print(f" {'-' * 4} {'-' * 5} {'-' * 50}")
for h in headings:
print(f" {h['page']:>4d} {h['font_size']:>5.1f} {h['text'][:60]}")
diff --git a/unstructured_documents/01_pdf/04_table_extraction.py b/unstructured_documents/01_pdf/04_table_extraction.py
index bac9552..38d2083 100644
--- a/unstructured_documents/01_pdf/04_table_extraction.py
+++ b/unstructured_documents/01_pdf/04_table_extraction.py
@@ -33,6 +33,7 @@
# Core extraction
# ---------------------------------------------------------------------------
+
def extract_all_tables(pdf_path: Path) -> list[dict]:
"""
Extract every table from every page of a PDF.
@@ -54,18 +55,18 @@ def extract_all_tables(pdf_path: Path) -> list[dict]:
# Clean cell values: replace None with empty string, strip whitespace
cleaned = []
for row in table:
- cleaned.append([
- (cell.strip() if cell else "") for cell in row
- ])
+ cleaned.append([(cell.strip() if cell else "") for cell in row])
header = cleaned[0]
rows = cleaned[1:]
- results.append({
- "page": page_num + 1,
- "table_index": table_idx,
- "header": header,
- "rows": rows,
- "raw": table,
- })
+ results.append(
+ {
+ "page": page_num + 1,
+ "table_index": table_idx,
+ "header": header,
+ "rows": rows,
+ "raw": table,
+ }
+ )
return results
@@ -73,6 +74,7 @@ def extract_all_tables(pdf_path: Path) -> list[dict]:
# Format converters
# ---------------------------------------------------------------------------
+
def table_to_list_of_dicts(header: list[str], rows: list[list[str]]) -> list[dict]:
"""Convert a table to a list of dictionaries (one per row)."""
return [dict(zip(header, row)) for row in rows]
@@ -145,6 +147,7 @@ def table_to_natural_language(
# Demonstrations
# ---------------------------------------------------------------------------
+
def demo_extract_tables():
"""Extract and display all tables."""
print("=" * 70)
@@ -266,12 +269,14 @@ def demo_rag_preparation():
# Strategy 1: One passage per table (markdown format)
md_passage = f"## {title}\n\n" + table_to_markdown(t["header"], t["rows"])
- all_passages.append({
- "type": "table_markdown",
- "source": f"tables.pdf, page {t['page']}",
- "title": title,
- "content": md_passage,
- })
+ all_passages.append(
+ {
+ "type": "table_markdown",
+ "source": f"tables.pdf, page {t['page']}",
+ "title": title,
+ "content": md_passage,
+ }
+ )
# Strategy 2: One passage per row (natural language)
for row_idx, row in enumerate(t["rows"]):
@@ -280,12 +285,14 @@ def demo_rag_preparation():
if val:
parts.append(f"{col} is {val}")
sentence = f"In the {title} table: " + ", ".join(parts) + "."
- all_passages.append({
- "type": "table_row_nl",
- "source": f"tables.pdf, page {t['page']}, row {row_idx + 1}",
- "title": title,
- "content": sentence,
- })
+ all_passages.append(
+ {
+ "type": "table_row_nl",
+ "source": f"tables.pdf, page {t['page']}, row {row_idx + 1}",
+ "title": title,
+ "content": sentence,
+ }
+ )
print(f"\n Total RAG passages generated: {len(all_passages)}")
diff --git a/unstructured_documents/01_pdf/05_ocr_extraction.py b/unstructured_documents/01_pdf/05_ocr_extraction.py
index 9b31e1f..de2d50f 100644
--- a/unstructured_documents/01_pdf/05_ocr_extraction.py
+++ b/unstructured_documents/01_pdf/05_ocr_extraction.py
@@ -60,6 +60,7 @@ def check_dependencies() -> tuple[bool, list[str]]:
if "pytesseract" not in [m.split()[0] for m in missing]:
try:
import pytesseract
+
pytesseract.get_tesseract_version()
except Exception:
missing.append("tesseract-ocr system binary (brew install tesseract / apt install tesseract-ocr)")
@@ -67,7 +68,7 @@ def check_dependencies() -> tuple[bool, list[str]]:
# Check poppler (needed by pdf2image)
if "pdf2image" not in [m.split()[0] for m in missing]:
try:
- from pdf2image import convert_from_path
+ from pdf2image import convert_from_path  # noqa: F401
# Try a quick conversion to check poppler is available
# We do not actually convert here; just check the import path
except Exception:
@@ -103,6 +104,7 @@ def print_installation_guide(missing: list[str]):
# OCR extraction functions
# ---------------------------------------------------------------------------
+
def ocr_extract_text(pdf_path: Path, dpi: int = 300) -> list[dict]:
"""
Extract text from a PDF using OCR.
@@ -117,8 +119,8 @@ def ocr_extract_text(pdf_path: Path, dpi: int = 300) -> list[dict]:
dpi: Resolution for page-to-image conversion (higher = better accuracy
but slower). 300 DPI is a good default.
"""
- from pdf2image import convert_from_path
import pytesseract
+ from pdf2image import convert_from_path
# Convert PDF pages to images
images = convert_from_path(str(pdf_path), dpi=dpi)
@@ -132,19 +134,18 @@ def ocr_extract_text(pdf_path: Path, dpi: int = 300) -> list[dict]:
data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
# Calculate average confidence (excluding empty/low-confidence entries)
- confidences = [
- int(c) for c, t in zip(data["conf"], data["text"])
- if int(c) > 0 and t.strip()
- ]
+ confidences = [int(c) for c, t in zip(data["conf"], data["text"]) if int(c) > 0 and t.strip()]
avg_confidence = sum(confidences) / len(confidences) if confidences else 0
- results.append({
- "page": i + 1,
- "text": text,
- "image_size": image.size,
- "avg_confidence": round(avg_confidence, 1),
- "word_count": len([t for t in data["text"] if t.strip()]),
- })
+ results.append(
+ {
+ "page": i + 1,
+ "text": text,
+ "image_size": image.size,
+ "avg_confidence": round(avg_confidence, 1),
+ "word_count": len([t for t in data["text"] if t.strip()]),
+ }
+ )
return results
@@ -161,9 +162,9 @@ def ocr_extract_with_preprocessing(pdf_path: Path, dpi: int = 300) -> list[dict]
These steps are especially helpful for low-quality scans.
"""
- from pdf2image import convert_from_path
- from PIL import Image, ImageFilter
import pytesseract
+ from pdf2image import convert_from_path
+ from PIL import ImageFilter
images = convert_from_path(str(pdf_path), dpi=dpi)
@@ -184,11 +185,13 @@ def ocr_extract_with_preprocessing(pdf_path: Path, dpi: int = 300) -> list[dict]
# Extract text from preprocessed image
text = pytesseract.image_to_string(binarized)
- results.append({
- "page": i + 1,
- "text": text,
- "preprocessing": "grayscale -> sharpen -> binarize",
- })
+ results.append(
+ {
+ "page": i + 1,
+ "text": text,
+ "preprocessing": "grayscale -> sharpen -> binarize",
+ }
+ )
return results
@@ -201,8 +204,8 @@ def ocr_extract_layout(pdf_path: Path, dpi: int = 300) -> list[dict]:
word, line, and paragraph. This is useful for preserving document
layout from scanned PDFs.
"""
- from pdf2image import convert_from_path
import pytesseract
+ from pdf2image import convert_from_path
images = convert_from_path(str(pdf_path), dpi=dpi)
@@ -218,7 +221,11 @@ def ocr_extract_layout(pdf_path: Path, dpi: int = 300) -> list[dict]:
lines = {}
for j in range(len(data["text"])):
if data["text"][j].strip():
- line_key = (data["block_num"][j], data["par_num"][j], data["line_num"][j])
+ line_key = (
+ data["block_num"][j],
+ data["par_num"][j],
+ data["line_num"][j],
+ )
if line_key not in lines:
lines[line_key] = {
"words": [],
@@ -227,18 +234,20 @@ def ocr_extract_layout(pdf_path: Path, dpi: int = 300) -> list[dict]:
}
lines[line_key]["words"].append(data["text"][j])
- results.append({
- "page": i + 1,
- "num_lines": len(lines),
- "lines": [
- {
- "text": " ".join(line["words"]),
- "position": {"left": line["left"], "top": line["top"]},
- }
- for line in lines.values()
- ],
- "hocr_size": len(hocr),
- })
+ results.append(
+ {
+ "page": i + 1,
+ "num_lines": len(lines),
+ "lines": [
+ {
+ "text": " ".join(line["words"]),
+ "position": {"left": line["left"], "top": line["top"]},
+ }
+ for line in lines.values()
+ ],
+ "hocr_size": len(hocr),
+ }
+ )
return results
@@ -247,6 +256,7 @@ def ocr_extract_layout(pdf_path: Path, dpi: int = 300) -> list[dict]:
# Demonstrations
# ---------------------------------------------------------------------------
+
def demo_basic_ocr():
"""Demonstrate basic OCR extraction."""
print("\n" + "=" * 70)
@@ -297,7 +307,7 @@ def demo_layout_ocr():
print(f"\n --- Page {r['page']} ---")
print(f" Lines detected: {r['num_lines']}")
print(f" hOCR output size: {r['hocr_size']:,} bytes")
- print(f"\n First 10 lines with positions:")
+ print("\n First 10 lines with positions:")
for line in r["lines"][:10]:
pos = line["position"]
print(f" [{pos['left']:>4d}, {pos['top']:>4d}] {line['text'][:60]}")
@@ -318,6 +328,7 @@ def demo_ocr_vs_text():
# Direct extraction with PyMuPDF
try:
import fitz
+
doc = fitz.open(str(SIMPLE_TEXT_PDF))
direct_text = "\n".join(page.get_text() for page in doc)
doc.close()
diff --git a/unstructured_documents/01_pdf/06_comparison.py b/unstructured_documents/01_pdf/06_comparison.py
index 59e8b0f..8c1afdd 100644
--- a/unstructured_documents/01_pdf/06_comparison.py
+++ b/unstructured_documents/01_pdf/06_comparison.py
@@ -34,6 +34,7 @@
# Extraction wrappers
# ---------------------------------------------------------------------------
+
def extract_pypdf(pdf_path: Path) -> str:
"""Extract text using pypdf."""
reader = PdfReader(str(pdf_path))
@@ -90,6 +91,7 @@ def extract_pymupdf_sorted(pdf_path: Path) -> str:
# Timing utility
# ---------------------------------------------------------------------------
+
def time_extraction(func, pdf_path: Path, runs: int = 5) -> tuple[str, float]:
"""Run an extraction function multiple times and return text + avg time."""
times = []
@@ -107,6 +109,7 @@ def time_extraction(func, pdf_path: Path, runs: int = 5) -> tuple[str, float]:
# 1. Text extraction comparison
# ---------------------------------------------------------------------------
+
def compare_text_extraction():
"""Compare all methods on simple_text.pdf."""
print("=" * 70)
@@ -127,13 +130,15 @@ def compare_text_extraction():
text, avg_ms = time_extraction(func, SIMPLE_TEXT_PDF, runs=5)
word_count = len(text.split())
line_count = len(text.strip().split("\n"))
- results.append({
- "Method": name,
- "Chars": f"{len(text):,}",
- "Words": f"{word_count:,}",
- "Lines": f"{line_count:,}",
- "Avg Time (ms)": f"{avg_ms:.1f}",
- })
+ results.append(
+ {
+ "Method": name,
+ "Chars": f"{len(text):,}",
+ "Words": f"{word_count:,}",
+ "Lines": f"{line_count:,}",
+ "Avg Time (ms)": f"{avg_ms:.1f}",
+ }
+ )
texts[name] = text
# Print comparison table
@@ -156,6 +161,7 @@ def compare_text_extraction():
# 2. Table extraction comparison
# ---------------------------------------------------------------------------
+
def compare_table_extraction():
"""Compare table extraction on tables.pdf."""
print("\n" + "=" * 70)
@@ -209,18 +215,30 @@ def compare_table_extraction():
print(f" Text extracted: {len(mupdf_text):,} chars")
print(f" Time: {mupdf_time:.1f} ms")
- print(f" Note: PyMuPDF extracts table content as text but does not")
- print(f" detect table structure (rows/columns). Use pdfplumber for that.")
+ print(" Note: PyMuPDF extracts table content as text but does not")
+ print(" detect table structure (rows/columns). Use pdfplumber for that.")
# Summary table
print("\n --- Table Extraction Summary ---")
summary = [
- {"Method": "pdfplumber (default)", "Tables Found": len(plumber_tables),
- "Time (ms)": f"{plumber_time:.1f}", "Structured": "Yes"},
- {"Method": "pdfplumber (text strategy)", "Tables Found": len(custom_tables),
- "Time (ms)": f"{custom_time:.1f}", "Structured": "Yes"},
- {"Method": "PyMuPDF (text only)", "Tables Found": "N/A",
- "Time (ms)": f"{mupdf_time:.1f}", "Structured": "No"},
+ {
+ "Method": "pdfplumber (default)",
+ "Tables Found": len(plumber_tables),
+ "Time (ms)": f"{plumber_time:.1f}",
+ "Structured": "Yes",
+ },
+ {
+ "Method": "pdfplumber (text strategy)",
+ "Tables Found": len(custom_tables),
+ "Time (ms)": f"{custom_time:.1f}",
+ "Structured": "Yes",
+ },
+ {
+ "Method": "PyMuPDF (text only)",
+ "Tables Found": "N/A",
+ "Time (ms)": f"{mupdf_time:.1f}",
+ "Structured": "No",
+ },
]
print(f"\n{tabulate(summary, headers='keys', tablefmt='grid')}")
@@ -229,6 +247,7 @@ def compare_table_extraction():
# 3. Mixed content comparison
# ---------------------------------------------------------------------------
+
def compare_mixed_content():
"""Compare methods on mixed_content.pdf (text + tables + bullets)."""
print("\n" + "=" * 70)
@@ -248,12 +267,14 @@ def compare_mixed_content():
results = []
for name, func in methods:
text, avg_ms = time_extraction(func, MIXED_CONTENT_PDF, runs=5)
- results.append({
- "Method": name,
- "Chars": f"{len(text):,}",
- "Words": f"{len(text.split()):,}",
- "Time (ms)": f"{avg_ms:.1f}",
- })
+ results.append(
+ {
+ "Method": name,
+ "Chars": f"{len(text):,}",
+ "Words": f"{len(text.split()):,}",
+ "Time (ms)": f"{avg_ms:.1f}",
+ }
+ )
print(f"\n{tabulate(results, headers='keys', tablefmt='grid')}")
@@ -267,6 +288,7 @@ def compare_mixed_content():
# 4. Recommendation summary
# ---------------------------------------------------------------------------
+
def print_recommendations():
"""Print a summary of when to use each method."""
print("\n" + "=" * 70)
diff --git a/unstructured_documents/01_pdf/sample_docs/generate_samples.py b/unstructured_documents/01_pdf/sample_docs/generate_samples.py
index 34fe06c..a207240 100644
--- a/unstructured_documents/01_pdf/sample_docs/generate_samples.py
+++ b/unstructured_documents/01_pdf/sample_docs/generate_samples.py
@@ -14,10 +14,9 @@
from pathlib import Path
from reportlab.lib import colors
-from reportlab.lib.enums import TA_CENTER, TA_JUSTIFY, TA_LEFT
+from reportlab.lib.enums import TA_CENTER, TA_JUSTIFY
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
-from reportlab.lib.units import inch
from reportlab.platypus import (
Flowable,
Frame,
@@ -37,29 +36,48 @@
# 1) simple_text.pdf
# ---------------------------------------------------------------------------
+
def generate_simple_text():
"""Create a multi-page document about Artificial Intelligence."""
path = SAMPLE_DIR / "simple_text.pdf"
- doc = SimpleDocTemplate(str(path), pagesize=letter,
- topMargin=72, bottomMargin=72,
- leftMargin=72, rightMargin=72)
+ doc = SimpleDocTemplate(
+ str(path),
+ pagesize=letter,
+ topMargin=72,
+ bottomMargin=72,
+ leftMargin=72,
+ rightMargin=72,
+ )
styles = getSampleStyleSheet()
title_style = ParagraphStyle(
- "CustomTitle", parent=styles["Title"], fontSize=24,
- spaceAfter=20, alignment=TA_CENTER,
+ "CustomTitle",
+ parent=styles["Title"],
+ fontSize=24,
+ spaceAfter=20,
+ alignment=TA_CENTER,
)
heading_style = ParagraphStyle(
- "CustomHeading", parent=styles["Heading1"], fontSize=16,
- spaceBefore=18, spaceAfter=10,
+ "CustomHeading",
+ parent=styles["Heading1"],
+ fontSize=16,
+ spaceBefore=18,
+ spaceAfter=10,
)
subheading_style = ParagraphStyle(
- "CustomSubheading", parent=styles["Heading2"], fontSize=13,
- spaceBefore=14, spaceAfter=8,
+ "CustomSubheading",
+ parent=styles["Heading2"],
+ fontSize=13,
+ spaceBefore=14,
+ spaceAfter=8,
)
body_style = ParagraphStyle(
- "CustomBody", parent=styles["BodyText"], fontSize=11,
- leading=16, spaceAfter=10, alignment=TA_JUSTIFY,
+ "CustomBody",
+ parent=styles["BodyText"],
+ fontSize=11,
+ leading=16,
+ spaceAfter=10,
+ alignment=TA_JUSTIFY,
)
story = []
@@ -70,245 +88,293 @@ def generate_simple_text():
# ---- Section 1 ----
story.append(Paragraph("1. Introduction to Artificial Intelligence", heading_style))
- story.append(Paragraph(
- "Artificial Intelligence (AI) is a branch of computer science that aims to create "
- "systems capable of performing tasks that normally require human intelligence. These "
- "tasks include visual perception, speech recognition, decision-making, and language "
- "translation. The field was founded on the claim that human intelligence can be so "
- "precisely described that a machine can be made to simulate it.",
- body_style,
- ))
- story.append(Paragraph(
- "The concept of artificial intelligence has been part of human imagination for "
- "centuries, but the formal field of AI research was established in 1956 at the "
- "Dartmouth Conference. Since then, AI has experienced several cycles of optimism "
- "and disappointment, known as AI winters, followed by renewed enthusiasm and "
- "funding. The current era of AI, driven by deep learning and big data, has "
- "achieved remarkable breakthroughs across numerous domains.",
- body_style,
- ))
- story.append(Paragraph(
- "Modern AI systems are powered by machine learning algorithms that learn patterns "
- "from vast amounts of data. Unlike traditional software that follows explicit "
- "rules written by programmers, machine learning systems improve their performance "
- "through experience. This paradigm shift has enabled applications that were "
- "previously thought impossible, from self-driving cars to protein structure "
- "prediction.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "Artificial Intelligence (AI) is a branch of computer science that aims to create "
+ "systems capable of performing tasks that normally require human intelligence. These "
+ "tasks include visual perception, speech recognition, decision-making, and language "
+ "translation. The field was founded on the claim that human intelligence can be so "
+ "precisely described that a machine can be made to simulate it.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "The concept of artificial intelligence has been part of human imagination for "
+ "centuries, but the formal field of AI research was established in 1956 at the "
+ "Dartmouth Conference. Since then, AI has experienced several cycles of optimism "
+ "and disappointment, known as AI winters, followed by renewed enthusiasm and "
+ "funding. The current era of AI, driven by deep learning and big data, has "
+ "achieved remarkable breakthroughs across numerous domains.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "Modern AI systems are powered by machine learning algorithms that learn patterns "
+ "from vast amounts of data. Unlike traditional software that follows explicit "
+ "rules written by programmers, machine learning systems improve their performance "
+ "through experience. This paradigm shift has enabled applications that were "
+ "previously thought impossible, from self-driving cars to protein structure "
+ "prediction.",
+ body_style,
+ )
+ )
# ---- Section 2 ----
story.append(Paragraph("2. Machine Learning Fundamentals", heading_style))
story.append(Paragraph("2.1 Supervised Learning", subheading_style))
- story.append(Paragraph(
- "Supervised learning is the most common form of machine learning. In this "
- "paradigm, the algorithm is trained on labeled data, where each input example "
- "is paired with the correct output. The model learns to map inputs to outputs "
- "by minimizing the difference between its predictions and the actual labels. "
- "Common supervised learning tasks include classification, where the goal is to "
- "assign inputs to discrete categories, and regression, where the goal is to "
- "predict continuous values.",
- body_style,
- ))
- story.append(Paragraph(
- "Popular supervised learning algorithms include linear regression, logistic "
- "regression, support vector machines, decision trees, random forests, and "
- "neural networks. The choice of algorithm depends on the nature of the data, "
- "the complexity of the relationship between inputs and outputs, and the amount "
- "of available training data. Cross-validation and regularization techniques "
- "help prevent overfitting, where the model memorizes training data rather than "
- "learning generalizable patterns.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "Supervised learning is the most common form of machine learning. In this "
+ "paradigm, the algorithm is trained on labeled data, where each input example "
+ "is paired with the correct output. The model learns to map inputs to outputs "
+ "by minimizing the difference between its predictions and the actual labels. "
+ "Common supervised learning tasks include classification, where the goal is to "
+ "assign inputs to discrete categories, and regression, where the goal is to "
+ "predict continuous values.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "Popular supervised learning algorithms include linear regression, logistic "
+ "regression, support vector machines, decision trees, random forests, and "
+ "neural networks. The choice of algorithm depends on the nature of the data, "
+ "the complexity of the relationship between inputs and outputs, and the amount "
+ "of available training data. Cross-validation and regularization techniques "
+ "help prevent overfitting, where the model memorizes training data rather than "
+ "learning generalizable patterns.",
+ body_style,
+ )
+ )
story.append(Paragraph("2.2 Unsupervised Learning", subheading_style))
- story.append(Paragraph(
- "Unsupervised learning works with unlabeled data, seeking to discover hidden "
- "patterns and structures. Clustering algorithms such as K-means, hierarchical "
- "clustering, and DBSCAN group similar data points together. Dimensionality "
- "reduction techniques like PCA and t-SNE help visualize high-dimensional data. "
- "Generative models learn the underlying distribution of data and can create "
- "new samples that resemble the training data.",
- body_style,
- ))
- story.append(Paragraph(
- "Unsupervised learning is particularly valuable when labeled data is scarce or "
- "expensive to obtain. It is used extensively in customer segmentation, anomaly "
- "detection, topic modeling, and feature extraction. Auto-encoders and variational "
- "auto-encoders are neural network architectures commonly used for unsupervised "
- "representation learning.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "Unsupervised learning works with unlabeled data, seeking to discover hidden "
+ "patterns and structures. Clustering algorithms such as K-means, hierarchical "
+ "clustering, and DBSCAN group similar data points together. Dimensionality "
+ "reduction techniques like PCA and t-SNE help visualize high-dimensional data. "
+ "Generative models learn the underlying distribution of data and can create "
+ "new samples that resemble the training data.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "Unsupervised learning is particularly valuable when labeled data is scarce or "
+ "expensive to obtain. It is used extensively in customer segmentation, anomaly "
+ "detection, topic modeling, and feature extraction. Auto-encoders and variational "
+ "auto-encoders are neural network architectures commonly used for unsupervised "
+ "representation learning.",
+ body_style,
+ )
+ )
story.append(Paragraph("2.3 Reinforcement Learning", subheading_style))
- story.append(Paragraph(
- "Reinforcement learning (RL) involves an agent that learns to make decisions by "
- "interacting with an environment. The agent receives rewards or penalties based on "
- "its actions and aims to maximize cumulative reward over time. RL has achieved "
- "impressive results in game playing, with systems like AlphaGo defeating world "
- "champions in Go, and AlphaStar reaching grandmaster level in StarCraft II.",
- body_style,
- ))
- story.append(Paragraph(
- "Key concepts in reinforcement learning include the Markov Decision Process (MDP), "
- "value functions, policy gradients, and the exploration-exploitation trade-off. "
- "Deep reinforcement learning combines neural networks with RL algorithms, enabling "
- "agents to handle high-dimensional state spaces. Applications include robotics "
- "control, recommendation systems, resource management, and autonomous driving.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "Reinforcement learning (RL) involves an agent that learns to make decisions by "
+ "interacting with an environment. The agent receives rewards or penalties based on "
+ "its actions and aims to maximize cumulative reward over time. RL has achieved "
+ "impressive results in game playing, with systems like AlphaGo defeating world "
+ "champions in Go, and AlphaStar reaching grandmaster level in StarCraft II.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "Key concepts in reinforcement learning include the Markov Decision Process (MDP), "
+ "value functions, policy gradients, and the exploration-exploitation trade-off. "
+ "Deep reinforcement learning combines neural networks with RL algorithms, enabling "
+ "agents to handle high-dimensional state spaces. Applications include robotics "
+ "control, recommendation systems, resource management, and autonomous driving.",
+ body_style,
+ )
+ )
# ---- Section 3 ----
story.append(Paragraph("3. Deep Learning and Neural Networks", heading_style))
- story.append(Paragraph(
- "Deep learning is a subset of machine learning based on artificial neural networks "
- "with multiple layers. These deep networks can learn hierarchical representations "
- "of data, with each layer capturing increasingly abstract features. The input layer "
- "receives raw data, hidden layers process and transform it, and the output layer "
- "produces the final result. Backpropagation is the primary algorithm for training "
- "deep neural networks, computing gradients of the loss function with respect to "
- "each weight through the chain rule of calculus.",
- body_style,
- ))
- story.append(Paragraph(
- "Convolutional Neural Networks (CNNs) are specialized for processing grid-like data "
- "such as images. They use convolutional layers to automatically learn spatial "
- "hierarchies of features, from edges and textures to objects and scenes. Recurrent "
- "Neural Networks (RNNs) and their variants, Long Short-Term Memory (LSTM) and "
- "Gated Recurrent Unit (GRU), are designed for sequential data such as text and "
- "time series.",
- body_style,
- ))
- story.append(Paragraph(
- "The Transformer architecture, introduced in 2017, has revolutionized natural "
- "language processing and is increasingly used in computer vision and other domains. "
- "Transformers rely on self-attention mechanisms to capture long-range dependencies "
- "in data without the sequential processing limitations of RNNs. Large language "
- "models (LLMs) like GPT and BERT are based on the Transformer architecture and "
- "have demonstrated remarkable capabilities in text generation, translation, "
- "summarization, and question answering.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "Deep learning is a subset of machine learning based on artificial neural networks "
+ "with multiple layers. These deep networks can learn hierarchical representations "
+ "of data, with each layer capturing increasingly abstract features. The input layer "
+ "receives raw data, hidden layers process and transform it, and the output layer "
+ "produces the final result. Backpropagation is the primary algorithm for training "
+ "deep neural networks, computing gradients of the loss function with respect to "
+ "each weight through the chain rule of calculus.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "Convolutional Neural Networks (CNNs) are specialized for processing grid-like data "
+ "such as images. They use convolutional layers to automatically learn spatial "
+ "hierarchies of features, from edges and textures to objects and scenes. Recurrent "
+ "Neural Networks (RNNs) and their variants, Long Short-Term Memory (LSTM) and "
+ "Gated Recurrent Unit (GRU), are designed for sequential data such as text and "
+ "time series.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "The Transformer architecture, introduced in 2017, has revolutionized natural "
+ "language processing and is increasingly used in computer vision and other domains. "
+ "Transformers rely on self-attention mechanisms to capture long-range dependencies "
+ "in data without the sequential processing limitations of RNNs. Large language "
+ "models (LLMs) like GPT and BERT are based on the Transformer architecture and "
+ "have demonstrated remarkable capabilities in text generation, translation, "
+ "summarization, and question answering.",
+ body_style,
+ )
+ )
story.append(PageBreak())
# ---- Section 4 ----
story.append(Paragraph("4. Natural Language Processing", heading_style))
- story.append(Paragraph(
- "Natural Language Processing (NLP) is a field at the intersection of computer "
- "science, artificial intelligence, and linguistics. It focuses on enabling computers "
- "to understand, interpret, and generate human language. NLP encompasses a wide range "
- "of tasks, including text classification, named entity recognition, sentiment "
- "analysis, machine translation, text summarization, and question answering.",
- body_style,
- ))
- story.append(Paragraph(
- "The evolution of NLP has progressed from rule-based systems to statistical methods "
- "and finally to deep learning approaches. Word embeddings like Word2Vec, GloVe, and "
- "FastText represent words as dense vectors in a continuous space, capturing semantic "
- "relationships. Contextual embeddings from models like ELMo, BERT, and GPT provide "
- "word representations that vary based on context, significantly improving performance "
- "on downstream tasks.",
- body_style,
- ))
- story.append(Paragraph(
- "Retrieval-Augmented Generation (RAG) is an emerging paradigm that combines the "
- "strengths of retrieval systems and generative models. In RAG, a retrieval component "
- "first finds relevant documents from a knowledge base, and then a generative model "
- "uses those documents as context to produce accurate, grounded responses. This "
- "approach helps mitigate hallucination in language models and enables them to access "
- "up-to-date information without retraining.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "Natural Language Processing (NLP) is a field at the intersection of computer "
+ "science, artificial intelligence, and linguistics. It focuses on enabling computers "
+ "to understand, interpret, and generate human language. NLP encompasses a wide range "
+ "of tasks, including text classification, named entity recognition, sentiment "
+ "analysis, machine translation, text summarization, and question answering.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "The evolution of NLP has progressed from rule-based systems to statistical methods "
+ "and finally to deep learning approaches. Word embeddings like Word2Vec, GloVe, and "
+ "FastText represent words as dense vectors in a continuous space, capturing semantic "
+ "relationships. Contextual embeddings from models like ELMo, BERT, and GPT provide "
+ "word representations that vary based on context, significantly improving performance "
+ "on downstream tasks.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "Retrieval-Augmented Generation (RAG) is an emerging paradigm that combines the "
+ "strengths of retrieval systems and generative models. In RAG, a retrieval component "
+ "first finds relevant documents from a knowledge base, and then a generative model "
+ "uses those documents as context to produce accurate, grounded responses. This "
+ "approach helps mitigate hallucination in language models and enables them to access "
+ "up-to-date information without retraining.",
+ body_style,
+ )
+ )
# ---- Section 5 ----
story.append(Paragraph("5. Computer Vision", heading_style))
- story.append(Paragraph(
- "Computer vision is a field of AI that enables computers to interpret and understand "
- "visual information from the world. Key tasks include image classification, object "
- "detection, semantic segmentation, instance segmentation, and image generation. "
- "The field has been transformed by deep learning, particularly convolutional neural "
- "networks, which have achieved human-level or superhuman performance on many "
- "benchmark tasks.",
- body_style,
- ))
- story.append(Paragraph(
- "Object detection algorithms such as YOLO, SSD, and Faster R-CNN can identify and "
- "localize multiple objects in an image in real time. Image segmentation models like "
- "U-Net and Mask R-CNN assign labels to every pixel in an image, enabling precise "
- "understanding of scene composition. Generative Adversarial Networks (GANs) and "
- "diffusion models can create photorealistic images from text descriptions, "
- "opening up new possibilities in creative applications.",
- body_style,
- ))
- story.append(Paragraph(
- "Transfer learning has been crucial for computer vision, allowing models pre-trained "
- "on large datasets like ImageNet to be fine-tuned for specific tasks with limited "
- "data. Vision Transformers (ViTs) apply the Transformer architecture to images by "
- "treating image patches as tokens, achieving competitive or superior results "
- "compared to CNNs. Multi-modal models that combine vision and language understanding "
- "can perform tasks like visual question answering and image captioning.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "Computer vision is a field of AI that enables computers to interpret and understand "
+ "visual information from the world. Key tasks include image classification, object "
+ "detection, semantic segmentation, instance segmentation, and image generation. "
+ "The field has been transformed by deep learning, particularly convolutional neural "
+ "networks, which have achieved human-level or superhuman performance on many "
+ "benchmark tasks.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "Object detection algorithms such as YOLO, SSD, and Faster R-CNN can identify and "
+ "localize multiple objects in an image in real time. Image segmentation models like "
+ "U-Net and Mask R-CNN assign labels to every pixel in an image, enabling precise "
+ "understanding of scene composition. Generative Adversarial Networks (GANs) and "
+ "diffusion models can create photorealistic images from text descriptions, "
+ "opening up new possibilities in creative applications.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "Transfer learning has been crucial for computer vision, allowing models pre-trained "
+ "on large datasets like ImageNet to be fine-tuned for specific tasks with limited "
+ "data. Vision Transformers (ViTs) apply the Transformer architecture to images by "
+ "treating image patches as tokens, achieving competitive or superior results "
+ "compared to CNNs. Multi-modal models that combine vision and language understanding "
+ "can perform tasks like visual question answering and image captioning.",
+ body_style,
+ )
+ )
# ---- Section 6 ----
story.append(Paragraph("6. Ethics and Societal Impact", heading_style))
- story.append(Paragraph(
- "As AI systems become more powerful and pervasive, ethical considerations become "
- "increasingly important. Key concerns include algorithmic bias, where AI systems "
- "can perpetuate or amplify existing societal biases present in training data. "
- "Fairness, accountability, and transparency (FAT) are essential principles for "
- "responsible AI development. Privacy concerns arise from the massive data "
- "collection required to train AI models, and the potential for surveillance "
- "and tracking.",
- body_style,
- ))
- story.append(Paragraph(
- "The impact of AI on employment is a subject of ongoing debate. While AI automates "
- "certain tasks, it also creates new jobs and augments human capabilities. The "
- "challenge lies in managing the transition and ensuring that the benefits of AI "
- "are distributed equitably. Education and workforce development programs are "
- "essential to prepare workers for an AI-driven economy.",
- body_style,
- ))
- story.append(Paragraph(
- "AI safety research focuses on ensuring that advanced AI systems remain aligned "
- "with human values and intentions. This includes work on interpretability, making "
- "AI decisions understandable to humans; robustness, ensuring AI systems work "
- "reliably under various conditions; and alignment, ensuring AI goals match human "
- "goals. International cooperation and governance frameworks are being developed "
- "to address the global implications of AI technology.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "As AI systems become more powerful and pervasive, ethical considerations become "
+ "increasingly important. Key concerns include algorithmic bias, where AI systems "
+ "can perpetuate or amplify existing societal biases present in training data. "
+ "Fairness, accountability, and transparency (FAT) are essential principles for "
+ "responsible AI development. Privacy concerns arise from the massive data "
+ "collection required to train AI models, and the potential for surveillance "
+ "and tracking.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "The impact of AI on employment is a subject of ongoing debate. While AI automates "
+ "certain tasks, it also creates new jobs and augments human capabilities. The "
+ "challenge lies in managing the transition and ensuring that the benefits of AI "
+ "are distributed equitably. Education and workforce development programs are "
+ "essential to prepare workers for an AI-driven economy.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "AI safety research focuses on ensuring that advanced AI systems remain aligned "
+ "with human values and intentions. This includes work on interpretability, making "
+ "AI decisions understandable to humans; robustness, ensuring AI systems work "
+ "reliably under various conditions; and alignment, ensuring AI goals match human "
+ "goals. International cooperation and governance frameworks are being developed "
+ "to address the global implications of AI technology.",
+ body_style,
+ )
+ )
# ---- Section 7 ----
story.append(Paragraph("7. Future Directions", heading_style))
- story.append(Paragraph(
- "The future of AI holds tremendous promise across many fronts. Artificial General "
- "Intelligence (AGI), which would match or exceed human intelligence across all "
- "cognitive tasks, remains a long-term goal of the field. Neuro-symbolic AI aims "
- "to combine the pattern recognition strengths of neural networks with the "
- "reasoning capabilities of symbolic AI systems.",
- body_style,
- ))
- story.append(Paragraph(
- "Edge AI brings intelligence to resource-constrained devices, enabling real-time "
- "processing without cloud connectivity. Quantum machine learning explores the "
- "intersection of quantum computing and AI, potentially offering exponential "
- "speedups for certain types of computations. Federated learning enables "
- "collaborative model training while keeping data decentralized, addressing "
- "privacy concerns in healthcare, finance, and other sensitive domains.",
- body_style,
- ))
- story.append(Paragraph(
- "AI for science is accelerating discoveries in physics, chemistry, biology, and "
- "materials science. Protein structure prediction by AlphaFold has transformed "
- "structural biology. Climate modeling, drug discovery, and mathematical reasoning "
- "are all areas where AI is making significant contributions. As these technologies "
- "mature, the integration of AI into every aspect of human activity will continue "
- "to deepen, making responsible development more important than ever.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "The future of AI holds tremendous promise across many fronts. Artificial General "
+ "Intelligence (AGI), which would match or exceed human intelligence across all "
+ "cognitive tasks, remains a long-term goal of the field. Neuro-symbolic AI aims "
+ "to combine the pattern recognition strengths of neural networks with the "
+ "reasoning capabilities of symbolic AI systems.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "Edge AI brings intelligence to resource-constrained devices, enabling real-time "
+ "processing without cloud connectivity. Quantum machine learning explores the "
+ "intersection of quantum computing and AI, potentially offering exponential "
+ "speedups for certain types of computations. Federated learning enables "
+ "collaborative model training while keeping data decentralized, addressing "
+ "privacy concerns in healthcare, finance, and other sensitive domains.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "AI for science is accelerating discoveries in physics, chemistry, biology, and "
+ "materials science. Protein structure prediction by AlphaFold has transformed "
+ "structural biology. Climate modeling, drug discovery, and mathematical reasoning "
+ "are all areas where AI is making significant contributions. As these technologies "
+ "mature, the integration of AI into every aspect of human activity will continue "
+ "to deepen, making responsible development more important than ever.",
+ body_style,
+ )
+ )
doc.build(story)
print(f" Created: {path}")
@@ -318,44 +384,66 @@ def generate_simple_text():
# 2) tables.pdf
# ---------------------------------------------------------------------------
+
def generate_tables():
"""Create a document with multiple data tables."""
path = SAMPLE_DIR / "tables.pdf"
- doc = SimpleDocTemplate(str(path), pagesize=letter,
- topMargin=72, bottomMargin=72,
- leftMargin=54, rightMargin=54)
+ doc = SimpleDocTemplate(
+ str(path),
+ pagesize=letter,
+ topMargin=72,
+ bottomMargin=72,
+ leftMargin=54,
+ rightMargin=54,
+ )
styles = getSampleStyleSheet()
title_style = ParagraphStyle(
- "TblTitle", parent=styles["Title"], fontSize=20,
- spaceAfter=16, alignment=TA_CENTER,
+ "TblTitle",
+ parent=styles["Title"],
+ fontSize=20,
+ spaceAfter=16,
+ alignment=TA_CENTER,
)
heading_style = ParagraphStyle(
- "TblHeading", parent=styles["Heading2"], fontSize=14,
- spaceBefore=20, spaceAfter=10,
+ "TblHeading",
+ parent=styles["Heading2"],
+ fontSize=14,
+ spaceBefore=20,
+ spaceAfter=10,
)
body_style = ParagraphStyle(
- "TblBody", parent=styles["BodyText"], fontSize=10,
- leading=14, spaceAfter=8,
+ "TblBody",
+ parent=styles["BodyText"],
+ fontSize=10,
+ leading=14,
+ spaceAfter=8,
)
# Common table style
def make_table_style():
- return TableStyle([
- ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#4472C4")),
- ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
- ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
- ("FONTSIZE", (0, 0), (-1, 0), 10),
- ("BOTTOMPADDING", (0, 0), (-1, 0), 8),
- ("TOPPADDING", (0, 0), (-1, 0), 8),
- ("BACKGROUND", (0, 1), (-1, -1), colors.HexColor("#D9E2F3")),
- ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.HexColor("#D9E2F3"), colors.white]),
- ("FONTSIZE", (0, 1), (-1, -1), 9),
- ("GRID", (0, 0), (-1, -1), 0.5, colors.grey),
- ("ALIGN", (0, 0), (-1, -1), "CENTER"),
- ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
- ("TOPPADDING", (0, 1), (-1, -1), 5),
- ("BOTTOMPADDING", (0, 1), (-1, -1), 5),
- ])
+ return TableStyle(
+ [
+ ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#4472C4")),
+ ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
+ ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
+ ("FONTSIZE", (0, 0), (-1, 0), 10),
+ ("BOTTOMPADDING", (0, 0), (-1, 0), 8),
+ ("TOPPADDING", (0, 0), (-1, 0), 8),
+ ("BACKGROUND", (0, 1), (-1, -1), colors.HexColor("#D9E2F3")),
+ (
+ "ROWBACKGROUNDS",
+ (0, 1),
+ (-1, -1),
+ [colors.HexColor("#D9E2F3"), colors.white],
+ ),
+ ("FONTSIZE", (0, 1), (-1, -1), 9),
+ ("GRID", (0, 0), (-1, -1), 0.5, colors.grey),
+ ("ALIGN", (0, 0), (-1, -1), "CENTER"),
+ ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
+ ("TOPPADDING", (0, 1), (-1, -1), 5),
+ ("BOTTOMPADDING", (0, 1), (-1, -1), 5),
+ ]
+ )
story = []
@@ -364,23 +452,53 @@ def make_table_style():
# --- Table 1: Product Inventory ---
story.append(Paragraph("Table 1: Product Inventory", heading_style))
- story.append(Paragraph(
- "Current inventory levels across warehouse locations as of Q4 2024.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "Current inventory levels across warehouse locations as of Q4 2024.",
+ body_style,
+ )
+ )
inventory_data = [
- ["Product ID", "Product Name", "Category", "Quantity", "Unit Price", "Warehouse"],
+ [
+ "Product ID",
+ "Product Name",
+ "Category",
+ "Quantity",
+ "Unit Price",
+ "Warehouse",
+ ],
["PRD-001", "Wireless Mouse", "Electronics", "1,250", "$29.99", "Warehouse A"],
- ["PRD-002", "Mechanical Keyboard", "Electronics", "843", "$79.99", "Warehouse A"],
+ [
+ "PRD-002",
+ "Mechanical Keyboard",
+ "Electronics",
+ "843",
+ "$79.99",
+ "Warehouse A",
+ ],
["PRD-003", "USB-C Hub", "Accessories", "2,100", "$44.99", "Warehouse B"],
["PRD-004", "Monitor Stand", "Furniture", "567", "$54.99", "Warehouse C"],
["PRD-005", "Webcam HD", "Electronics", "1,890", "$64.99", "Warehouse A"],
["PRD-006", "Desk Lamp LED", "Lighting", "3,210", "$34.99", "Warehouse B"],
["PRD-007", "Ergonomic Chair", "Furniture", "245", "$349.99", "Warehouse C"],
["PRD-008", "Laptop Stand", "Accessories", "1,670", "$39.99", "Warehouse B"],
- ["PRD-009", "Noise-Canceling Headphones", "Electronics", "920", "$149.99", "Warehouse A"],
- ["PRD-010", "Power Strip Surge Protector", "Accessories", "4,500", "$24.99", "Warehouse C"],
+ [
+ "PRD-009",
+ "Noise-Canceling Headphones",
+ "Electronics",
+ "920",
+ "$149.99",
+ "Warehouse A",
+ ],
+ [
+ "PRD-010",
+ "Power Strip Surge Protector",
+ "Accessories",
+ "4,500",
+ "$24.99",
+ "Warehouse C",
+ ],
]
t1 = Table(inventory_data, repeatRows=1)
t1.setStyle(make_table_style())
@@ -389,10 +507,12 @@ def make_table_style():
# --- Table 2: Quarterly Revenue ---
story.append(Paragraph("Table 2: Quarterly Revenue by Region (in thousands USD)", heading_style))
- story.append(Paragraph(
- "Revenue figures for fiscal year 2024 across all operating regions.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "Revenue figures for fiscal year 2024 across all operating regions.",
+ body_style,
+ )
+ )
revenue_data = [
["Region", "Q1 2024", "Q2 2024", "Q3 2024", "Q4 2024", "Annual Total"],
@@ -411,23 +531,95 @@ def make_table_style():
# --- Table 3: Employee Records ---
story.append(Paragraph("Table 3: Employee Directory", heading_style))
- story.append(Paragraph(
- "Key personnel across departments with their roles and contact information.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "Key personnel across departments with their roles and contact information.",
+ body_style,
+ )
+ )
employee_data = [
["Emp ID", "Name", "Department", "Title", "Start Date", "Email"],
- ["E-101", "Sarah Johnson", "Engineering", "Senior Developer", "2019-03-15", "s.johnson@example.com"],
- ["E-102", "Michael Chen", "Engineering", "Tech Lead", "2018-07-22", "m.chen@example.com"],
- ["E-103", "Emily Rodriguez", "Marketing", "Marketing Manager", "2020-01-10", "e.rodriguez@example.com"],
- ["E-104", "David Kim", "Data Science", "ML Engineer", "2021-05-18", "d.kim@example.com"],
- ["E-105", "Jessica Patel", "Product", "Product Manager", "2019-11-03", "j.patel@example.com"],
- ["E-106", "Robert Taylor", "Engineering", "DevOps Engineer", "2020-08-25", "r.taylor@example.com"],
- ["E-107", "Amanda White", "HR", "HR Director", "2017-02-14", "a.white@example.com"],
- ["E-108", "James Wilson", "Finance", "Financial Analyst", "2022-04-01", "j.wilson@example.com"],
- ["E-109", "Lisa Brown", "Data Science", "Data Analyst", "2021-09-12", "l.brown@example.com"],
- ["E-110", "Thomas Lee", "Engineering", "Frontend Developer", "2023-01-30", "t.lee@example.com"],
+ [
+ "E-101",
+ "Sarah Johnson",
+ "Engineering",
+ "Senior Developer",
+ "2019-03-15",
+ "s.johnson@example.com",
+ ],
+ [
+ "E-102",
+ "Michael Chen",
+ "Engineering",
+ "Tech Lead",
+ "2018-07-22",
+ "m.chen@example.com",
+ ],
+ [
+ "E-103",
+ "Emily Rodriguez",
+ "Marketing",
+ "Marketing Manager",
+ "2020-01-10",
+ "e.rodriguez@example.com",
+ ],
+ [
+ "E-104",
+ "David Kim",
+ "Data Science",
+ "ML Engineer",
+ "2021-05-18",
+ "d.kim@example.com",
+ ],
+ [
+ "E-105",
+ "Jessica Patel",
+ "Product",
+ "Product Manager",
+ "2019-11-03",
+ "j.patel@example.com",
+ ],
+ [
+ "E-106",
+ "Robert Taylor",
+ "Engineering",
+ "DevOps Engineer",
+ "2020-08-25",
+ "r.taylor@example.com",
+ ],
+ [
+ "E-107",
+ "Amanda White",
+ "HR",
+ "HR Director",
+ "2017-02-14",
+ "a.white@example.com",
+ ],
+ [
+ "E-108",
+ "James Wilson",
+ "Finance",
+ "Financial Analyst",
+ "2022-04-01",
+ "j.wilson@example.com",
+ ],
+ [
+ "E-109",
+ "Lisa Brown",
+ "Data Science",
+ "Data Analyst",
+ "2021-09-12",
+ "l.brown@example.com",
+ ],
+ [
+ "E-110",
+ "Thomas Lee",
+ "Engineering",
+ "Frontend Developer",
+ "2023-01-30",
+ "t.lee@example.com",
+ ],
]
t3 = Table(employee_data, repeatRows=1)
t3.setStyle(make_table_style())
@@ -436,10 +628,12 @@ def make_table_style():
# --- Table 4: Project Status ---
story.append(Paragraph("Table 4: Project Status Overview", heading_style))
- story.append(Paragraph(
- "Active and upcoming projects with timeline and budget information.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "Active and upcoming projects with timeline and budget information.",
+ body_style,
+ )
+ )
project_data = [
["Project", "Lead", "Status", "Start", "Deadline", "Budget"],
@@ -462,8 +656,10 @@ def make_table_style():
# 3) multi_column.pdf
# ---------------------------------------------------------------------------
+
class ColumnBreak(Flowable):
"""Force a break to the next column / frame."""
+
def __init__(self):
super().__init__()
self.width = 0
@@ -491,201 +687,262 @@ def generate_multi_column():
right_frame = Frame(margin + col_w + gutter, margin, col_w, frame_h, id="right")
two_col_template = PageTemplate(
- id="TwoCol", frames=[left_frame, right_frame],
+ id="TwoCol",
+ frames=[left_frame, right_frame],
)
- doc = SimpleDocTemplate(str(path), pagesize=letter,
- topMargin=margin, bottomMargin=margin,
- leftMargin=margin, rightMargin=margin)
+ doc = SimpleDocTemplate(
+ str(path),
+ pagesize=letter,
+ topMargin=margin,
+ bottomMargin=margin,
+ leftMargin=margin,
+ rightMargin=margin,
+ )
doc.addPageTemplates([two_col_template])
styles = getSampleStyleSheet()
title_style = ParagraphStyle(
- "ColTitle", parent=styles["Title"], fontSize=16,
- spaceAfter=8, alignment=TA_CENTER,
+ "ColTitle",
+ parent=styles["Title"],
+ fontSize=16,
+ spaceAfter=8,
+ alignment=TA_CENTER,
)
author_style = ParagraphStyle(
- "ColAuthor", parent=styles["Normal"], fontSize=10,
- spaceAfter=12, alignment=TA_CENTER, textColor=colors.grey,
+ "ColAuthor",
+ parent=styles["Normal"],
+ fontSize=10,
+ spaceAfter=12,
+ alignment=TA_CENTER,
+ textColor=colors.grey,
)
heading_style = ParagraphStyle(
- "ColHeading", parent=styles["Heading2"], fontSize=12,
- spaceBefore=12, spaceAfter=6,
+ "ColHeading",
+ parent=styles["Heading2"],
+ fontSize=12,
+ spaceBefore=12,
+ spaceAfter=6,
)
body_style = ParagraphStyle(
- "ColBody", parent=styles["BodyText"], fontSize=9,
- leading=13, spaceAfter=6, alignment=TA_JUSTIFY,
+ "ColBody",
+ parent=styles["BodyText"],
+ fontSize=9,
+ leading=13,
+ spaceAfter=6,
+ alignment=TA_JUSTIFY,
)
abstract_style = ParagraphStyle(
- "ColAbstract", parent=styles["BodyText"], fontSize=9,
- leading=13, spaceAfter=6, alignment=TA_JUSTIFY,
- leftIndent=12, rightIndent=12,
+ "ColAbstract",
+ parent=styles["BodyText"],
+ fontSize=9,
+ leading=13,
+ spaceAfter=6,
+ alignment=TA_JUSTIFY,
+ leftIndent=12,
+ rightIndent=12,
)
story = []
# Title and authors (span both columns via the left frame, will reflow)
- story.append(Paragraph(
- "Advances in Document Understanding for Retrieval-Augmented Generation Systems",
- title_style,
- ))
- story.append(Paragraph(
- "J. Smith, A. Kumar, L. Zhang — Institute of AI Research, 2024",
- author_style,
- ))
+ story.append(
+ Paragraph(
+ "Advances in Document Understanding for Retrieval-Augmented Generation Systems",
+ title_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "J. Smith, A. Kumar, L. Zhang — Institute of AI Research, 2024",
+ author_style,
+ )
+ )
story.append(Spacer(1, 6))
# Abstract
story.append(Paragraph("Abstract", heading_style))
- story.append(Paragraph(
- "This paper surveys recent advances in document understanding techniques that "
- "underpin modern Retrieval-Augmented Generation (RAG) pipelines. We examine "
- "methods for parsing unstructured documents including PDFs, web pages, and "
- "scanned images, and evaluate their effectiveness for downstream retrieval "
- "and generation tasks. Our analysis covers text extraction, layout analysis, "
- "table recognition, and multimodal approaches that combine vision and language "
- "models. We find that hybrid methods combining rule-based extraction with "
- "learned representations achieve the best results across diverse document types.",
- abstract_style,
- ))
+ story.append(
+ Paragraph(
+ "This paper surveys recent advances in document understanding techniques that "
+ "underpin modern Retrieval-Augmented Generation (RAG) pipelines. We examine "
+ "methods for parsing unstructured documents including PDFs, web pages, and "
+ "scanned images, and evaluate their effectiveness for downstream retrieval "
+ "and generation tasks. Our analysis covers text extraction, layout analysis, "
+ "table recognition, and multimodal approaches that combine vision and language "
+ "models. We find that hybrid methods combining rule-based extraction with "
+ "learned representations achieve the best results across diverse document types.",
+ abstract_style,
+ )
+ )
# Section 1
story.append(Paragraph("1. Introduction", heading_style))
- story.append(Paragraph(
- "The explosion of digital documents in enterprise and academic settings has "
- "created an urgent need for robust document understanding systems. Organizations "
- "store vast quantities of knowledge in unstructured formats such as PDFs, Word "
- "documents, presentations, and scanned images. Unlocking this knowledge for "
- "AI-powered applications, particularly Retrieval-Augmented Generation (RAG), "
- "requires sophisticated parsing and extraction pipelines.",
- body_style,
- ))
- story.append(Paragraph(
- "RAG systems combine retrieval from a document corpus with generative language "
- "models to produce accurate, grounded responses. The quality of the retrieval "
- "step depends critically on how well source documents have been parsed, chunked, "
- "and indexed. Poor extraction leads to noisy passages that degrade both retrieval "
- "precision and generation quality.",
- body_style,
- ))
- story.append(Paragraph(
- "In this paper, we provide a comprehensive analysis of document understanding "
- "techniques relevant to RAG pipelines. We focus on PDF documents, which remain "
- "the most prevalent format for sharing structured knowledge in business and "
- "academia. We evaluate multiple extraction libraries and approaches, measuring "
- "their fidelity in preserving text content, layout structure, and tabular data.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "The explosion of digital documents in enterprise and academic settings has "
+ "created an urgent need for robust document understanding systems. Organizations "
+ "store vast quantities of knowledge in unstructured formats such as PDFs, Word "
+ "documents, presentations, and scanned images. Unlocking this knowledge for "
+ "AI-powered applications, particularly Retrieval-Augmented Generation (RAG), "
+ "requires sophisticated parsing and extraction pipelines.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "RAG systems combine retrieval from a document corpus with generative language "
+ "models to produce accurate, grounded responses. The quality of the retrieval "
+ "step depends critically on how well source documents have been parsed, chunked, "
+ "and indexed. Poor extraction leads to noisy passages that degrade both retrieval "
+ "precision and generation quality.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "In this paper, we provide a comprehensive analysis of document understanding "
+ "techniques relevant to RAG pipelines. We focus on PDF documents, which remain "
+ "the most prevalent format for sharing structured knowledge in business and "
+ "academia. We evaluate multiple extraction libraries and approaches, measuring "
+ "their fidelity in preserving text content, layout structure, and tabular data.",
+ body_style,
+ )
+ )
# Section 2
story.append(Paragraph("2. Related Work", heading_style))
- story.append(Paragraph(
- "Document understanding has a rich history in the document analysis and "
- "recognition community. Early systems relied on rule-based approaches with "
- "hand-crafted heuristics for layout segmentation. The introduction of deep "
- "learning brought significant improvements, with models like LayoutLM and "
- "DocFormer learning joint representations of text and layout.",
- body_style,
- ))
- story.append(Paragraph(
- "Table extraction has received particular attention due to the structured "
- "nature of tabular data. Methods range from heuristic line detection to "
- "deep learning approaches such as TableNet and DETR-based table detectors. "
- "Recent multimodal models like Donut and Nougat can parse documents end-to-end "
- "without relying on OCR as an intermediate step.",
- body_style,
- ))
- story.append(Paragraph(
- "The RAG paradigm was introduced by Lewis et al. (2020) and has since become "
- "a standard approach for knowledge-intensive NLP tasks. Subsequent work has "
- "explored various aspects of RAG including retrieval strategies, chunk size "
- "optimization, and re-ranking methods. However, relatively little attention "
- "has been paid to the document parsing stage that precedes retrieval.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "Document understanding has a rich history in the document analysis and "
+ "recognition community. Early systems relied on rule-based approaches with "
+ "hand-crafted heuristics for layout segmentation. The introduction of deep "
+ "learning brought significant improvements, with models like LayoutLM and "
+ "DocFormer learning joint representations of text and layout.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "Table extraction has received particular attention due to the structured "
+ "nature of tabular data. Methods range from heuristic line detection to "
+ "deep learning approaches such as TableNet and DETR-based table detectors. "
+ "Recent multimodal models like Donut and Nougat can parse documents end-to-end "
+ "without relying on OCR as an intermediate step.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "The RAG paradigm was introduced by Lewis et al. (2020) and has since become "
+ "a standard approach for knowledge-intensive NLP tasks. Subsequent work has "
+ "explored various aspects of RAG including retrieval strategies, chunk size "
+ "optimization, and re-ranking methods. However, relatively little attention "
+ "has been paid to the document parsing stage that precedes retrieval.",
+ body_style,
+ )
+ )
# Section 3
story.append(Paragraph("3. Methodology", heading_style))
- story.append(Paragraph(
- "We evaluate five PDF extraction approaches: (1) PyPDF for basic text extraction, "
- "(2) pdfplumber for layout-aware extraction and table detection, (3) PyMuPDF for "
- "high-performance extraction with position information, (4) Tesseract OCR for "
- "scanned documents, and (5) a hybrid pipeline combining multiple methods.",
- body_style,
- ))
- story.append(Paragraph(
- "Our evaluation corpus consists of 500 documents spanning four categories: "
- "academic papers, technical reports, financial statements, and product manuals. "
- "Each document was manually annotated with ground-truth text, table structures, "
- "and layout regions. We measure extraction quality using character error rate "
- "(CER), table structure recognition (TSR) accuracy, and reading order accuracy.",
- body_style,
- ))
- story.append(Paragraph(
- "For the RAG evaluation, we chunk extracted text using four strategies: "
- "fixed-size character windows, sentence-based splitting, recursive splitting "
- "with semantic boundaries, and heading-aware chunking. We then embed chunks "
- "using a sentence transformer model and evaluate retrieval quality on a set "
- "of 200 question-answer pairs derived from the document corpus.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "We evaluate five PDF extraction approaches: (1) PyPDF for basic text extraction, "
+ "(2) pdfplumber for layout-aware extraction and table detection, (3) PyMuPDF for "
+ "high-performance extraction with position information, (4) Tesseract OCR for "
+ "scanned documents, and (5) a hybrid pipeline combining multiple methods.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "Our evaluation corpus consists of 500 documents spanning four categories: "
+ "academic papers, technical reports, financial statements, and product manuals. "
+ "Each document was manually annotated with ground-truth text, table structures, "
+ "and layout regions. We measure extraction quality using character error rate "
+ "(CER), table structure recognition (TSR) accuracy, and reading order accuracy.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "For the RAG evaluation, we chunk extracted text using four strategies: "
+ "fixed-size character windows, sentence-based splitting, recursive splitting "
+ "with semantic boundaries, and heading-aware chunking. We then embed chunks "
+ "using a sentence transformer model and evaluate retrieval quality on a set "
+ "of 200 question-answer pairs derived from the document corpus.",
+ body_style,
+ )
+ )
# Section 4
story.append(Paragraph("4. Results", heading_style))
- story.append(Paragraph(
- "Our experiments reveal significant differences between extraction methods "
- "across document categories. PyMuPDF consistently achieves the lowest character "
- "error rate, averaging 2.3% across all documents. pdfplumber provides the best "
- "table extraction with 87% TSR accuracy, compared to 45% for PyPDF and 76% for "
- "PyMuPDF. Tesseract OCR, while essential for scanned documents, introduces "
- "higher error rates of 5-8% on born-digital PDFs.",
- body_style,
- ))
- story.append(Paragraph(
- "The hybrid pipeline, which selects the best extraction method based on document "
- "characteristics, achieves the highest overall quality with 1.8% CER and 89% TSR "
- "accuracy. Documents are first classified as born-digital or scanned using image "
- "analysis, and then processed accordingly.",
- body_style,
- ))
- story.append(Paragraph(
- "For RAG retrieval quality, heading-aware chunking combined with PyMuPDF extraction "
- "yields the best results, with a mean reciprocal rank (MRR) of 0.82 and recall@5 "
- "of 0.91. Fixed-size chunking performs worst at MRR 0.68, while recursive splitting "
- "achieves MRR 0.79. These results underscore the importance of respecting document "
- "structure during chunking.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "Our experiments reveal significant differences between extraction methods "
+ "across document categories. PyMuPDF consistently achieves the lowest character "
+ "error rate, averaging 2.3% across all documents. pdfplumber provides the best "
+ "table extraction with 87% TSR accuracy, compared to 45% for PyPDF and 76% for "
+ "PyMuPDF. Tesseract OCR, while essential for scanned documents, introduces "
+ "higher error rates of 5-8% on born-digital PDFs.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "The hybrid pipeline, which selects the best extraction method based on document "
+ "characteristics, achieves the highest overall quality with 1.8% CER and 89% TSR "
+ "accuracy. Documents are first classified as born-digital or scanned using image "
+ "analysis, and then processed accordingly.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "For RAG retrieval quality, heading-aware chunking combined with PyMuPDF extraction "
+ "yields the best results, with a mean reciprocal rank (MRR) of 0.82 and recall@5 "
+ "of 0.91. Fixed-size chunking performs worst at MRR 0.68, while recursive splitting "
+ "achieves MRR 0.79. These results underscore the importance of respecting document "
+ "structure during chunking.",
+ body_style,
+ )
+ )
# Section 5
story.append(Paragraph("5. Discussion", heading_style))
- story.append(Paragraph(
- "Our findings highlight several key insights for practitioners building RAG systems. "
- "First, no single extraction method dominates across all document types, suggesting "
- "that adaptive pipelines are necessary for production systems. Second, table "
- "extraction remains a significant challenge, and converting tables to natural "
- "language descriptions substantially improves retrieval performance.",
- body_style,
- ))
- story.append(Paragraph(
- "Third, the choice of chunking strategy has a measurable impact on RAG quality, "
- "with structure-aware approaches outperforming naive splitting. Fourth, multi-column "
- "layouts require special handling to preserve reading order; without layout analysis, "
- "text from different columns may be interleaved, producing incoherent passages.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "Our findings highlight several key insights for practitioners building RAG systems. "
+ "First, no single extraction method dominates across all document types, suggesting "
+ "that adaptive pipelines are necessary for production systems. Second, table "
+ "extraction remains a significant challenge, and converting tables to natural "
+ "language descriptions substantially improves retrieval performance.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "Third, the choice of chunking strategy has a measurable impact on RAG quality, "
+ "with structure-aware approaches outperforming naive splitting. Fourth, multi-column "
+ "layouts require special handling to preserve reading order; without layout analysis, "
+ "text from different columns may be interleaved, producing incoherent passages.",
+ body_style,
+ )
+ )
# Section 6
story.append(Paragraph("6. Conclusion", heading_style))
- story.append(Paragraph(
- "We have presented a comprehensive evaluation of document understanding techniques "
- "for RAG systems. Our results demonstrate that careful attention to the parsing "
- "stage significantly impacts downstream retrieval and generation quality. We "
- "recommend a hybrid approach that combines multiple extraction methods with "
- "structure-aware chunking for optimal results. Future work should explore "
- "end-to-end learned parsing systems and their integration with RAG pipelines.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "We have presented a comprehensive evaluation of document understanding techniques "
+ "for RAG systems. Our results demonstrate that careful attention to the parsing "
+ "stage significantly impacts downstream retrieval and generation quality. We "
+ "recommend a hybrid approach that combines multiple extraction methods with "
+ "structure-aware chunking for optimal results. Future work should explore "
+ "end-to-end learned parsing systems and their integration with RAG pipelines.",
+ body_style,
+ )
+ )
# References
story.append(Paragraph("References", heading_style))
@@ -697,9 +954,18 @@ def generate_multi_column():
"[5] Blecher, L. et al. (2023). Nougat: Neural Optical Understanding for Academic Documents. arXiv.",
]
for ref in refs:
- story.append(Paragraph(ref, ParagraphStyle(
- "Ref", parent=body_style, fontSize=8, leading=11, spaceAfter=3,
- )))
+ story.append(
+ Paragraph(
+ ref,
+ ParagraphStyle(
+ "Ref",
+ parent=body_style,
+ fontSize=8,
+ leading=11,
+ spaceAfter=3,
+ ),
+ )
+ )
doc.build(story)
print(f" Created: {path}")
@@ -709,51 +975,82 @@ def generate_multi_column():
# 4) mixed_content.pdf
# ---------------------------------------------------------------------------
+
def generate_mixed_content():
"""Create a document mixing text, tables, bullet points, and headers."""
path = SAMPLE_DIR / "mixed_content.pdf"
- doc = SimpleDocTemplate(str(path), pagesize=letter,
- topMargin=72, bottomMargin=72,
- leftMargin=72, rightMargin=72)
+ doc = SimpleDocTemplate(
+ str(path),
+ pagesize=letter,
+ topMargin=72,
+ bottomMargin=72,
+ leftMargin=72,
+ rightMargin=72,
+ )
styles = getSampleStyleSheet()
title_style = ParagraphStyle(
- "MixTitle", parent=styles["Title"], fontSize=22,
- spaceAfter=16, alignment=TA_CENTER,
+ "MixTitle",
+ parent=styles["Title"],
+ fontSize=22,
+ spaceAfter=16,
+ alignment=TA_CENTER,
)
heading_style = ParagraphStyle(
- "MixHeading", parent=styles["Heading1"], fontSize=15,
- spaceBefore=18, spaceAfter=10,
+ "MixHeading",
+ parent=styles["Heading1"],
+ fontSize=15,
+ spaceBefore=18,
+ spaceAfter=10,
)
subheading_style = ParagraphStyle(
- "MixSubheading", parent=styles["Heading2"], fontSize=12,
- spaceBefore=12, spaceAfter=6,
+ "MixSubheading",
+ parent=styles["Heading2"],
+ fontSize=12,
+ spaceBefore=12,
+ spaceAfter=6,
)
body_style = ParagraphStyle(
- "MixBody", parent=styles["BodyText"], fontSize=11,
- leading=16, spaceAfter=10, alignment=TA_JUSTIFY,
+ "MixBody",
+ parent=styles["BodyText"],
+ fontSize=11,
+ leading=16,
+ spaceAfter=10,
+ alignment=TA_JUSTIFY,
)
bullet_style = ParagraphStyle(
- "MixBullet", parent=styles["BodyText"], fontSize=11,
- leading=16, spaceAfter=4, leftIndent=36,
- bulletIndent=18, bulletFontSize=11,
+ "MixBullet",
+ parent=styles["BodyText"],
+ fontSize=11,
+ leading=16,
+ spaceAfter=4,
+ leftIndent=36,
+ bulletIndent=18,
+ bulletFontSize=11,
)
def make_table_style():
- return TableStyle([
- ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#2E75B6")),
- ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
- ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
- ("FONTSIZE", (0, 0), (-1, 0), 10),
- ("BOTTOMPADDING", (0, 0), (-1, 0), 8),
- ("TOPPADDING", (0, 0), (-1, 0), 8),
- ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.HexColor("#DAEEF3"), colors.white]),
- ("GRID", (0, 0), (-1, -1), 0.5, colors.grey),
- ("ALIGN", (0, 0), (-1, -1), "CENTER"),
- ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
- ("TOPPADDING", (0, 1), (-1, -1), 5),
- ("BOTTOMPADDING", (0, 1), (-1, -1), 5),
- ])
+ return TableStyle(
+ [
+ ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#2E75B6")),
+ ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
+ ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
+ ("FONTSIZE", (0, 0), (-1, 0), 10),
+ ("BOTTOMPADDING", (0, 0), (-1, 0), 8),
+ ("TOPPADDING", (0, 0), (-1, 0), 8),
+ (
+ "ROWBACKGROUNDS",
+ (0, 1),
+ (-1, -1),
+ [colors.HexColor("#DAEEF3"), colors.white],
+ ),
+ ("GRID", (0, 0), (-1, -1), 0.5, colors.grey),
+ ("ALIGN", (0, 0), (-1, -1), "CENTER"),
+ ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
+ ("TOPPADDING", (0, 1), (-1, -1), 5),
+ ("BOTTOMPADDING", (0, 1), (-1, -1), 5),
+ ]
+ )
story = []
@@ -763,14 +1060,16 @@ def make_table_style():
# --- Introduction ---
story.append(Paragraph("Executive Summary", heading_style))
- story.append(Paragraph(
- "This report provides an analysis of the most significant technology trends "
- "shaping the industry in 2024. From the rapid adoption of generative AI to "
- "advances in quantum computing, these trends are reshaping how businesses "
- "operate and compete. Understanding these trends is essential for strategic "
- "planning and technology investment decisions.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "This report provides an analysis of the most significant technology trends "
+ "shaping the industry in 2024. From the rapid adoption of generative AI to "
+ "advances in quantum computing, these trends are reshaping how businesses "
+ "operate and compete. Understanding these trends is essential for strategic "
+ "planning and technology investment decisions.",
+ body_style,
+ )
+ )
# --- Key Findings (bullet points) ---
story.append(Paragraph("Key Findings", heading_style))
@@ -789,23 +1088,27 @@ def make_table_style():
# --- Section with text ---
story.append(Paragraph("Generative AI in the Enterprise", heading_style))
- story.append(Paragraph(
- "Generative AI has emerged as the defining technology of 2024. Large language "
- "models (LLMs) are being deployed across industries for tasks ranging from "
- "customer support automation to code generation and content creation. "
- "Retrieval-Augmented Generation (RAG) has become the preferred architecture "
- "for enterprise AI applications, combining the fluency of generative models "
- "with the accuracy of information retrieval.",
- body_style,
- ))
- story.append(Paragraph(
- "Organizations are investing heavily in data infrastructure to support their "
- "AI initiatives. Vector databases, embedding pipelines, and document processing "
- "systems form the backbone of modern RAG deployments. The ability to accurately "
- "extract and structure information from unstructured documents is a critical "
- "capability that directly impacts AI application quality.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "Generative AI has emerged as the defining technology of 2024. Large language "
+ "models (LLMs) are being deployed across industries for tasks ranging from "
+ "customer support automation to code generation and content creation. "
+ "Retrieval-Augmented Generation (RAG) has become the preferred architecture "
+ "for enterprise AI applications, combining the fluency of generative models "
+ "with the accuracy of information retrieval.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "Organizations are investing heavily in data infrastructure to support their "
+ "AI initiatives. Vector databases, embedding pipelines, and document processing "
+ "systems form the backbone of modern RAG deployments. The ability to accurately "
+ "extract and structure information from unstructured documents is a critical "
+ "capability that directly impacts AI application quality.",
+ body_style,
+ )
+ )
# --- Table: AI Adoption ---
story.append(Paragraph("AI Adoption by Industry", subheading_style))
@@ -825,14 +1128,16 @@ def make_table_style():
# --- Another text section ---
story.append(Paragraph("Cloud and Infrastructure Trends", heading_style))
- story.append(Paragraph(
- "The cloud computing landscape continues to evolve rapidly. Multi-cloud and "
- "hybrid-cloud strategies have become the norm, with organizations distributing "
- "workloads across multiple providers to optimize cost, performance, and "
- "resilience. Kubernetes has solidified its position as the de facto standard "
- "for container orchestration.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "The cloud computing landscape continues to evolve rapidly. Multi-cloud and "
+ "hybrid-cloud strategies have become the norm, with organizations distributing "
+ "workloads across multiple providers to optimize cost, performance, and "
+ "resilience. Kubernetes has solidified its position as the de facto standard "
+ "for container orchestration.",
+ body_style,
+ )
+ )
# --- Bullet points for cloud ---
story.append(Paragraph("Top Cloud Priorities for 2024", subheading_style))
@@ -863,22 +1168,26 @@ def make_table_style():
# --- Cybersecurity section ---
story.append(Paragraph("Cybersecurity Landscape", heading_style))
- story.append(Paragraph(
- "The cybersecurity threat landscape has become increasingly complex. "
- "Ransomware attacks continue to rise in frequency and sophistication, while "
- "AI-powered threats present new challenges for defense teams. Zero-trust "
- "architecture has moved from concept to implementation, with organizations "
- "adopting identity-centric security models that verify every access request.",
- body_style,
- ))
- story.append(Paragraph(
- "Supply chain security has emerged as a critical concern following high-profile "
- "incidents. Software Bill of Materials (SBOM) requirements are becoming "
- "standard, and organizations are implementing stricter controls over "
- "third-party dependencies. AI is being used both offensively and defensively, "
- "creating an arms race between attackers and defenders.",
- body_style,
- ))
+ story.append(
+ Paragraph(
+ "The cybersecurity threat landscape has become increasingly complex. "
+ "Ransomware attacks continue to rise in frequency and sophistication, while "
+ "AI-powered threats present new challenges for defense teams. Zero-trust "
+ "architecture has moved from concept to implementation, with organizations "
+ "adopting identity-centric security models that verify every access request.",
+ body_style,
+ )
+ )
+ story.append(
+ Paragraph(
+ "Supply chain security has emerged as a critical concern following high-profile "
+ "incidents. Software Bill of Materials (SBOM) requirements are becoming "
+ "standard, and organizations are implementing stricter controls over "
+ "third-party dependencies. AI is being used both offensively and defensively, "
+ "creating an arms race between attackers and defenders.",
+ body_style,
+ )
+ )
# --- Conclusion ---
story.append(Paragraph("Recommendations", heading_style))
diff --git a/unstructured_documents/02_docx/01_python_docx_extraction.py b/unstructured_documents/02_docx/01_python_docx_extraction.py
index d5c92c6..e45aba4 100644
--- a/unstructured_documents/02_docx/01_python_docx_extraction.py
+++ b/unstructured_documents/02_docx/01_python_docx_extraction.py
@@ -35,6 +35,7 @@
# 1. Extract paragraphs with style information
# ===================================================================
+
def extract_paragraphs(doc_path: Path) -> list[dict]:
"""
Extract every paragraph together with its style name and full text.
@@ -50,20 +51,22 @@ def extract_paragraphs(doc_path: Path) -> list[dict]:
text = para.text.strip()
if not text:
continue
- paragraphs.append({
- "style": para.style.name,
- "text": text,
- # Run-level detail: capture bold/italic spans
- "runs": [
- {
- "text": run.text,
- "bold": run.bold,
- "italic": run.italic,
- }
- for run in para.runs
- if run.text.strip()
- ],
- })
+ paragraphs.append(
+ {
+ "style": para.style.name,
+ "text": text,
+ # Run-level detail: capture bold/italic spans
+ "runs": [
+ {
+ "text": run.text,
+ "bold": run.bold,
+ "italic": run.italic,
+ }
+ for run in para.runs
+ if run.text.strip()
+ ],
+ }
+ )
return paragraphs
@@ -71,6 +74,7 @@ def extract_paragraphs(doc_path: Path) -> list[dict]:
# 2. Extract tables
# ===================================================================
+
def extract_tables(doc_path: Path) -> list[list[list[str]]]:
"""
Extract all tables in the document.
@@ -93,6 +97,7 @@ def extract_tables(doc_path: Path) -> list[list[list[str]]]:
# 3. Build a heading-based document hierarchy
# ===================================================================
+
def build_heading_hierarchy(doc_path: Path) -> list[dict]:
"""
Walk through the document and group content under headings.
@@ -131,11 +136,13 @@ def build_heading_hierarchy(doc_path: Path) -> list[dict]:
"body_parts": [],
}
else:
- current_section["body_parts"].append({
- "type": "paragraph",
- "style": style_name,
- "text": text,
- })
+ current_section["body_parts"].append(
+ {
+ "type": "paragraph",
+ "style": style_name,
+ "text": text,
+ }
+ )
# Don't forget the last section
if current_section["body_parts"]:
@@ -148,6 +155,7 @@ def build_heading_hierarchy(doc_path: Path) -> list[dict]:
# 4. Convert to markdown text (for heading-based chunking)
# ===================================================================
+
def docx_to_markdown(doc_path: Path) -> str:
"""
Convert the DOCX to a simple markdown string so we can reuse
@@ -189,6 +197,7 @@ def docx_to_markdown(doc_path: Path) -> str:
# Main demonstration
# ===================================================================
+
def main() -> None:
print("=" * 70)
print("DOCX Extraction with python-docx")
@@ -214,7 +223,7 @@ def main() -> None:
flags.append("BOLD")
if r["italic"]:
flags.append("ITALIC")
- print(f" {'':20s} ^ {', '.join(flags)}: \"{r['text'][:60]}\"")
+ print(f' {"":20s} ^ {", ".join(flags)}: "{r["text"][:60]}"')
if len(paragraphs) > 8:
print(f"\n ... and {len(paragraphs) - 8} more paragraphs")
@@ -245,8 +254,10 @@ def main() -> None:
for sec in sections:
indent = " " * sec["heading_level"]
n_parts = len(sec["body_parts"])
- print(f" {indent}H{sec['heading_level']}: {sec['heading_text']} "
- f"({n_parts} body element{'s' if n_parts != 1 else ''})")
+ print(
+ f" {indent}H{sec['heading_level']}: {sec['heading_text']} "
+ f"({n_parts} body element{'s' if n_parts != 1 else ''})"
+ )
# ------------------------------------------------------------------
# 4. Chunking by headings (via markdown conversion)
diff --git a/unstructured_documents/02_docx/02_mammoth_extraction.py b/unstructured_documents/02_docx/02_mammoth_extraction.py
index f9d31c9..6496505 100644
--- a/unstructured_documents/02_docx/02_mammoth_extraction.py
+++ b/unstructured_documents/02_docx/02_mammoth_extraction.py
@@ -37,6 +37,7 @@
# 1. Convert DOCX to HTML
# ===================================================================
+
def docx_to_html(doc_path: Path) -> tuple[str, list[str]]:
"""
Convert the DOCX file to clean, semantic HTML.
@@ -60,6 +61,7 @@ def docx_to_html(doc_path: Path) -> tuple[str, list[str]]:
# 2. Convert DOCX to markdown
# ===================================================================
+
def docx_to_markdown(doc_path: Path) -> tuple[str, list[str]]:
"""
Convert the DOCX file to markdown using mammoth's built-in converter.
@@ -79,6 +81,7 @@ def docx_to_markdown(doc_path: Path) -> tuple[str, list[str]]:
# 3. Heading-aware chunking on markdown output
# ===================================================================
+
def chunk_markdown_by_headings(markdown_text: str) -> list[dict]:
"""
Use the shared heading-aware chunker on mammoth's markdown output.
@@ -93,6 +96,7 @@ def chunk_markdown_by_headings(markdown_text: str) -> list[dict]:
# Main demonstration
# ===================================================================
+
def main() -> None:
print("=" * 70)
print("DOCX Extraction with mammoth")
@@ -112,7 +116,7 @@ def main() -> None:
else:
print("No conversion warnings.")
- print(f"\nHTML output (first 800 chars):")
+ print("\nHTML output (first 800 chars):")
print("-" * 50)
print(html[:800])
print("-" * 50)
@@ -131,7 +135,7 @@ def main() -> None:
else:
print("No conversion warnings.")
- print(f"\nMarkdown output (first 800 chars):")
+ print("\nMarkdown output (first 800 chars):")
print("-" * 50)
print(markdown[:800])
print("-" * 50)
@@ -148,17 +152,13 @@ def main() -> None:
# Count headings in each format
html_headings = html.count("
") + html.count("") + html.count("")
- md_headings = sum(
- 1 for line in markdown.splitlines()
- if line.strip().startswith("#")
- )
+ md_headings = sum(1 for line in markdown.splitlines() if line.strip().startswith("#"))
print(f"{'Heading elements':<30s} {html_headings:>12d} {md_headings:>12d}")
# Count list items
html_list_items = html.count("
")
md_list_items = sum(
- 1 for line in markdown.splitlines()
- if line.strip().startswith("- ") or line.strip().startswith("* ")
+ 1 for line in markdown.splitlines() if line.strip().startswith("- ") or line.strip().startswith("* ")
)
print(f"{'List items':<30s} {html_list_items:>12d} {md_list_items:>12d}")
diff --git a/unstructured_documents/02_docx/03_docx2txt_extraction.py b/unstructured_documents/02_docx/03_docx2txt_extraction.py
index bcab9c0..8be5566 100644
--- a/unstructured_documents/02_docx/03_docx2txt_extraction.py
+++ b/unstructured_documents/02_docx/03_docx2txt_extraction.py
@@ -26,8 +26,8 @@
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from unstructured_documents.shared.chunking import (
chunk_by_characters,
- chunk_by_sentences,
chunk_by_recursive_split,
+ chunk_by_sentences,
preview_chunks,
)
@@ -38,6 +38,7 @@
# 1. Basic text extraction
# ===================================================================
+
def extract_text(doc_path: Path) -> str:
"""
Extract the full plain-text content of a DOCX file.
@@ -53,20 +54,21 @@ def extract_text(doc_path: Path) -> str:
# 2. Analysis: what is preserved vs. lost
# ===================================================================
+
def analyse_extraction(text: str) -> dict:
"""
Run simple heuristics to illustrate what docx2txt keeps and drops.
"""
- lines = [l for l in text.splitlines() if l.strip()]
+ lines = [line for line in text.splitlines() if line.strip()]
words = text.split()
return {
"total_characters": len(text),
"total_words": len(words),
"non_empty_lines": len(lines),
- "contains_tabs": "\t" in text, # tables survive as tab-separated
- "contains_bullet_markers": False, # bullet symbols are lost
- "heading_markers_present": False, # heading markup is lost
+ "contains_tabs": "\t" in text, # tables survive as tab-separated
+ "contains_bullet_markers": False, # bullet symbols are lost
+ "heading_markers_present": False, # heading markup is lost
}
@@ -74,6 +76,7 @@ def analyse_extraction(text: str) -> dict:
# Main demonstration
# ===================================================================
+
def main() -> None:
print("=" * 70)
print("DOCX Extraction with docx2txt")
diff --git a/unstructured_documents/02_docx/sample_docs/generate_samples.py b/unstructured_documents/02_docx/sample_docs/generate_samples.py
index bdaf1af..0674260 100644
--- a/unstructured_documents/02_docx/sample_docs/generate_samples.py
+++ b/unstructured_documents/02_docx/sample_docs/generate_samples.py
@@ -13,10 +13,8 @@
from pathlib import Path
from docx import Document
-from docx.shared import Inches, Pt, RGBColor
-from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
-
+from docx.shared import Pt, RGBColor
SAMPLE_DIR = Path(__file__).resolve().parent
@@ -25,6 +23,7 @@
# 1. simple_document.docx
# ---------------------------------------------------------------------------
+
def create_simple_document() -> None:
"""Create a document with headings, paragraphs, bullet points, and rich text."""
@@ -67,17 +66,12 @@ def create_simple_document() -> None:
# --- H3: Key Indicators ---
doc.add_heading("Key Climate Indicators", level=3)
- doc.add_paragraph(
- "Scientists track several indicators to monitor the state of the climate system:"
- )
+ doc.add_paragraph("Scientists track several indicators to monitor the state of the climate system:")
bullets = [
- "Global mean surface temperature has risen approximately 1.1 °C above "
- "pre-industrial levels as of 2023.",
- "Arctic sea-ice extent has declined by roughly 13 % per decade since "
- "satellite records began in 1979.",
- "Global mean sea level has risen about 20 cm since 1900, with the rate of "
- "rise accelerating in recent decades.",
+ "Global mean surface temperature has risen approximately 1.1 °C above pre-industrial levels as of 2023.",
+ "Arctic sea-ice extent has declined by roughly 13 % per decade since satellite records began in 1979.",
+ "Global mean sea level has risen about 20 cm since 1900, with the rate of rise accelerating in recent decades.",
"Ocean heat content has increased steadily, with the upper 2,000 metres of "
"the ocean absorbing over 90 % of the excess heat.",
"Atmospheric methane concentrations have more than doubled since "
@@ -93,9 +87,7 @@ def create_simple_document() -> None:
para = doc.add_paragraph("The effects of climate change are ")
run = para.add_run("already being felt")
run.italic = True
- para.add_run(
- " across every continent and ocean. Some of the most significant impacts include:"
- )
+ para.add_run(" across every continent and ocean. Some of the most significant impacts include:")
doc.add_heading("Extreme Weather Events", level=3)
@@ -148,8 +140,7 @@ def create_simple_document() -> None:
"Building resilient infrastructure designed for future climate conditions.",
"Developing drought-resistant crop varieties and sustainable water management.",
"Strengthening early-warning systems for extreme weather events.",
- "Implementing nature-based solutions such as mangrove restoration for coastal "
- "protection.",
+ "Implementing nature-based solutions such as mangrove restoration for coastal protection.",
]
for point in adaptation_points:
doc.add_paragraph(point, style="List Bullet")
@@ -164,8 +155,7 @@ def create_simple_document() -> None:
"if rapid, far-reaching action is taken across all sectors of the economy. "
)
run = para.add_run(
- "The choices made in the next decade will determine the trajectory of the "
- "climate for centuries to come."
+ "The choices made in the next decade will determine the trajectory of the climate for centuries to come."
)
run.bold = True
run.italic = True
@@ -179,6 +169,7 @@ def create_simple_document() -> None:
# 2. tables_document.docx
# ---------------------------------------------------------------------------
+
def create_tables_document() -> None:
"""Create a document with multiple tables and explanatory text."""
@@ -203,11 +194,11 @@ def create_tables_document() -> None:
financial_headers = ["Metric", "Q3 2025", "Q2 2025", "Q3 2024", "YoY Change"]
financial_data = [
- ["Revenue", "$12,450,000", "$11,820,000", "$10,900,000", "+14.2 %"],
- ["Cost of Goods Sold", "$7,470,000", "$7,210,000", "$6,760,000", "+10.5 %"],
- ["Gross Profit", "$4,980,000", "$4,610,000", "$4,140,000", "+20.3 %"],
- ["Operating Expenses", "$2,850,000", "$2,790,000", "$2,610,000", "+9.2 %"],
- ["Net Income", "$2,130,000", "$1,820,000", "$1,530,000", "+39.2 %"],
+ ["Revenue", "$12,450,000", "$11,820,000", "$10,900,000", "+14.2 %"],
+ ["Cost of Goods Sold", "$7,470,000", "$7,210,000", "$6,760,000", "+10.5 %"],
+ ["Gross Profit", "$4,980,000", "$4,610,000", "$4,140,000", "+20.3 %"],
+ ["Operating Expenses", "$2,850,000", "$2,790,000", "$2,610,000", "+9.2 %"],
+ ["Net Income", "$2,130,000", "$1,820,000", "$1,530,000", "+39.2 %"],
]
table = doc.add_table(rows=1, cols=len(financial_headers))
@@ -234,15 +225,20 @@ def create_tables_document() -> None:
"current stock levels across our main product categories."
)
- inventory_headers = ["Product Category", "SKU Count", "Units in Stock",
- "Reorder Point", "Status"]
+ inventory_headers = [
+ "Product Category",
+ "SKU Count",
+ "Units in Stock",
+ "Reorder Point",
+ "Status",
+ ]
inventory_data = [
- ["Electronics", "142", "34,500", "10,000", "Adequate"],
- ["Home Appliances", "87", "12,200", "5,000", "Adequate"],
- ["Office Supplies", "215", "98,000", "25,000", "Adequate"],
- ["Industrial Tools", "63", "4,800", "5,000", "Low — reorder initiated"],
- ["Automotive Parts", "178", "22,100", "8,000", "Adequate"],
- ["Health & Safety", "54", "7,300", "7,000", "Marginal"],
+ ["Electronics", "142", "34,500", "10,000", "Adequate"],
+ ["Home Appliances", "87", "12,200", "5,000", "Adequate"],
+ ["Office Supplies", "215", "98,000", "25,000", "Adequate"],
+ ["Industrial Tools", "63", "4,800", "5,000", "Low — reorder initiated"],
+ ["Automotive Parts", "178", "22,100", "8,000", "Adequate"],
+ ["Health & Safety", "54", "7,300", "7,000", "Marginal"],
]
table = doc.add_table(rows=1, cols=len(inventory_headers))
@@ -272,13 +268,25 @@ def create_tables_document() -> None:
emp_headers = ["Name", "Title", "Department", "Office", "Email"]
emp_data = [
- ["Sarah Chen", "CEO", "Executive", "New York", "s.chen@acme.com"],
- ["James Okafor", "CFO", "Finance", "New York", "j.okafor@acme.com"],
- ["Maria Gonzalez", "VP of Engineering", "Engineering", "San Francisco","m.gonzalez@acme.com"],
- ["David Kim", "VP of Sales", "Sales", "Chicago", "d.kim@acme.com"],
- ["Priya Patel", "VP of Operations", "Operations", "Dallas", "p.patel@acme.com"],
- ["Thomas Weber", "Head of HR", "Human Resources","London", "t.weber@acme.com"],
- ["Aisha Mohammed", "General Counsel", "Legal", "New York", "a.mohammed@acme.com"],
+ ["Sarah Chen", "CEO", "Executive", "New York", "s.chen@acme.com"],
+ ["James Okafor", "CFO", "Finance", "New York", "j.okafor@acme.com"],
+ [
+ "Maria Gonzalez",
+ "VP of Engineering",
+ "Engineering",
+ "San Francisco",
+ "m.gonzalez@acme.com",
+ ],
+ ["David Kim", "VP of Sales", "Sales", "Chicago", "d.kim@acme.com"],
+ ["Priya Patel", "VP of Operations", "Operations", "Dallas", "p.patel@acme.com"],
+ ["Thomas Weber", "Head of HR", "Human Resources", "London", "t.weber@acme.com"],
+ [
+ "Aisha Mohammed",
+ "General Counsel",
+ "Legal",
+ "New York",
+ "a.mohammed@acme.com",
+ ],
]
table = doc.add_table(rows=1, cols=len(emp_headers))
@@ -309,6 +317,7 @@ def create_tables_document() -> None:
# 3. styled_document.docx
# ---------------------------------------------------------------------------
+
def create_styled_document() -> None:
"""Create a document exercising many built-in Word styles."""
@@ -316,9 +325,7 @@ def create_styled_document() -> None:
# Title & subtitle
doc.add_paragraph("The Art of Software Architecture", style="Title")
- doc.add_paragraph(
- "A Practical Guide to Designing Maintainable Systems", style="Subtitle"
- )
+ doc.add_paragraph("A Practical Guide to Designing Maintainable Systems", style="Subtitle")
# --- Heading 1 ---
doc.add_heading("Introduction", level=1)
@@ -335,15 +342,15 @@ def create_styled_document() -> None:
"This guide distils decades of collective industry experience into concise, "
"actionable advice. It is intended for developers who are transitioning into "
"architecture roles, as well as seasoned architects looking for a refresher.",
- style="Normal"
+ style="Normal",
)
# --- Quote style ---
doc.add_paragraph(
'"Architecture is the decisions that you wish you could get right early in '
- 'a project, but that you are not necessarily more likely to get them right '
+ "a project, but that you are not necessarily more likely to get them right "
'than any other." — Ralph Johnson',
- style="Quote"
+ style="Quote",
)
# --- Heading 1 ---
@@ -369,20 +376,15 @@ def create_styled_document() -> None:
doc.add_heading("SOLID Principles", level=2)
- doc.add_paragraph(
- "The SOLID principles provide a foundation for object-oriented design:"
- )
+ doc.add_paragraph("The SOLID principles provide a foundation for object-oriented design:")
# Numbered list
numbered_items = [
- "Single Responsibility Principle — a class should have one, and only one, "
- "reason to change.",
- "Open/Closed Principle — software entities should be open for extension but "
- "closed for modification.",
+ "Single Responsibility Principle — a class should have one, and only one, reason to change.",
+ "Open/Closed Principle — software entities should be open for extension but closed for modification.",
"Liskov Substitution Principle — subtypes must be substitutable for their "
"base types without altering program correctness.",
- "Interface Segregation Principle — clients should not be forced to depend on "
- "interfaces they do not use.",
+ "Interface Segregation Principle — clients should not be forced to depend on interfaces they do not use.",
"Dependency Inversion Principle — high-level modules should not depend on "
"low-level modules; both should depend on abstractions.",
]
@@ -424,22 +426,20 @@ def create_styled_document() -> None:
# --- Code-style text ---
doc.add_heading("Code Example: Dependency Injection", level=2)
- doc.add_paragraph(
- "The following pseudo-code shows constructor-based dependency injection:"
- )
+ doc.add_paragraph("The following pseudo-code shows constructor-based dependency injection:")
# Use a monospace font for code
code_para = doc.add_paragraph()
code_run = code_para.add_run(
- 'class OrderService:\n'
- ' def __init__(self, repository: OrderRepository,\n'
- ' notifier: NotificationService):\n'
- ' self._repository = repository\n'
- ' self._notifier = notifier\n'
- '\n'
- ' def place_order(self, order: Order) -> None:\n'
- ' self._repository.save(order)\n'
- ' self._notifier.send_confirmation(order)\n'
+ "class OrderService:\n"
+ " def __init__(self, repository: OrderRepository,\n"
+ " notifier: NotificationService):\n"
+ " self._repository = repository\n"
+ " self._notifier = notifier\n"
+ "\n"
+ " def place_order(self, order: Order) -> None:\n"
+ " self._repository.save(order)\n"
+ " self._notifier.send_confirmation(order)\n"
)
code_run.font.name = "Courier New"
code_run.font.size = Pt(9)
@@ -452,10 +452,7 @@ def create_styled_document() -> None:
)
# --- Another Quote ---
- doc.add_paragraph(
- '"Make each program do one thing well." — Unix Philosophy',
- style="Quote"
- )
+ doc.add_paragraph('"Make each program do one thing well." — Unix Philosophy', style="Quote")
# --- Heading 1 ---
doc.add_heading("Quality Attributes", level=1)
@@ -488,7 +485,7 @@ def create_styled_document() -> None:
doc.add_paragraph(
'"The best architectures are grown, not designed." — Adapted from Fred Brooks',
- style="Quote"
+ style="Quote",
)
path = SAMPLE_DIR / "styled_document.docx"
diff --git a/unstructured_documents/03_pptx/01_python_pptx_extraction.py b/unstructured_documents/03_pptx/01_python_pptx_extraction.py
index 15feee7..3959b50 100644
--- a/unstructured_documents/03_pptx/01_python_pptx_extraction.py
+++ b/unstructured_documents/03_pptx/01_python_pptx_extraction.py
@@ -16,15 +16,14 @@
# --- shared chunking import ------------------------------------------------
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+from pptx import Presentation
+
from unstructured_documents.shared.chunking import (
- chunk_by_sentences,
chunk_by_recursive_split,
+ chunk_by_sentences,
preview_chunks,
)
-from pptx import Presentation
-
-
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
@@ -36,6 +35,7 @@
# Extraction helpers
# ---------------------------------------------------------------------------
+
def extract_text_from_shape(shape) -> list[dict]:
"""
Recursively extract text from a single shape.
@@ -101,6 +101,7 @@ def extract_table_data(slide) -> list[list[list[str]]]:
# Full extraction
# ---------------------------------------------------------------------------
+
def extract_all_slides(pptx_path: Path) -> list[dict]:
"""
Walk every slide and extract text, tables, and notes.
@@ -120,11 +121,13 @@ def extract_all_slides(pptx_path: Path) -> list[dict]:
for shape in slide.shapes:
shape_extracts.extend(extract_text_from_shape(shape))
- slides_data.append({
- "slide_number": idx,
- "shapes": shape_extracts,
- "notes": extract_notes(slide),
- })
+ slides_data.append(
+ {
+ "slide_number": idx,
+ "shapes": shape_extracts,
+ "notes": extract_notes(slide),
+ }
+ )
return slides_data
@@ -133,12 +136,13 @@ def extract_all_slides(pptx_path: Path) -> list[dict]:
# Display
# ---------------------------------------------------------------------------
+
def print_extraction_results(slides_data: list[dict]) -> None:
"""Pretty-print the extraction results."""
for slide in slides_data:
- print(f"\n{'='*60}")
+ print(f"\n{'=' * 60}")
print(f" SLIDE {slide['slide_number']}")
- print(f"{'='*60}")
+ print(f"{'=' * 60}")
if not slide["shapes"]:
print(" (no extractable text)")
@@ -150,7 +154,7 @@ def print_extraction_results(slides_data: list[dict]) -> None:
print(f" {line}")
if slide["notes"]:
- print(f"\n [SPEAKER NOTES]")
+ print("\n [SPEAKER NOTES]")
for line in slide["notes"].split("\n"):
print(f" {line}")
@@ -175,9 +179,9 @@ def print_extraction_results(slides_data: list[dict]) -> None:
print_extraction_results(slides_data)
# ── 2. Table extraction detail ────────────────────────────────────────
- print(f"\n\n{'='*60}")
+ print(f"\n\n{'=' * 60}")
print(" TABLE EXTRACTION DETAIL")
- print(f"{'='*60}")
+ print(f"{'=' * 60}")
prs = Presentation(str(PPTX_PATH))
for idx, slide in enumerate(prs.slides, start=1):
tables = extract_table_data(slide)
@@ -197,14 +201,14 @@ def print_extraction_results(slides_data: list[dict]) -> None:
full_text = "\n\n".join(all_text_parts)
- print(f"\n\n{'='*60}")
+ print(f"\n\n{'=' * 60}")
print(" CHUNKING DEMO — Sentence-based")
- print(f"{'='*60}")
+ print(f"{'=' * 60}")
sentence_chunks = chunk_by_sentences(full_text, sentences_per_chunk=4, overlap_sentences=1)
preview_chunks(sentence_chunks, max_preview=4, max_chars=300)
- print(f"\n\n{'='*60}")
+ print(f"\n\n{'=' * 60}")
print(" CHUNKING DEMO — Recursive split")
- print(f"{'='*60}")
+ print(f"{'=' * 60}")
recursive_chunks = chunk_by_recursive_split(full_text, chunk_size=400)
preview_chunks(recursive_chunks, max_preview=4, max_chars=300)
diff --git a/unstructured_documents/03_pptx/02_slide_structured_extraction.py b/unstructured_documents/03_pptx/02_slide_structured_extraction.py
index ed77a4a..5fdc327 100644
--- a/unstructured_documents/03_pptx/02_slide_structured_extraction.py
+++ b/unstructured_documents/03_pptx/02_slide_structured_extraction.py
@@ -18,13 +18,13 @@
# --- shared chunking import ------------------------------------------------
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+from pptx import Presentation
+
from unstructured_documents.shared.chunking import (
chunk_by_sentences,
preview_chunks,
)
-from pptx import Presentation
-
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
@@ -36,6 +36,7 @@
# Structured extraction
# ---------------------------------------------------------------------------
+
def _collect_body_text(shape) -> list[str]:
"""
Recursively collect body text from a shape (skipping titles).
@@ -116,13 +117,15 @@ def extract_structured_slides(pptx_path: Path) -> list[dict]:
if slide.has_notes_slide:
notes = slide.notes_slide.notes_text_frame.text.strip()
- structured.append({
- "slide_number": idx,
- "title": title,
- "body_text": body_text,
- "table_data": tables,
- "notes": notes,
- })
+ structured.append(
+ {
+ "slide_number": idx,
+ "title": title,
+ "body_text": body_text,
+ "table_data": tables,
+ "notes": notes,
+ }
+ )
return structured
@@ -131,6 +134,7 @@ def extract_structured_slides(pptx_path: Path) -> list[dict]:
# RAG-ready conversion
# ---------------------------------------------------------------------------
+
def table_to_text(table: list[list[str]]) -> str:
"""
Convert a 2-D table into a readable text block.
@@ -184,15 +188,17 @@ def slides_to_rag_chunks(slides: list[dict], include_notes: bool = True) -> list
if not text:
continue
- chunks.append({
- "text": text,
- "metadata": {
- "slide_number": slide["slide_number"],
- "title": slide["title"],
- "has_table": len(slide["table_data"]) > 0,
- "has_notes": bool(slide["notes"]),
- },
- })
+ chunks.append(
+ {
+ "text": text,
+ "metadata": {
+ "slide_number": slide["slide_number"],
+ "title": slide["title"],
+ "has_table": len(slide["table_data"]) > 0,
+ "has_notes": bool(slide["notes"]),
+ },
+ }
+ )
return chunks
@@ -217,15 +223,16 @@ def build_slide_summaries(slides: list[dict]) -> list[str]:
# Display helpers
# ---------------------------------------------------------------------------
+
def print_structured_slides(slides: list[dict]) -> None:
"""Pretty-print the structured slide data."""
for slide in slides:
- print(f"\n{'='*60}")
+ print(f"\n{'=' * 60}")
print(f" SLIDE {slide['slide_number']}: {slide['title'] or '(no title)'}")
- print(f"{'='*60}")
+ print(f"{'=' * 60}")
if slide["body_text"]:
- print(f"\n Body text:")
+ print("\n Body text:")
for line in slide["body_text"].split("\n"):
print(f" {line}")
@@ -236,7 +243,7 @@ def print_structured_slides(slides: list[dict]) -> None:
print(f" {row}")
if slide["notes"]:
- print(f"\n Speaker notes:")
+ print("\n Speaker notes:")
for line in slide["notes"].split("\n"):
print(f" {line}")
@@ -245,11 +252,13 @@ def print_rag_chunks(chunks: list[dict]) -> None:
"""Print RAG chunks with metadata."""
for i, chunk in enumerate(chunks, start=1):
meta = chunk["metadata"]
- print(f"\n{'- '*30}")
- print(f" Chunk {i} | Slide {meta['slide_number']} | "
- f"Title: {meta['title'] or 'N/A'} | "
- f"Table: {meta['has_table']} | Notes: {meta['has_notes']}")
- print(f"{'- '*30}")
+ print(f"\n{'- ' * 30}")
+ print(
+ f" Chunk {i} | Slide {meta['slide_number']} | "
+ f"Title: {meta['title'] or 'N/A'} | "
+ f"Table: {meta['has_table']} | Notes: {meta['has_notes']}"
+ )
+ print(f"{'- ' * 30}")
# Truncate for display
text = chunk["text"]
if len(text) > 400:
@@ -280,30 +289,30 @@ def print_rag_chunks(chunks: list[dict]) -> None:
# ── 2. Slide summaries ────────────────────────────────────────────────
summaries = build_slide_summaries(slides)
- print(f"\n\n{'='*60}")
+ print(f"\n\n{'=' * 60}")
print(" SLIDE SUMMARIES")
- print(f"{'='*60}")
+ print(f"{'=' * 60}")
for s in summaries:
print(f" {s}")
# ── 3. RAG-ready chunks (one per slide) ───────────────────────────────
rag_chunks = slides_to_rag_chunks(slides, include_notes=True)
- print(f"\n\n{'='*60}")
+ print(f"\n\n{'=' * 60}")
print(" RAG-READY CHUNKS (one per slide, notes included)")
- print(f"{'='*60}")
+ print(f"{'=' * 60}")
print_rag_chunks(rag_chunks)
# ── 4. Sentence-based chunking on full text ───────────────────────────
full_text = "\n\n".join(chunk["text"] for chunk in rag_chunks)
- print(f"\n\n{'='*60}")
+ print(f"\n\n{'=' * 60}")
print(" SENTENCE-BASED CHUNKING (merged text)")
- print(f"{'='*60}")
+ print(f"{'=' * 60}")
sentence_chunks = chunk_by_sentences(full_text, sentences_per_chunk=5, overlap_sentences=1)
preview_chunks(sentence_chunks, max_preview=5, max_chars=350)
# ── 5. JSON output sample ─────────────────────────────────────────────
- print(f"\n\n{'='*60}")
+ print(f"\n\n{'=' * 60}")
print(" JSON OUTPUT (first 2 slides)")
- print(f"{'='*60}")
+ print(f"{'=' * 60}")
json_sample = json.dumps(rag_chunks[:2], indent=2, ensure_ascii=False)
print(json_sample)
diff --git a/unstructured_documents/03_pptx/sample_docs/generate_samples.py b/unstructured_documents/03_pptx/sample_docs/generate_samples.py
index 3db6d46..6803c76 100644
--- a/unstructured_documents/03_pptx/sample_docs/generate_samples.py
+++ b/unstructured_documents/03_pptx/sample_docs/generate_samples.py
@@ -12,9 +12,8 @@
from pathlib import Path
from pptx import Presentation
-from pptx.util import Inches, Pt, Emu
from pptx.enum.text import PP_ALIGN
-
+from pptx.util import Emu, Inches, Pt
# ---------------------------------------------------------------------------
# Helpers
@@ -39,6 +38,7 @@ def _add_textbox(slide, left, top, width, height, text, font_size=14, bold=False
# Presentation 1 – Introduction to Machine Learning (6 slides)
# ---------------------------------------------------------------------------
+
def create_ml_presentation() -> Path:
prs = Presentation()
@@ -46,9 +46,7 @@ def create_ml_presentation() -> Path:
slide_layout = prs.slide_layouts[0] # Title Slide layout
slide = prs.slides.add_slide(slide_layout)
slide.shapes.title.text = "Introduction to Machine Learning"
- slide.placeholders[1].text = (
- "A practical overview of ML concepts, algorithms, and applications"
- )
+ slide.placeholders[1].text = "A practical overview of ML concepts, algorithms, and applications"
# ── Slide 2: Bullet points – ML types ─────────────────────────────────
slide_layout = prs.slide_layouts[1] # Title and Content
@@ -70,13 +68,19 @@ def create_ml_presentation() -> Path:
# ── Slide 3: Table – Algorithm comparison ─────────────────────────────
slide_layout = prs.slide_layouts[5] # Blank layout
slide = prs.slides.add_slide(slide_layout)
- _add_textbox(slide, Inches(0.5), Inches(0.3), Inches(9), Inches(0.6),
- "Comparison of ML Algorithms", font_size=24, bold=True)
+ _add_textbox(
+ slide,
+ Inches(0.5),
+ Inches(0.3),
+ Inches(9),
+ Inches(0.6),
+ "Comparison of ML Algorithms",
+ font_size=24,
+ bold=True,
+ )
rows, cols = 6, 4
- table_shape = slide.shapes.add_table(rows, cols,
- Inches(0.5), Inches(1.2),
- Inches(9), Inches(4))
+ table_shape = slide.shapes.add_table(rows, cols, Inches(0.5), Inches(1.2), Inches(9), Inches(4))
table = table_shape.table
headers = ["Algorithm", "Type", "Use Case", "Complexity"]
@@ -125,19 +129,35 @@ def create_ml_presentation() -> Path:
# ── Slide 5: Grouped text boxes – key takeaways ───────────────────────
slide_layout = prs.slide_layouts[5] # Blank
slide = prs.slides.add_slide(slide_layout)
- _add_textbox(slide, Inches(0.5), Inches(0.3), Inches(9), Inches(0.6),
- "Key Takeaways", font_size=24, bold=True)
+ _add_textbox(
+ slide,
+ Inches(0.5),
+ Inches(0.3),
+ Inches(9),
+ Inches(0.6),
+ "Key Takeaways",
+ font_size=24,
+ bold=True,
+ )
# Create a group shape containing three text boxes
group = slide.shapes.add_group_shape()
takeaways = [
- ("Data is King", "The quality and quantity of your data matters more than the algorithm you choose."),
- ("Start Simple", "Begin with simple models like linear regression before moving to complex architectures."),
- ("Iterate Fast", "Rapid experimentation and iteration lead to better results than perfecting a single approach."),
+ (
+ "Data is King",
+ "The quality and quantity of your data matters more than the algorithm you choose.",
+ ),
+ (
+ "Start Simple",
+ "Begin with simple models like linear regression before moving to complex architectures.",
+ ),
+ (
+ "Iterate Fast",
+ "Rapid experimentation and iteration lead to better results than perfecting a single approach.",
+ ),
]
box_width = Emu(Inches(2.8).emu)
- box_height = Emu(Inches(3).emu)
top = Emu(Inches(1.2).emu)
for idx, (title, desc) in enumerate(takeaways):
@@ -149,8 +169,7 @@ def create_ml_presentation() -> Path:
tb_title.text_frame.paragraphs[0].font.bold = True
tb_title.text_frame.paragraphs[0].alignment = PP_ALIGN.CENTER
# Description box
- tb_desc = group.shapes.add_textbox(left, Emu(top.emu + Inches(0.7).emu),
- box_width, Emu(Inches(2).emu))
+ tb_desc = group.shapes.add_textbox(left, Emu(top.emu + Inches(0.7).emu), box_width, Emu(Inches(2).emu))
tb_desc.text_frame.word_wrap = True
tb_desc.text_frame.paragraphs[0].text = desc
tb_desc.text_frame.paragraphs[0].font.size = Pt(14)
@@ -186,6 +205,7 @@ def create_ml_presentation() -> Path:
# Presentation 2 – Q4 Financial Review (4 slides)
# ---------------------------------------------------------------------------
+
def create_data_presentation() -> Path:
prs = Presentation()
@@ -198,13 +218,19 @@ def create_data_presentation() -> Path:
# ── Slide 2: Revenue table ────────────────────────────────────────────
slide_layout = prs.slide_layouts[5] # Blank
slide = prs.slides.add_slide(slide_layout)
- _add_textbox(slide, Inches(0.5), Inches(0.3), Inches(9), Inches(0.6),
- "Quarterly Revenue Breakdown", font_size=24, bold=True)
+ _add_textbox(
+ slide,
+ Inches(0.5),
+ Inches(0.3),
+ Inches(9),
+ Inches(0.6),
+ "Quarterly Revenue Breakdown",
+ font_size=24,
+ bold=True,
+ )
rows, cols = 5, 4
- table_shape = slide.shapes.add_table(rows, cols,
- Inches(0.5), Inches(1.2),
- Inches(9), Inches(3.5))
+ table_shape = slide.shapes.add_table(rows, cols, Inches(0.5), Inches(1.2), Inches(9), Inches(3.5))
table = table_shape.table
headers = ["Region", "Q1 ($M)", "Q2 ($M)", "Q3 ($M)"]
@@ -223,8 +249,16 @@ def create_data_presentation() -> Path:
# ── Slide 3: Key metrics text boxes ───────────────────────────────────
slide_layout = prs.slide_layouts[5]
slide = prs.slides.add_slide(slide_layout)
- _add_textbox(slide, Inches(0.5), Inches(0.3), Inches(9), Inches(0.6),
- "Key Financial Metrics", font_size=24, bold=True)
+ _add_textbox(
+ slide,
+ Inches(0.5),
+ Inches(0.3),
+ Inches(9),
+ Inches(0.6),
+ "Key Financial Metrics",
+ font_size=24,
+ bold=True,
+ )
metrics = [
("Total Revenue", "$140.4M", "Up 12% YoY"),
@@ -234,12 +268,36 @@ def create_data_presentation() -> Path:
]
for idx, (label, value, note) in enumerate(metrics):
row_top = Inches(1.3 + idx * 1.1)
- _add_textbox(slide, Inches(0.8), row_top, Inches(3), Inches(0.5),
- label, font_size=16, bold=True)
- _add_textbox(slide, Inches(4.0), row_top, Inches(2), Inches(0.5),
- value, font_size=16, bold=False)
- _add_textbox(slide, Inches(6.2), row_top, Inches(3), Inches(0.5),
- note, font_size=14, bold=False)
+ _add_textbox(
+ slide,
+ Inches(0.8),
+ row_top,
+ Inches(3),
+ Inches(0.5),
+ label,
+ font_size=16,
+ bold=True,
+ )
+ _add_textbox(
+ slide,
+ Inches(4.0),
+ row_top,
+ Inches(2),
+ Inches(0.5),
+ value,
+ font_size=16,
+ bold=False,
+ )
+ _add_textbox(
+ slide,
+ Inches(6.2),
+ row_top,
+ Inches(3),
+ Inches(0.5),
+ note,
+ font_size=14,
+ bold=False,
+ )
# ── Slide 4: Conclusion with bullets ──────────────────────────────────
slide_layout = prs.slide_layouts[1]
diff --git a/unstructured_documents/04_html/01_beautifulsoup_extraction.py b/unstructured_documents/04_html/01_beautifulsoup_extraction.py
index 85b9e06..9dd29ee 100644
--- a/unstructured_documents/04_html/01_beautifulsoup_extraction.py
+++ b/unstructured_documents/04_html/01_beautifulsoup_extraction.py
@@ -63,10 +63,12 @@ def extract_article_content(html_path: Path) -> dict:
# Extract headings
for heading in content.find_all(["h1", "h2", "h3", "h4"]):
- result["headings"].append({
- "level": heading.name,
- "text": heading.get_text(strip=True),
- })
+ result["headings"].append(
+ {
+ "level": heading.name,
+ "text": heading.get_text(strip=True),
+ }
+ )
# Extract paragraphs
for p in content.find_all("p"):
@@ -170,7 +172,7 @@ def html_to_markdown_like(html_path: Path) -> str:
print("=" * 60)
tables = extract_tables_only(table_path)
for i, table in enumerate(tables):
- print(f"\nTable {i+1} ({len(table)} rows):")
+ print(f"\nTable {i + 1} ({len(table)} rows):")
for row in table[:3]:
print(f" {row}")
if len(table) > 3:
diff --git a/unstructured_documents/04_html/02_html2text_extraction.py b/unstructured_documents/04_html/02_html2text_extraction.py
index ae122cf..442c850 100644
--- a/unstructured_documents/04_html/02_html2text_extraction.py
+++ b/unstructured_documents/04_html/02_html2text_extraction.py
@@ -42,12 +42,12 @@ def extract_clean_text(html_path: Path) -> str:
def extract_with_custom_settings(html_path: Path) -> str:
"""Convert HTML to markdown with RAG-optimized settings."""
converter = html2text.HTML2Text()
- converter.body_width = 0 # No line wrapping (better for chunking)
- converter.ignore_links = True # Links add noise for RAG
- converter.ignore_images = True # Can't embed images in text chunks
+ converter.body_width = 0 # No line wrapping (better for chunking)
+ converter.ignore_links = True # Links add noise for RAG
+ converter.ignore_images = True # Can't embed images in text chunks
converter.ignore_emphasis = False # Keep bold/italic for context
converter.protect_links = False
- converter.unicode_snob = True # Use unicode instead of ASCII approximations
+ converter.unicode_snob = True # Use unicode instead of ASCII approximations
converter.skip_internal_links = True
return converter.handle(html_path.read_text())
diff --git a/unstructured_documents/04_html/03_trafilatura_extraction.py b/unstructured_documents/04_html/03_trafilatura_extraction.py
index 4e0f1e3..7ca5a4d 100644
--- a/unstructured_documents/04_html/03_trafilatura_extraction.py
+++ b/unstructured_documents/04_html/03_trafilatura_extraction.py
@@ -16,7 +16,6 @@
import trafilatura
from unstructured_documents.shared.chunking import (
- chunk_by_headings,
chunk_by_recursive_split,
preview_chunks,
)
@@ -42,6 +41,7 @@ def extract_with_metadata(html_path: Path) -> dict | None:
)
if result:
import json
+
return json.loads(result)
return None
diff --git a/unstructured_documents/04_html/sample_docs/generate_samples.py b/unstructured_documents/04_html/sample_docs/generate_samples.py
index a812983..14b1fff 100644
--- a/unstructured_documents/04_html/sample_docs/generate_samples.py
+++ b/unstructured_documents/04_html/sample_docs/generate_samples.py
@@ -43,7 +43,8 @@ def generate_article_page():
Key NLP Tasks
- Tokenization: Breaking text into individual words or subwords
- - Named Entity Recognition: Identifying entities like people, organizations, and locations
+ - Named Entity Recognition: Identifying entities like people, organizations, \
+and locations
- Sentiment Analysis: Determining the emotional tone of text
- Machine Translation: Converting text from one language to another
- Text Summarization: Creating concise summaries of longer documents
diff --git a/unstructured_documents/05_spreadsheets/01_openpyxl_extraction.py b/unstructured_documents/05_spreadsheets/01_openpyxl_extraction.py
index 68decdf..fc555ec 100644
--- a/unstructured_documents/05_spreadsheets/01_openpyxl_extraction.py
+++ b/unstructured_documents/05_spreadsheets/01_openpyxl_extraction.py
@@ -14,7 +14,10 @@
import openpyxl
-from unstructured_documents.shared.chunking import chunk_by_recursive_split, preview_chunks
+from unstructured_documents.shared.chunking import (
+ chunk_by_recursive_split,
+ preview_chunks,
+)
SAMPLE_DIR = Path(__file__).parent / "sample_docs"
@@ -46,13 +49,15 @@ def extract_sheet_with_metadata(xlsx_path: Path) -> list[dict]:
headers = [str(h) if h else f"col_{i}" for i, h in enumerate(rows[0])]
data = rows[1:]
- results.append({
- "sheet_name": sheet_name,
- "dimensions": ws.dimensions,
- "row_count": len(data),
- "headers": headers,
- "data": data,
- })
+ results.append(
+ {
+ "sheet_name": sheet_name,
+ "dimensions": ws.dimensions,
+ "row_count": len(data),
+ "headers": headers,
+ "data": data,
+ }
+ )
return results
@@ -137,9 +142,6 @@ def sheet_to_markdown_table(headers: list, rows: list) -> str:
print(f"\n{'=' * 60}")
print("5. CHUNKED OUTPUT FOR RAG")
print("=" * 60)
- all_text = "\n\n".join(
- sheet_to_natural_language(s["sheet_name"], s["headers"], s["data"])
- for s in sheet_meta
- )
+ all_text = "\n\n".join(sheet_to_natural_language(s["sheet_name"], s["headers"], s["data"]) for s in sheet_meta)
chunks = chunk_by_recursive_split(all_text, chunk_size=400)
preview_chunks(chunks)
diff --git a/unstructured_documents/05_spreadsheets/02_pandas_extraction.py b/unstructured_documents/05_spreadsheets/02_pandas_extraction.py
index b364fc5..52a0f3f 100644
--- a/unstructured_documents/05_spreadsheets/02_pandas_extraction.py
+++ b/unstructured_documents/05_spreadsheets/02_pandas_extraction.py
@@ -14,7 +14,10 @@
import pandas as pd
-from unstructured_documents.shared.chunking import chunk_by_recursive_split, preview_chunks
+from unstructured_documents.shared.chunking import (
+ chunk_by_recursive_split,
+ preview_chunks,
+)
SAMPLE_DIR = Path(__file__).parent / "sample_docs"
@@ -61,8 +64,8 @@ def dataframe_to_row_chunks(df: pd.DataFrame, sheet_name: str, rows_per_chunk: i
headers = list(df.columns)
for i in range(0, len(df), rows_per_chunk):
- batch = df.iloc[i:i + rows_per_chunk]
- lines = [f"[{sheet_name} - rows {i+1} to {min(i+rows_per_chunk, len(df))}]"]
+ batch = df.iloc[i : i + rows_per_chunk]
+ lines = [f"[{sheet_name} - rows {i + 1} to {min(i + rows_per_chunk, len(df))}]"]
lines.append(f"Columns: {', '.join(str(h) for h in headers)}\n")
for _, row in batch.iterrows():
parts = [f"{col}: {val}" for col, val in row.items() if pd.notna(val)]
@@ -112,7 +115,7 @@ def dataframe_to_row_chunks(df: pd.DataFrame, sheet_name: str, rows_per_chunk: i
products_df = pd.read_csv(csv_path)
print(f"Shape: {products_df.shape}")
print(f"Columns: {list(products_df.columns)}")
- print(f"\nFirst 3 rows:")
+ print("\nFirst 3 rows:")
print(products_df.head(3).to_string())
# Convert to natural language
diff --git a/unstructured_documents/05_spreadsheets/03_csv_extraction.py b/unstructured_documents/05_spreadsheets/03_csv_extraction.py
index af400c2..365304e 100644
--- a/unstructured_documents/05_spreadsheets/03_csv_extraction.py
+++ b/unstructured_documents/05_spreadsheets/03_csv_extraction.py
@@ -13,7 +13,10 @@
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
-from unstructured_documents.shared.chunking import chunk_by_recursive_split, preview_chunks
+from unstructured_documents.shared.chunking import (
+ chunk_by_recursive_split,
+ preview_chunks,
+)
SAMPLE_DIR = Path(__file__).parent / "sample_docs"
diff --git a/unstructured_documents/05_spreadsheets/sample_docs/generate_samples.py b/unstructured_documents/05_spreadsheets/sample_docs/generate_samples.py
index 5fbeafe..8bee408 100644
--- a/unstructured_documents/05_spreadsheets/sample_docs/generate_samples.py
+++ b/unstructured_documents/05_spreadsheets/sample_docs/generate_samples.py
@@ -23,14 +23,78 @@ def generate_multi_sheet_workbook():
cell.fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
employees = [
- [101, "Alice Johnson", "Engineering", "Senior Developer", "alice@company.com", 125000, "2020-03-15"],
- [102, "Bob Smith", "Marketing", "Marketing Manager", "bob@company.com", 95000, "2019-07-01"],
- [103, "Carol Williams", "Engineering", "Tech Lead", "carol@company.com", 145000, "2018-01-10"],
- [104, "David Brown", "Sales", "Account Executive", "david@company.com", 85000, "2021-06-20"],
- [105, "Eve Davis", "Engineering", "Junior Developer", "eve@company.com", 75000, "2023-02-01"],
- [106, "Frank Miller", "HR", "HR Director", "frank@company.com", 110000, "2017-11-15"],
- [107, "Grace Lee", "Finance", "Financial Analyst", "grace@company.com", 90000, "2022-04-01"],
- [108, "Henry Wilson", "Engineering", "DevOps Engineer", "henry@company.com", 120000, "2020-09-10"],
+ [
+ 101,
+ "Alice Johnson",
+ "Engineering",
+ "Senior Developer",
+ "alice@company.com",
+ 125000,
+ "2020-03-15",
+ ],
+ [
+ 102,
+ "Bob Smith",
+ "Marketing",
+ "Marketing Manager",
+ "bob@company.com",
+ 95000,
+ "2019-07-01",
+ ],
+ [
+ 103,
+ "Carol Williams",
+ "Engineering",
+ "Tech Lead",
+ "carol@company.com",
+ 145000,
+ "2018-01-10",
+ ],
+ [
+ 104,
+ "David Brown",
+ "Sales",
+ "Account Executive",
+ "david@company.com",
+ 85000,
+ "2021-06-20",
+ ],
+ [
+ 105,
+ "Eve Davis",
+ "Engineering",
+ "Junior Developer",
+ "eve@company.com",
+ 75000,
+ "2023-02-01",
+ ],
+ [
+ 106,
+ "Frank Miller",
+ "HR",
+ "HR Director",
+ "frank@company.com",
+ 110000,
+ "2017-11-15",
+ ],
+ [
+ 107,
+ "Grace Lee",
+ "Finance",
+ "Financial Analyst",
+ "grace@company.com",
+ 90000,
+ "2022-04-01",
+ ],
+ [
+ 108,
+ "Henry Wilson",
+ "Engineering",
+ "DevOps Engineer",
+ "henry@company.com",
+ 120000,
+ "2020-09-10",
+ ],
]
for emp in employees:
ws1.append(emp)
@@ -62,11 +126,51 @@ def generate_multi_sheet_workbook():
cell.font = Font(bold=True)
projects = [
- ["Website Redesign", "In Progress", "Alice", "High", "2025-03-01", 50000, "Phase 2 of 3 complete"],
- ["Mobile App v2", "Planning", "Carol", "High", "2025-06-15", 120000, "Requirements gathering"],
- ["Data Pipeline", "Complete", "Henry", "Medium", "2025-01-15", 35000, "Deployed to production"],
- ["CRM Integration", "In Progress", "Bob", "Medium", "2025-04-01", 25000, "API development phase"],
- ["Security Audit", "Not Started", "Frank", "High", "2025-02-28", 15000, "Vendor selection pending"],
+ [
+ "Website Redesign",
+ "In Progress",
+ "Alice",
+ "High",
+ "2025-03-01",
+ 50000,
+ "Phase 2 of 3 complete",
+ ],
+ [
+ "Mobile App v2",
+ "Planning",
+ "Carol",
+ "High",
+ "2025-06-15",
+ 120000,
+ "Requirements gathering",
+ ],
+ [
+ "Data Pipeline",
+ "Complete",
+ "Henry",
+ "Medium",
+ "2025-01-15",
+ 35000,
+ "Deployed to production",
+ ],
+ [
+ "CRM Integration",
+ "In Progress",
+ "Bob",
+ "Medium",
+ "2025-04-01",
+ 25000,
+ "API development phase",
+ ],
+ [
+ "Security Audit",
+ "Not Started",
+ "Frank",
+ "High",
+ "2025-02-28",
+ 15000,
+ "Vendor selection pending",
+ ],
]
for proj in projects:
ws3.append(proj)
@@ -99,14 +203,47 @@ def generate_csv_files():
writer = csv.writer(f)
writer.writerow(["SKU", "Product Name", "Category", "Description", "Price", "Stock"])
products = [
- ["WDG-001", "Smart Widget Pro", "Electronics",
- "Advanced smart widget with WiFi connectivity, 4K display, and voice control. Compatible with all major smart home ecosystems.", 299.99, 150],
- ["TBL-002", "Ergonomic Desk", "Furniture",
- "Height-adjustable standing desk with memory presets. Supports up to 200 lbs. Available in walnut and oak finishes.", 549.99, 42],
- ["SFT-003", "CloudSync Suite", "Software",
- "Enterprise file synchronization and collaboration platform. Includes 1TB storage, version control, and real-time editing.", 19.99, 999],
- ["ACC-004", "USB-C Hub Deluxe", "Accessories",
- "12-in-1 USB-C hub with dual HDMI, ethernet, SD card reader, and 100W passthrough charging.", 79.99, 320],
+ [
+ "WDG-001",
+ "Smart Widget Pro",
+ "Electronics",
+ (
+ "Advanced smart widget with WiFi connectivity, 4K display, and voice control."
+ " Compatible with all major smart home ecosystems."
+ ),
+ 299.99,
+ 150,
+ ],
+ [
+ "TBL-002",
+ "Ergonomic Desk",
+ "Furniture",
+ (
+ "Height-adjustable standing desk with memory presets."
+ " Supports up to 200 lbs. Available in walnut and oak finishes."
+ ),
+ 549.99,
+ 42,
+ ],
+ [
+ "SFT-003",
+ "CloudSync Suite",
+ "Software",
+ (
+ "Enterprise file synchronization and collaboration platform."
+ " Includes 1TB storage, version control, and real-time editing."
+ ),
+ 19.99,
+ 999,
+ ],
+ [
+ "ACC-004",
+ "USB-C Hub Deluxe",
+ "Accessories",
+ "12-in-1 USB-C hub with dual HDMI, ethernet, SD card reader, and 100W passthrough charging.",
+ 79.99,
+ 320,
+ ],
]
writer.writerows(products)
print("Generated: products.csv")
diff --git a/unstructured_documents/06_images_ocr/01_tesseract_ocr.py b/unstructured_documents/06_images_ocr/01_tesseract_ocr.py
index 7e67015..c9b1a98 100644
--- a/unstructured_documents/06_images_ocr/01_tesseract_ocr.py
+++ b/unstructured_documents/06_images_ocr/01_tesseract_ocr.py
@@ -149,17 +149,19 @@ def ocr_with_details(image_path: Path) -> list[dict]:
word = data["text"][i].strip()
conf = int(data["conf"][i])
if word and conf > 0:
- results.append({
- "text": word,
- "confidence": conf,
- "left": data["left"][i],
- "top": data["top"][i],
- "width": data["width"][i],
- "height": data["height"][i],
- "block": data["block_num"][i],
- "paragraph": data["par_num"][i],
- "line": data["line_num"][i],
- })
+ results.append(
+ {
+ "text": word,
+ "confidence": conf,
+ "left": data["left"][i],
+ "top": data["top"][i],
+ "width": data["width"][i],
+ "height": data["height"][i],
+ "block": data["block_num"][i],
+ "paragraph": data["par_num"][i],
+ "line": data["line_num"][i],
+ }
+ )
return results
@@ -218,10 +220,9 @@ def ocr_with_details(image_path: Path) -> list[dict]:
print("=" * 60)
details = ocr_with_details(simple_img)
print(f"Detected {len(details)} words")
- print(f"\nFirst 10 words with confidence:")
+ print("\nFirst 10 words with confidence:")
for word in details[:10]:
- print(f" '{word['text']}' (conf: {word['confidence']}%, "
- f"pos: x={word['left']}, y={word['top']})")
+ print(f" '{word['text']}' (conf: {word['confidence']}%, pos: x={word['left']}, y={word['top']})")
avg_conf = sum(w["confidence"] for w in details) / len(details) if details else 0
print(f"\nAverage confidence: {avg_conf:.1f}%")
diff --git a/unstructured_documents/06_images_ocr/02_easyocr_extraction.py b/unstructured_documents/06_images_ocr/02_easyocr_extraction.py
index 6d33404..88db941 100644
--- a/unstructured_documents/06_images_ocr/02_easyocr_extraction.py
+++ b/unstructured_documents/06_images_ocr/02_easyocr_extraction.py
@@ -30,6 +30,7 @@ def check_easyocr_available() -> bool:
"""Check if EasyOCR is installed."""
try:
import easyocr # noqa: F401
+
return True
except ImportError:
print("=" * 60)
@@ -86,14 +87,16 @@ def extract_with_confidence(image_path: Path, min_confidence: float = 0.5) -> li
# bbox is a list of 4 points: [top-left, top-right, bottom-right, bottom-left]
top_left = bbox[0]
bottom_right = bbox[2]
- extracted.append({
- "text": text,
- "confidence": round(confidence, 3),
- "x_min": int(top_left[0]),
- "y_min": int(top_left[1]),
- "x_max": int(bottom_right[0]),
- "y_max": int(bottom_right[1]),
- })
+ extracted.append(
+ {
+ "text": text,
+ "confidence": round(confidence, 3),
+ "x_min": int(top_left[0]),
+ "y_min": int(top_left[1]),
+ "x_max": int(bottom_right[0]),
+ "y_max": int(bottom_right[1]),
+ }
+ )
return extracted
@@ -175,8 +178,7 @@ def combine_into_paragraphs(
regions = extract_with_confidence(simple_img, min_confidence=0.3)
print(f"\nRegions with confidence >= 0.3: {len(regions)}\n")
for r in regions:
- print(f" [{r['confidence']:.3f}] '{r['text']}' "
- f"(bbox: {r['x_min']},{r['y_min']} -> {r['x_max']},{r['y_max']})")
+ print(f" [{r['confidence']:.3f}] '{r['text']}' (bbox: {r['x_min']},{r['y_min']} -> {r['x_max']},{r['y_max']})")
# --- 3. Multi-paragraph document ---
print(f"\n{'=' * 60}")
diff --git a/unstructured_documents/07_email/01_email_parsing.py b/unstructured_documents/07_email/01_email_parsing.py
index 60594b1..f342c84 100644
--- a/unstructured_documents/07_email/01_email_parsing.py
+++ b/unstructured_documents/07_email/01_email_parsing.py
@@ -24,6 +24,7 @@
# HTML tag stripper using Python's built-in html.parser
# ---------------------------------------------------------------------------
+
class HTMLTextExtractor(HTMLParser):
"""Strip HTML tags and return plain text content."""
@@ -67,6 +68,7 @@ def strip_html_tags(html: str) -> str:
# Email parsing functions
# ---------------------------------------------------------------------------
+
def load_email(eml_path: Path):
"""Load and parse an .eml file into an email.message.EmailMessage object."""
with open(eml_path, "rb") as f:
@@ -230,7 +232,7 @@ def parse_email_complete(eml_path: Path) -> dict:
print(f" Size: {att['size_bytes']} bytes")
if att.get("content") and not att["content"].startswith("[Binary"):
preview = att["content"][:200]
- print(f" Content preview:")
+ print(" Content preview:")
for line in preview.split("\n"):
print(f" {line}")
if len(att["content"]) > 200:
diff --git a/unstructured_documents/07_email/02_structured_email_extraction.py b/unstructured_documents/07_email/02_structured_email_extraction.py
index ed69a16..9c790f9 100644
--- a/unstructured_documents/07_email/02_structured_email_extraction.py
+++ b/unstructured_documents/07_email/02_structured_email_extraction.py
@@ -16,16 +16,16 @@
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+# Re-use parsing utilities from the companion script.
+# Directory/file names start with digits, so we import via importlib.
+import importlib.util
+
from unstructured_documents.shared.chunking import (
- chunk_by_sentences,
chunk_by_recursive_split,
+ chunk_by_sentences,
preview_chunks,
)
-# Re-use parsing utilities from the companion script.
-# Directory/file names start with digits, so we import via importlib.
-import importlib.util
-
_parsing_spec = importlib.util.spec_from_file_location(
"email_parsing",
Path(__file__).parent / "01_email_parsing.py",
@@ -45,6 +45,7 @@
# Structured email representation
# ---------------------------------------------------------------------------
+
def build_email_record(eml_path: Path) -> dict:
"""
Parse an .eml file into a structured record optimised for RAG.
@@ -70,9 +71,7 @@ def build_email_record(eml_path: Path) -> dict:
attachment_names.append(att["filename"])
content = att.get("content", "")
if content and not content.startswith("[Binary"):
- attachments_text_parts.append(
- f"--- Attachment: {att['filename']} ---\n{content}"
- )
+ attachments_text_parts.append(f"--- Attachment: {att['filename']} ---\n{content}")
return {
"subject": headers.get("Subject", ""),
@@ -91,6 +90,7 @@ def build_email_record(eml_path: Path) -> dict:
# RAG-ready text block construction
# ---------------------------------------------------------------------------
+
def email_to_rag_text(record: dict, include_attachments: bool = True) -> str:
"""
Combine email metadata and body into a single RAG-ready text block.
@@ -127,6 +127,7 @@ def email_to_rag_text(record: dict, include_attachments: bool = True) -> str:
# Chunking strategies for emails
# ---------------------------------------------------------------------------
+
def chunk_per_email(records: list[dict]) -> list[str]:
"""
Strategy 1: One chunk per email.
@@ -149,30 +150,31 @@ def chunk_email_body_only(records: list[dict], chunk_size: int = 500) -> list[di
all_chunks = []
for record in records:
metadata_prefix = (
- f"Email from {record['from']} to {record['to']} on {record['date']}. "
- f"Subject: {record['subject']}.\n\n"
+ f"Email from {record['from']} to {record['to']} on {record['date']}. Subject: {record['subject']}.\n\n"
)
body_chunks = chunk_by_recursive_split(record["body_text"], chunk_size=chunk_size)
for i, chunk_text in enumerate(body_chunks):
- all_chunks.append({
- "text": metadata_prefix + chunk_text,
- "source_email": record["subject"],
- "chunk_index": i,
- "total_chunks": len(body_chunks),
- })
+ all_chunks.append(
+ {
+ "text": metadata_prefix + chunk_text,
+ "source_email": record["subject"],
+ "chunk_index": i,
+ "total_chunks": len(body_chunks),
+ }
+ )
# Attachment text as separate chunks
if record["attachments_text"]:
- att_chunks = chunk_by_recursive_split(
- record["attachments_text"], chunk_size=chunk_size
- )
+ att_chunks = chunk_by_recursive_split(record["attachments_text"], chunk_size=chunk_size)
for j, att_chunk in enumerate(att_chunks):
- all_chunks.append({
- "text": metadata_prefix + f"[Attachment content]\n{att_chunk}",
- "source_email": record["subject"],
- "chunk_index": len(body_chunks) + j,
- "total_chunks": len(body_chunks) + len(att_chunks),
- })
+ all_chunks.append(
+ {
+ "text": metadata_prefix + f"[Attachment content]\n{att_chunk}",
+ "source_email": record["subject"],
+ "chunk_index": len(body_chunks) + j,
+ "total_chunks": len(body_chunks) + len(att_chunks),
+ }
+ )
return all_chunks
@@ -244,15 +246,17 @@ def chunk_email_sentences(records: list[dict], sentences_per_chunk: int = 5) ->
per_email_chunks = chunk_per_email(records)
print(f"Total chunks: {len(per_email_chunks)}")
for i, chunk in enumerate(per_email_chunks):
- print(f" Chunk {i+1}: {len(chunk)} chars")
+ print(f" Chunk {i + 1}: {len(chunk)} chars")
# Strategy 2: Body chunking with metadata prefix
print("\n--- Strategy 2: Body chunking (500 char chunks) ---")
body_chunks = chunk_email_body_only(records, chunk_size=500)
print(f"Total chunks: {len(body_chunks)}")
for chunk_info in body_chunks[:5]:
- print(f" [{chunk_info['source_email']}] chunk {chunk_info['chunk_index']+1}/"
- f"{chunk_info['total_chunks']} ({len(chunk_info['text'])} chars)")
+ print(
+ f" [{chunk_info['source_email']}] chunk {chunk_info['chunk_index'] + 1}/"
+ f"{chunk_info['total_chunks']} ({len(chunk_info['text'])} chars)"
+ )
if len(body_chunks) > 5:
print(f" ... and {len(body_chunks) - 5} more chunks")
@@ -266,10 +270,16 @@ def chunk_email_sentences(records: list[dict], sentences_per_chunk: int = 5) ->
print(f"\n{'=' * 70}")
print("CHUNKING STRATEGY SUMMARY")
print("=" * 70)
- print(f" Per-email: {len(per_email_chunks):>3} chunks | "
- f"avg {sum(len(c) for c in per_email_chunks) // len(per_email_chunks):>5} chars/chunk")
+ print(
+ f" Per-email: {len(per_email_chunks):>3} chunks | "
+ f"avg {sum(len(c) for c in per_email_chunks) // len(per_email_chunks):>5} chars/chunk"
+ )
body_texts = [c["text"] for c in body_chunks]
- print(f" Body-chunked: {len(body_chunks):>3} chunks | "
- f"avg {sum(len(c) for c in body_texts) // len(body_texts):>5} chars/chunk")
- print(f" Sentence-based: {len(sentence_chunks):>3} chunks | "
- f"avg {sum(len(c) for c in sentence_chunks) // len(sentence_chunks):>5} chars/chunk")
+ print(
+ f" Body-chunked: {len(body_chunks):>3} chunks | "
+ f"avg {sum(len(c) for c in body_texts) // len(body_texts):>5} chars/chunk"
+ )
+ print(
+ f" Sentence-based: {len(sentence_chunks):>3} chunks | "
+ f"avg {sum(len(c) for c in sentence_chunks) // len(sentence_chunks):>5} chars/chunk"
+ )
diff --git a/unstructured_documents/07_email/sample_docs/generate_samples.py b/unstructured_documents/07_email/sample_docs/generate_samples.py
index 58da58e..9ea94d9 100644
--- a/unstructured_documents/07_email/sample_docs/generate_samples.py
+++ b/unstructured_documents/07_email/sample_docs/generate_samples.py
@@ -1,9 +1,9 @@
"""Generate sample .eml files for testing email extraction methods."""
-from email.mime.text import MIMEText
-from email.mime.multipart import MIMEMultipart
-from email.mime.base import MIMEBase
from email import encoders
+from email.mime.base import MIMEBase
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
from pathlib import Path
SAMPLE_DIR = Path(__file__).parent
diff --git a/unstructured_documents/08_markdown_txt/01_text_chunking_strategies.py b/unstructured_documents/08_markdown_txt/01_text_chunking_strategies.py
index f4ee9da..01b7ab1 100644
--- a/unstructured_documents/08_markdown_txt/01_text_chunking_strategies.py
+++ b/unstructured_documents/08_markdown_txt/01_text_chunking_strategies.py
@@ -17,8 +17,8 @@
from unstructured_documents.shared.chunking import (
chunk_by_characters,
- chunk_by_sentences,
chunk_by_recursive_split,
+ chunk_by_sentences,
preview_chunks,
)
@@ -29,6 +29,7 @@
# Additional chunking strategy: paragraph-based
# ---------------------------------------------------------------------------
+
def chunk_by_paragraphs(text: str, min_paragraph_length: int = 50) -> list[str]:
"""
Split text into chunks at paragraph boundaries (double newlines).
@@ -65,6 +66,7 @@ def chunk_by_paragraphs(text: str, min_paragraph_length: int = 50) -> list[str]:
# Comparison utilities
# ---------------------------------------------------------------------------
+
def compute_stats(chunks: list[str]) -> dict:
"""Compute summary statistics for a list of text chunks."""
if not chunks:
@@ -81,11 +83,13 @@ def compute_stats(chunks: list[str]) -> dict:
def print_stats(label: str, stats: dict):
"""Pretty-print chunk statistics."""
- print(f" {label:30s} "
- f"chunks={stats['count']:>3} "
- f"avg={stats['avg_chars']:>5} chars "
- f"min={stats['min_chars']:>4} "
- f"max={stats['max_chars']:>5}")
+ print(
+ f" {label:30s} "
+ f"chunks={stats['count']:>3} "
+ f"avg={stats['avg_chars']:>5} chars "
+ f"min={stats['min_chars']:>4} "
+ f"max={stats['max_chars']:>5}"
+ )
# ---------------------------------------------------------------------------
@@ -100,8 +104,7 @@ def print_stats(label: str, stats: dict):
sys.exit(1)
text = text_path.read_text()
- print(f"Loaded: {text_path.name} ({len(text)} chars, "
- f"~{len(text.split())} words)\n")
+ print(f"Loaded: {text_path.name} ({len(text)} chars, ~{len(text.split())} words)\n")
# ===================================================================
# Strategy 1: Fixed character chunking at different sizes
@@ -123,7 +126,7 @@ def print_stats(label: str, stats: dict):
print("\n Sample chunk (500 chars, chunk #2):")
if len(chunks_500) >= 2:
sample = chunks_500[1]
- print(f" \"{sample[:150]}...\"")
+ print(f' "{sample[:150]}..."')
# ===================================================================
# Strategy 2: Sentence-based chunking
@@ -143,7 +146,7 @@ def print_stats(label: str, stats: dict):
print("\n Sample chunk (5 sentences, chunk #1):")
if chunks_sent:
sample = chunks_sent[0]
- print(f" \"{sample[:200]}...\"")
+ print(f' "{sample[:200]}..."')
# ===================================================================
# Strategy 3: Paragraph-based chunking
@@ -161,7 +164,7 @@ def print_stats(label: str, stats: dict):
print("\n Sample chunk (paragraph #1):")
if chunks_para:
sample = chunks_para[0]
- print(f" \"{sample[:200]}...\"")
+ print(f' "{sample[:200]}..."')
# ===================================================================
# Strategy 4: Recursive splitting
@@ -181,7 +184,7 @@ def print_stats(label: str, stats: dict):
print("\n Sample chunk (recursive, 500 chars, chunk #1):")
if chunks_rec:
sample = chunks_rec[0]
- print(f" \"{sample[:200]}...\"")
+ print(f' "{sample[:200]}..."')
# ===================================================================
# Summary comparison
@@ -198,12 +201,11 @@ def print_stats(label: str, stats: dict):
]
print(f"\n {'Strategy':35s} {'Chunks':>6} {'Avg':>6} {'Min':>5} {'Max':>5}")
- print(f" {'-'*35} {'-'*6} {'-'*6} {'-'*5} {'-'*5}")
+ print(f" {'-' * 35} {'-' * 6} {'-' * 6} {'-' * 5} {'-' * 5}")
for label, chunks in strategies:
s = compute_stats(chunks)
- print(f" {label:35s} {s['count']:>6} {s['avg_chars']:>6} "
- f"{s['min_chars']:>5} {s['max_chars']:>5}")
+ print(f" {label:35s} {s['count']:>6} {s['avg_chars']:>6} {s['min_chars']:>5} {s['max_chars']:>5}")
# ===================================================================
# Preview each strategy
diff --git a/unstructured_documents/08_markdown_txt/02_markdown_parsing.py b/unstructured_documents/08_markdown_txt/02_markdown_parsing.py
index 2cc4c81..8f9b106 100644
--- a/unstructured_documents/08_markdown_txt/02_markdown_parsing.py
+++ b/unstructured_documents/08_markdown_txt/02_markdown_parsing.py
@@ -29,6 +29,7 @@
# AST-based extraction helpers
# ---------------------------------------------------------------------------
+
def parse_markdown_ast(md_text: str) -> list[dict]:
"""Parse markdown text into a mistune AST (list of token dicts)."""
markdown_parser = mistune.create_markdown(renderer=None)
@@ -82,10 +83,12 @@ def extract_code_blocks(tokens: list) -> list[dict]:
if isinstance(token, dict):
if token.get("type") == "code":
attrs = token.get("attrs", {})
- blocks.append({
- "language": attrs.get("info", "") or "",
- "code": token.get("raw", "") or token.get("text", ""),
- })
+ blocks.append(
+ {
+ "language": attrs.get("info", "") or "",
+ "code": token.get("raw", "") or token.get("text", ""),
+ }
+ )
for key in ("children", "body"):
if key in token and isinstance(token[key], list):
blocks.extend(extract_code_blocks(token[key]))
@@ -179,6 +182,7 @@ def _collect_text(tokens: list) -> str:
# Heading-aware chunking for markdown
# ---------------------------------------------------------------------------
+
def chunk_markdown_by_sections(md_text: str) -> list[dict]:
"""
Split markdown into chunks where each section (heading + content until
@@ -199,7 +203,6 @@ def extract_code_blocks_for_rag(md_text: str) -> list[dict]:
"""
ast = parse_markdown_ast(md_text)
code_blocks = extract_code_blocks(ast)
- headings = extract_headings(ast)
# Approximate: match code blocks to the nearest preceding heading
# by finding the heading positions in the raw text
@@ -227,13 +230,15 @@ def extract_code_blocks_for_rag(md_text: str) -> list[dict]:
else:
break
- result.append({
- "section": section,
- "language": block["language"],
- "code": code_text,
- "rag_text": f"Code example from section '{section}' "
- f"(language: {block['language'] or 'unknown'}):\n\n{code_text}",
- })
+ result.append(
+ {
+ "section": section,
+ "language": block["language"],
+ "code": code_text,
+ "rag_text": f"Code example from section '{section}' "
+ f"(language: {block['language'] or 'unknown'}):\n\n{code_text}",
+ }
+ )
return result
@@ -242,6 +247,7 @@ def extract_code_blocks_for_rag(md_text: str) -> list[dict]:
# Structured section extraction (for research papers)
# ---------------------------------------------------------------------------
+
def extract_research_sections(md_text: str) -> dict[str, str]:
"""
Extract named sections from a research paper-style markdown document.
@@ -292,7 +298,7 @@ def extract_research_sections(md_text: str) -> dict[str, str]:
paragraphs = extract_paragraphs(ast)
print(f"\n--- Paragraphs ({len(paragraphs)} total) ---")
for i, p in enumerate(paragraphs[:3]):
- print(f" [{i+1}] {p[:100]}...")
+ print(f" [{i + 1}] {p[:100]}...")
if len(paragraphs) > 3:
print(f" ... and {len(paragraphs) - 3} more")
@@ -301,21 +307,20 @@ def extract_research_sections(md_text: str) -> dict[str, str]:
print(f"\n--- Code Blocks ({len(code_blocks)} total) ---")
for i, cb in enumerate(code_blocks):
preview = cb["code"][:80].replace("\n", "\\n")
- print(f" [{i+1}] lang={cb['language'] or 'none'}: {preview}...")
+ print(f" [{i + 1}] lang={cb['language'] or 'none'}: {preview}...")
# --- Extract lists ---
lists = extract_lists(ast)
print(f"\n--- Lists ({len(lists)} total) ---")
for i, lst in enumerate(lists):
kind = "ordered" if lst["ordered"] else "unordered"
- print(f" [{i+1}] {kind}, {len(lst['items'])} items: "
- f"{lst['items'][0][:60]}...")
+ print(f" [{i + 1}] {kind}, {len(lst['items'])} items: {lst['items'][0][:60]}...")
# --- Extract tables ---
tables = extract_tables(ast)
print(f"\n--- Tables ({len(tables)} total) ---")
for i, tbl in enumerate(tables):
- print(f" [{i+1}] {len(tbl['headers'])} columns, {len(tbl['rows'])} rows")
+ print(f" [{i + 1}] {len(tbl['headers'])} columns, {len(tbl['rows'])} rows")
print(f" Headers: {tbl['headers']}")
if tbl["rows"]:
print(f" First row: {tbl['rows'][0]}")
@@ -341,8 +346,7 @@ def extract_research_sections(md_text: str) -> dict[str, str]:
code_rag_chunks = extract_code_blocks_for_rag(tech_md)
for i, chunk in enumerate(code_rag_chunks):
- print(f" [{i+1}] Section: '{chunk['section']}', "
- f"Language: {chunk['language'] or 'none'}")
+ print(f" [{i + 1}] Section: '{chunk['section']}', Language: {chunk['language'] or 'none'}")
preview = chunk["code"][:100].replace("\n", "\\n")
print(f" Code: {preview}...")
diff --git a/unstructured_documents/08_markdown_txt/03_semantic_chunking.py b/unstructured_documents/08_markdown_txt/03_semantic_chunking.py
index f6419db..574672e 100644
--- a/unstructured_documents/08_markdown_txt/03_semantic_chunking.py
+++ b/unstructured_documents/08_markdown_txt/03_semantic_chunking.py
@@ -22,8 +22,8 @@
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from unstructured_documents.shared.chunking import (
- chunk_by_sentences,
chunk_by_recursive_split,
+ chunk_by_sentences,
preview_chunks,
)
@@ -34,6 +34,7 @@
# Sentence tokenization
# ---------------------------------------------------------------------------
+
def tokenize_sentences(text: str) -> list[str]:
"""
Split text into individual sentences.
@@ -43,7 +44,7 @@ def tokenize_sentences(text: str) -> list[str]:
better accuracy with abbreviations, decimal numbers, etc.
"""
# Split on .!? followed by whitespace, but not inside common abbreviations
- pattern = r'(?<=[.!?])\s+(?=[A-Z])'
+ pattern = r"(?<=[.!?])\s+(?=[A-Z])"
sentences = re.split(pattern, text)
return [s.strip() for s in sentences if s.strip()]
@@ -52,6 +53,7 @@ def tokenize_sentences(text: str) -> list[str]:
# Sliding window chunking
# ---------------------------------------------------------------------------
+
def chunk_sliding_window(
text: str,
window_size: int = 5,
@@ -76,7 +78,7 @@ def chunk_sliding_window(
chunks = []
for i in range(0, len(sentences) - window_size + 1, stride):
- window = sentences[i:i + window_size]
+ window = sentences[i : i + window_size]
chunks.append(" ".join(window))
# Ensure we capture the last sentences if they were not fully covered
@@ -92,6 +94,7 @@ def chunk_sliding_window(
# Paragraph-based semantic chunking
# ---------------------------------------------------------------------------
+
def chunk_by_topic_paragraphs(text: str, max_chunk_size: int = 800) -> list[str]:
"""
Paragraph-based chunking that groups related paragraphs together.
@@ -130,6 +133,7 @@ def chunk_by_topic_paragraphs(text: str, max_chunk_size: int = 800) -> list[str]
# Semantic coherence analysis
# ---------------------------------------------------------------------------
+
def analyze_chunk_coherence(chunks: list[str]) -> list[dict]:
"""
Analyze the semantic coherence of each chunk using simple heuristics.
@@ -153,19 +157,21 @@ def analyze_chunk_coherence(chunks: list[str]) -> list[dict]:
ends_complete = chunk.rstrip()[-1] in ".!?" if chunk.rstrip() else False
# Count unique "topic words" (non-stopword words with 4+ chars)
- words = re.findall(r'\b[a-zA-Z]{4,}\b', chunk.lower())
+ words = re.findall(r"\b[a-zA-Z]{4,}\b", chunk.lower())
unique_words = set(words)
# Higher ratio = more diverse vocabulary = potentially less focused
vocab_diversity = len(unique_words) / len(words) if words else 0
- results.append({
- "chunk_index": i,
- "length": len(chunk),
- "sentence_count": len(sentences),
- "starts_complete": starts_complete,
- "ends_complete": ends_complete,
- "vocab_diversity": round(vocab_diversity, 3),
- })
+ results.append(
+ {
+ "chunk_index": i,
+ "length": len(chunk),
+ "sentence_count": len(sentences),
+ "starts_complete": starts_complete,
+ "ends_complete": ends_complete,
+ "vocab_diversity": round(vocab_diversity, 3),
+ }
+ )
return results
@@ -180,12 +186,9 @@ def print_coherence_report(label: str, chunks: list[str]):
print(f"\n {label}:")
print(f" Total chunks: {len(chunks)}")
- print(f" Complete sentence starts: {complete_starts}/{len(chunks)} "
- f"({100*complete_starts//len(chunks)}%)")
- print(f" Complete sentence ends: {complete_ends}/{len(chunks)} "
- f"({100*complete_ends//len(chunks)}%)")
- print(f" Avg vocab diversity: {avg_diversity:.3f} "
- f"(lower = more focused)")
+ print(f" Complete sentence starts: {complete_starts}/{len(chunks)} ({100 * complete_starts // len(chunks)}%)")
+ print(f" Complete sentence ends: {complete_ends}/{len(chunks)} ({100 * complete_ends // len(chunks)}%)")
+ print(f" Avg vocab diversity: {avg_diversity:.3f} (lower = more focused)")
avg_len = sum(len(c) for c in chunks) // len(chunks) if chunks else 0
print(f" Avg chunk length: {avg_len} chars")
@@ -215,9 +218,9 @@ def print_coherence_report(label: str, chunks: list[str]):
sentences = tokenize_sentences(text)
print(f" Total sentences: {len(sentences)}")
print(f" Avg sentence length: {sum(len(s) for s in sentences) // len(sentences)} chars")
- print(f"\n First 5 sentences:")
+ print("\n First 5 sentences:")
for i, sent in enumerate(sentences[:5]):
- print(f" [{i+1}] {sent[:100]}{'...' if len(sent) > 100 else ''}")
+ print(f" [{i + 1}] {sent[:100]}{'...' if len(sent) > 100 else ''}")
# ===================================================================
# 2. Sliding window chunking
@@ -294,24 +297,23 @@ def print_coherence_report(label: str, chunks: list[str]):
# Show the 3rd paragraph and check how each strategy handles it
target_para = paragraphs[2]
target_start = target_para[:50]
- print(f" Target passage (paragraph 3, starts with):")
- print(f" \"{target_start}...\"\n")
+ print(" Target passage (paragraph 3, starts with):")
+ print(f' "{target_start}..."\n')
for label, chunks in all_strategies.items():
# Find which chunk(s) contain the start of this paragraph
- matching = [
- (i, c) for i, c in enumerate(chunks)
- if target_start in c
- ]
+ matching = [(i, c) for i, c in enumerate(chunks) if target_start in c]
if matching:
idx, chunk = matching[0]
# Check if the full paragraph is in this chunk
full_match = target_para in chunk
print(f" {label}:")
- print(f" Found in chunk {idx+1}/{len(chunks)}, "
- f"full paragraph preserved: {'Yes' if full_match else 'No'}")
- print(f" Chunk preview: \"{chunk[:100]}...\"")
+ print(
+ f" Found in chunk {idx + 1}/{len(chunks)}, "
+ f"full paragraph preserved: {'Yes' if full_match else 'No'}"
+ )
+ print(f' Chunk preview: "{chunk[:100]}..."')
else:
print(f" {label}:")
- print(f" Target paragraph split across chunks (partial match)")
+ print(" Target paragraph split across chunks (partial match)")
print()
diff --git a/unstructured_documents/08_markdown_txt/sample_docs/generate_samples.py b/unstructured_documents/08_markdown_txt/sample_docs/generate_samples.py
index 26ce668..ca4ebe7 100644
--- a/unstructured_documents/08_markdown_txt/sample_docs/generate_samples.py
+++ b/unstructured_documents/08_markdown_txt/sample_docs/generate_samples.py
@@ -12,17 +12,24 @@ def generate_technical_doc():
## Introduction
-REST (Representational State Transfer) is an architectural style for designing networked applications. RESTful APIs have become the standard way for web services to communicate, powering everything from mobile apps to microservices architectures. This guide covers the essential concepts, best practices, and implementation patterns for building robust REST APIs.
+REST (Representational State Transfer) is an architectural style for designing networked applications. RESTful APIs \
+have become the standard way for web services to communicate, powering everything from mobile apps to microservices \
+architectures. This guide covers the essential concepts, best practices, and implementation patterns for building \
+robust REST APIs.
## Core Principles
REST is built on several fundamental principles that guide API design:
-- **Statelessness**: Each request contains all the information needed to process it. The server does not store client session state between requests.
-- **Client-Server Separation**: The client and server are independent. The client does not need to know about data storage, and the server does not need to know about the user interface.
-- **Uniform Interface**: Resources are identified by URIs, manipulated through representations, and self-descriptive messages.
+- **Statelessness**: Each request contains all the information needed to process it. The server does not store client \
+session state between requests.
+- **Client-Server Separation**: The client and server are independent. The client does not need to know about data \
+storage, and the server does not need to know about the user interface.
+- **Uniform Interface**: Resources are identified by URIs, manipulated through representations, and self-descriptive \
+messages.
- **Cacheability**: Responses must define themselves as cacheable or non-cacheable to improve performance.
-- **Layered System**: The architecture can be composed of hierarchical layers, with each layer only knowing about the layer it interacts with.
+- **Layered System**: The architecture can be composed of hierarchical layers, with each layer only knowing about the \
+layer it interacts with.
## HTTP Methods
@@ -38,7 +45,8 @@ def generate_technical_doc():
### Idempotency
-An operation is **idempotent** if performing it multiple times produces the same result as performing it once. GET, PUT, and DELETE are idempotent by design. POST is not, because calling it twice creates two resources.
+An operation is **idempotent** if performing it multiple times produces the same result as performing it once. GET, \
+PUT, and DELETE are idempotent by design. POST is not, because calling it twice creates two resources.
## Resource Design
@@ -173,25 +181,44 @@ def generate_research_paper():
## Abstract
-Retrieval-Augmented Generation (RAG) systems rely on splitting documents into chunks for embedding and retrieval. The choice of chunk size significantly impacts retrieval accuracy, answer quality, and system latency. In this paper, we conduct a systematic evaluation of chunk sizes ranging from 128 to 2048 tokens across four domain-specific datasets. Our experiments reveal that the optimal chunk size is highly domain-dependent, with technical documentation benefiting from larger chunks (512-1024 tokens) while conversational datasets perform best with smaller chunks (128-256 tokens). We propose an adaptive chunking framework that selects chunk sizes based on document characteristics, achieving a 15.3% improvement in answer accuracy over fixed-size approaches.
+Retrieval-Augmented Generation (RAG) systems rely on splitting documents into chunks for embedding and retrieval. The \
+choice of chunk size significantly impacts retrieval accuracy, answer quality, and system latency. In this paper, we \
+conduct a systematic evaluation of chunk sizes ranging from 128 to 2048 tokens across four domain-specific datasets. \
+Our experiments reveal that the optimal chunk size is highly domain-dependent, with technical documentation benefiting \
+from larger chunks (512-1024 tokens) while conversational datasets perform best with smaller chunks (128-256 tokens). \
+We propose an adaptive chunking framework that selects chunk sizes based on document characteristics, achieving a \
+15.3% \
+improvement in answer accuracy over fixed-size approaches.
## Introduction
-The emergence of large language models (LLMs) has transformed natural language processing, yet these models face fundamental limitations when dealing with knowledge that is not present in their training data. Retrieval-Augmented Generation addresses this limitation by augmenting LLM inputs with relevant information retrieved from external knowledge bases.
+The emergence of large language models (LLMs) has transformed natural language processing, yet these models face \
+fundamental limitations when dealing with knowledge that is not present in their training data. Retrieval-Augmented \
+Generation addresses this limitation by augmenting LLM inputs with relevant information retrieved from external \
+knowledge bases.
-A critical but often overlooked component of RAG systems is the chunking strategy -- how source documents are divided into segments for embedding and retrieval. The chunk size directly affects multiple aspects of system performance. Chunks that are too small may lack sufficient context for meaningful retrieval, while chunks that are too large may introduce noise and exceed the context window limitations of embedding models.
+A critical but often overlooked component of RAG systems is the chunking strategy -- how source documents are divided \
+into segments for embedding and retrieval. The chunk size directly affects multiple aspects of system performance. \
+Chunks that are too small may lack sufficient context for meaningful retrieval, while chunks that are too large may \
+introduce noise and exceed the context window limitations of embedding models.
-Despite its importance, chunk size selection is typically treated as a hyperparameter to be tuned empirically, with little guidance available to practitioners. Most implementations default to arbitrary sizes (e.g., 512 or 1000 characters) without considering the characteristics of the source documents or the nature of expected queries.
+Despite its importance, chunk size selection is typically treated as a hyperparameter to be tuned empirically, with \
+little guidance available to practitioners. Most implementations default to arbitrary sizes (e.g., 512 or 1000 \
+characters) without considering the characteristics of the source documents or the nature of expected queries.
-This paper makes three contributions. First, we present a comprehensive empirical study of chunk size effects across diverse domains. Second, we identify document characteristics that correlate with optimal chunk sizes. Third, we propose an adaptive chunking framework that automatically selects appropriate chunk sizes based on document analysis.
+This paper makes three contributions. First, we present a comprehensive empirical study of chunk size effects across \
+diverse domains. Second, we identify document characteristics that correlate with optimal chunk sizes. Third, we \
+propose an adaptive chunking framework that automatically selects appropriate chunk sizes based on document analysis.
## Methodology
### Datasets
-We evaluate our approach on four datasets representing different document types commonly encountered in RAG applications:
+We evaluate our approach on four datasets representing different document types commonly encountered in RAG \
+applications:
-1. **TechDocs**: 5,000 technical documentation pages from open-source software projects, containing code examples, API references, and tutorials.
+1. **TechDocs**: 5,000 technical documentation pages from open-source software projects, containing code examples, API \
+references, and tutorials.
2. **LegalCorpus**: 2,500 legal documents including contracts, regulations, and court opinions.
3. **MedicalQA**: 3,200 medical articles and clinical guidelines from PubMed.
4. **ConversationLogs**: 10,000 customer support conversations from an enterprise help desk system.
@@ -216,7 +243,10 @@ def generate_research_paper():
### Experimental Setup
-All experiments use the same embedding model (text-embedding-3-small) and LLM (GPT-4) to isolate the effect of chunking. We use cosine similarity for retrieval with a FAISS index. Each configuration is evaluated on 500 queries per dataset, with three independent runs to account for variance.
+All experiments use the same embedding model (text-embedding-3-small) and LLM (GPT-4) to isolate the effect of \
+chunking. We use cosine similarity for retrieval with a FAISS index. Each configuration is evaluated on 500 queries \
+per \
+dataset, with three independent runs to account for variance.
## Results
@@ -224,25 +254,51 @@ def generate_research_paper():
Our experiments reveal a clear relationship between chunk size and performance that varies by domain:
-For TechDocs, retrieval precision peaks at 512 tokens (0.82) and remains stable up to 1024 tokens (0.80), dropping sharply at 2048 tokens (0.61). Answer accuracy follows a similar pattern, with the best scores at 512-768 tokens. The presence of code blocks in technical documentation means that smaller chunks often split code examples, losing critical context.
+For TechDocs, retrieval precision peaks at 512 tokens (0.82) and remains stable up to 1024 tokens (0.80), dropping \
+sharply at 2048 tokens (0.61). Answer accuracy follows a similar pattern, with the best scores at 512-768 tokens. The \
+presence of code blocks in technical documentation means that smaller chunks often split code examples, losing \
+critical \
+context.
-For LegalCorpus, larger chunks consistently outperform smaller ones. Precision at 1024 tokens (0.78) is significantly higher than at 128 tokens (0.52). Legal text relies heavily on cross-references within paragraphs, and splitting these references across chunks degrades retrieval quality.
+For LegalCorpus, larger chunks consistently outperform smaller ones. Precision at 1024 tokens (0.78) is significantly \
+higher than at 128 tokens (0.52). Legal text relies heavily on cross-references within paragraphs, and splitting these \
+references across chunks degrades retrieval quality.
-MedicalQA shows optimal performance at 256-512 tokens. Medical text is information-dense, and smaller chunks allow more precise retrieval of specific facts. However, chunks below 128 tokens lose the clinical context needed for accurate answers.
+MedicalQA shows optimal performance at 256-512 tokens. Medical text is information-dense, and smaller chunks allow \
+more \
+precise retrieval of specific facts. However, chunks below 128 tokens lose the clinical context needed for accurate \
+answers.
-ConversationLogs perform best at 128-256 tokens, reflecting the short, turn-based nature of support conversations. Each turn typically contains a discrete piece of information, and larger chunks introduce noise from unrelated conversation turns.
+ConversationLogs perform best at 128-256 tokens, reflecting the short, turn-based nature of support conversations. \
+Each \
+turn typically contains a discrete piece of information, and larger chunks introduce noise from unrelated conversation \
+turns.
### Chunking Strategy Comparison
-Across all datasets and sizes, recursive chunking consistently outperforms fixed-character chunking by 8-12% on retrieval precision. Sentence-based chunking performs comparably to recursive chunking for well-structured documents but falls behind for documents with inconsistent formatting. Semantic chunking achieves the best results on ConversationLogs but is computationally expensive, adding 200-400ms of preprocessing time per document.
+Across all datasets and sizes, recursive chunking consistently outperforms fixed-character chunking by 8-12% on \
+retrieval precision. Sentence-based chunking performs comparably to recursive chunking for well-structured documents \
+but falls behind for documents with inconsistent formatting. Semantic chunking achieves the best results on \
+ConversationLogs but is computationally expensive, adding 200-400ms of preprocessing time per document.
## Conclusion
-Our study demonstrates that chunk size selection is a critical design decision in RAG systems that should not be treated as a simple hyperparameter. The optimal chunk size depends on document type, content density, and query patterns. We recommend that practitioners begin with recursive chunking at 512 tokens as a reasonable default, then adjust based on domain-specific evaluation.
-
-The adaptive chunking framework we propose provides an automated approach to chunk size selection that eliminates the need for manual tuning. By analyzing document characteristics such as average sentence length, paragraph structure, and vocabulary density, the framework selects an appropriate chunk size for each document, achieving consistent improvements across all evaluated domains.
-
-Future work will explore dynamic chunk sizes within a single document, where different sections may benefit from different chunk sizes based on their content characteristics. We also plan to investigate the interaction between chunk size and different embedding models, as model architecture and training data may influence the optimal chunking strategy.
+Our study demonstrates that chunk size selection is a critical design decision in RAG systems that should not be \
+treated as a simple hyperparameter. The optimal chunk size depends on document type, content density, and query \
+patterns. We recommend that practitioners begin with recursive chunking at 512 tokens as a reasonable default, then \
+adjust based on domain-specific evaluation.
+
+The adaptive chunking framework we propose provides an automated approach to chunk size selection that eliminates the \
+need for manual tuning. By analyzing document characteristics such as average sentence length, paragraph structure, \
+and \
+vocabulary density, the framework selects an appropriate chunk size for each document, achieving consistent \
+improvements across all evaluated domains.
+
+Future work will explore dynamic chunk sizes within a single document, where different sections may benefit from \
+different chunk sizes based on their content characteristics. We also plan to investigate the interaction between \
+chunk \
+size and different embedding models, as model architecture and training data may influence the optimal chunking \
+strategy.
"""
(SAMPLE_DIR / "research_paper.md").write_text(md)
print("Generated: research_paper.md")
@@ -253,35 +309,105 @@ def generate_plain_text():
txt = """\
The History of Computing: From Mechanical Calculators to Artificial Intelligence
-The story of computing stretches back thousands of years, beginning with the earliest counting devices and culminating in the powerful artificial intelligence systems we use today. Understanding this history provides essential context for appreciating how far the field has come and where it might be heading.
-
-The earliest computing devices were simple mechanical tools designed to assist with arithmetic. The abacus, which originated in ancient Mesopotamia around 2400 BCE, was one of the first widely used calculating tools. It remained the primary computational aid for merchants and scholars for millennia, and variations of it are still used in some parts of the world today. The abacus demonstrated a fundamental principle that would persist throughout computing history: the use of physical representations to model abstract mathematical concepts.
-
-The seventeenth century saw the first mechanical calculators. Blaise Pascal invented the Pascaline in 1642, a device that could perform addition and subtraction through a system of interlocking gears. A few decades later, Gottfried Wilhelm Leibniz improved upon Pascal's design with his Step Reckoner, which could also perform multiplication and division. These devices were remarkable engineering achievements, but they were expensive, fragile, and limited to basic arithmetic operations.
-
-The conceptual foundations of modern computing were laid in the nineteenth century by Charles Babbage, an English mathematician and inventor. Babbage designed two groundbreaking machines: the Difference Engine, intended for computing mathematical tables, and the Analytical Engine, a general-purpose computing machine. The Analytical Engine, though never completed during Babbage's lifetime, contained many features found in modern computers, including a processing unit, memory, and the ability to be programmed using punched cards. Ada Lovelace, who worked with Babbage, wrote what is widely considered the first computer program -- an algorithm for the Analytical Engine to compute Bernoulli numbers. Her insight that the machine could manipulate symbols beyond mere numbers foreshadowed the versatility of modern computers.
-
-The late nineteenth and early twentieth centuries brought the development of electromechanical computing devices. Herman Hollerith invented a tabulating machine that used punched cards to process data for the 1890 United States Census. His company eventually became IBM, which would dominate the computing industry for decades. The punched card system proved so effective that it remained in widespread use well into the 1970s.
-
-The true dawn of electronic computing arrived during World War II. The war created an urgent need for rapid calculations -- breaking enemy codes, computing artillery firing tables, and designing nuclear weapons. The British Colossus machines, built at Bletchley Park starting in 1943, were among the first electronic digital computers, designed specifically for codebreaking. In the United States, the ENIAC (Electronic Numerical Integrator and Computer) became operational in 1945. ENIAC was a massive machine, weighing over 27 tons and containing more than 17,000 vacuum tubes. Despite its size, it could perform calculations thousands of times faster than any previous device.
-
-The postwar period saw rapid advances in computing technology. The invention of the transistor at Bell Labs in 1947 by John Bardeen, Walter Brattain, and William Shockley transformed the field. Transistors were smaller, more reliable, and consumed far less power than vacuum tubes. By the late 1950s, transistor-based computers were replacing their vacuum-tube predecessors, making computing more accessible and affordable.
-
-The development of integrated circuits in the late 1950s and early 1960s represented another quantum leap. Jack Kilby at Texas Instruments and Robert Noyce at Fairchild Semiconductor independently developed methods for fabricating multiple transistors on a single piece of semiconductor material. This innovation dramatically reduced the size and cost of computing components while increasing their speed and reliability. Gordon Moore, co-founder of Intel, observed in 1965 that the number of transistors on integrated circuits was doubling approximately every two years -- an observation that became known as Moore's Law and has held roughly true for over five decades.
-
-The 1970s brought computing to the masses with the development of the microprocessor and the personal computer. The Intel 4004, released in 1971, was the first commercially available microprocessor, integrating the entire central processing unit of a computer onto a single chip. This development paved the way for personal computers. The Altair 8800, introduced in 1975, is often considered the first commercially successful personal computer, though it required assembly and had limited capabilities. The Apple II, released in 1977, and the IBM PC, introduced in 1981, brought personal computing to homes and offices worldwide.
-
-The 1980s and 1990s saw the rise of software as a driving force in computing. Operating systems like Microsoft Windows and Apple's Macintosh OS made computers accessible to non-technical users through graphical user interfaces. The development of the World Wide Web by Tim Berners-Lee in 1989, built on top of the Internet infrastructure that had been growing since the 1960s, transformed computing from a standalone activity into a connected experience. Email, web browsing, and online commerce changed how people communicated, accessed information, and conducted business.
-
-The twenty-first century has been characterized by the explosion of mobile computing, cloud services, and artificial intelligence. The introduction of the iPhone in 2007 and subsequent smartphones put powerful computers in billions of pockets worldwide. Cloud computing, pioneered by companies like Amazon Web Services, Google Cloud, and Microsoft Azure, shifted computing resources from local machines to massive data centers, enabling on-demand access to virtually unlimited processing power and storage.
-
-Perhaps the most significant development of recent years has been the rapid advancement of artificial intelligence and machine learning. Neural networks, a concept dating back to the 1940s, have experienced a renaissance thanks to increased computational power, vast amounts of training data, and algorithmic improvements. Deep learning techniques have achieved remarkable results in image recognition, natural language processing, game playing, and scientific discovery. The development of large language models, trained on enormous text corpora, has demonstrated capabilities that were considered science fiction just a decade ago.
-
-The field of computing continues to evolve at a remarkable pace. Quantum computing promises to solve certain problems that are intractable for classical computers. Edge computing brings processing closer to data sources, reducing latency for real-time applications. Advances in hardware design, from specialized AI accelerators to neuromorphic chips that mimic brain architecture, continue to push the boundaries of what is computationally feasible.
-
-Looking back over the history of computing, several themes emerge. First, the trend toward miniaturization and increased capability has been remarkably consistent, from room-sized machines to pocket-sized devices millions of times more powerful. Second, each major advance in hardware has enabled new categories of software applications that were previously impractical. Third, computing has progressively moved from being a specialized tool for scientists and engineers to being an integral part of everyday life for billions of people. Finally, the pace of change continues to accelerate, with each decade bringing transformations that would have seemed impossible to previous generations.
-
-As we look toward the future, the history of computing reminds us that the most impactful developments often come from unexpected directions. The inventors of the transistor could not have imagined smartphones, and the creators of the Internet did not foresee social media. Whatever comes next in computing will likely be equally surprising and transformative.
+The story of computing stretches back thousands of years, beginning with the earliest counting devices and culminating \
+in the powerful artificial intelligence systems we use today. Understanding this history provides essential context \
+for \
+appreciating how far the field has come and where it might be heading.
+
+The earliest computing devices were simple mechanical tools designed to assist with arithmetic. The abacus, which \
+originated in ancient Mesopotamia around 2400 BCE, was one of the first widely used calculating tools. It remained the \
+primary computational aid for merchants and scholars for millennia, and variations of it are still used in some parts \
+of the world today. The abacus demonstrated a fundamental principle that would persist throughout computing history: \
+the use of physical representations to model abstract mathematical concepts.
+
+The seventeenth century saw the first mechanical calculators. Blaise Pascal invented the Pascaline in 1642, a device \
+that could perform addition and subtraction through a system of interlocking gears. A few decades later, Gottfried \
+Wilhelm Leibniz improved upon Pascal's design with his Step Reckoner, which could also perform multiplication and \
+division. These devices were remarkable engineering achievements, but they were expensive, fragile, and limited to \
+basic arithmetic operations.
+
+The conceptual foundations of modern computing were laid in the nineteenth century by Charles Babbage, an English \
+mathematician and inventor. Babbage designed two groundbreaking machines: the Difference Engine, intended for \
+computing \
+mathematical tables, and the Analytical Engine, a general-purpose computing machine. The Analytical Engine, though \
+never completed during Babbage's lifetime, contained many features found in modern computers, including a processing \
+unit, memory, and the ability to be programmed using punched cards. Ada Lovelace, who worked with Babbage, wrote what \
+is widely considered the first computer program -- an algorithm for the Analytical Engine to compute Bernoulli \
+numbers. \
+Her insight that the machine could manipulate symbols beyond mere numbers foreshadowed the versatility of modern \
+computers.
+
+The late nineteenth and early twentieth centuries brought the development of electromechanical computing devices. \
+Herman Hollerith invented a tabulating machine that used punched cards to process data for the 1890 United States \
+Census. His company eventually became IBM, which would dominate the computing industry for decades. The punched card \
+system proved so effective that it remained in widespread use well into the 1970s.
+
+The true dawn of electronic computing arrived during World War II. The war created an urgent need for rapid \
+calculations -- breaking enemy codes, computing artillery firing tables, and designing nuclear weapons. The British \
+Colossus machines, built at Bletchley Park starting in 1943, were among the first electronic digital computers, \
+designed specifically for codebreaking. In the United States, the ENIAC (Electronic Numerical Integrator and Computer) \
+became operational in 1945. ENIAC was a massive machine, weighing over 27 tons and containing more than 17,000 vacuum \
+tubes. Despite its size, it could perform calculations thousands of times faster than any previous device.
+
+The postwar period saw rapid advances in computing technology. The invention of the transistor at Bell Labs in 1947 by \
+John Bardeen, Walter Brattain, and William Shockley transformed the field. Transistors were smaller, more reliable, \
+and \
+consumed far less power than vacuum tubes. By the late 1950s, transistor-based computers were replacing their \
+vacuum-tube predecessors, making computing more accessible and affordable.
+
+The development of integrated circuits in the late 1950s and early 1960s represented another quantum leap. Jack Kilby \
+at Texas Instruments and Robert Noyce at Fairchild Semiconductor independently developed methods for fabricating \
+multiple transistors on a single piece of semiconductor material. This innovation dramatically reduced the size and \
+cost of computing components while increasing their speed and reliability. Gordon Moore, co-founder of Intel, observed \
+in 1965 that the number of transistors on integrated circuits was doubling approximately every two years -- an \
+observation that became known as Moore's Law and has held roughly true for over five decades.
+
+The 1970s brought computing to the masses with the development of the microprocessor and the personal computer. The \
+Intel 4004, released in 1971, was the first commercially available microprocessor, integrating the entire central \
+processing unit of a computer onto a single chip. This development paved the way for personal computers. The Altair \
+8800, introduced in 1975, is often considered the first commercially successful personal computer, though it required \
+assembly and had limited capabilities. The Apple II, released in 1977, and the IBM PC, introduced in 1981, brought \
+personal computing to homes and offices worldwide.
+
+The 1980s and 1990s saw the rise of software as a driving force in computing. Operating systems like Microsoft Windows \
+and Apple's Macintosh OS made computers accessible to non-technical users through graphical user interfaces. The \
+development of the World Wide Web by Tim Berners-Lee in 1989, built on top of the Internet infrastructure that had \
+been \
+growing since the 1960s, transformed computing from a standalone activity into a connected experience. Email, web \
+browsing, and online commerce changed how people communicated, accessed information, and conducted business.
+
+The twenty-first century has been characterized by the explosion of mobile computing, cloud services, and artificial \
+intelligence. The introduction of the iPhone in 2007 and subsequent smartphones put powerful computers in billions of \
+pockets worldwide. Cloud computing, pioneered by companies like Amazon Web Services, Google Cloud, and Microsoft \
+Azure, \
+shifted computing resources from local machines to massive data centers, enabling on-demand access to virtually \
+unlimited processing power and storage.
+
+Perhaps the most significant development of recent years has been the rapid advancement of artificial intelligence and \
+machine learning. Neural networks, a concept dating back to the 1940s, have experienced a renaissance thanks to \
+increased computational power, vast amounts of training data, and algorithmic improvements. Deep learning techniques \
+have achieved remarkable results in image recognition, natural language processing, game playing, and scientific \
+discovery. The development of large language models, trained on enormous text corpora, has demonstrated capabilities \
+that were considered science fiction just a decade ago.
+
+The field of computing continues to evolve at a remarkable pace. Quantum computing promises to solve certain problems \
+that are intractable for classical computers. Edge computing brings processing closer to data sources, reducing \
+latency \
+for real-time applications. Advances in hardware design, from specialized AI accelerators to neuromorphic chips that \
+mimic brain architecture, continue to push the boundaries of what is computationally feasible.
+
+Looking back over the history of computing, several themes emerge. First, the trend toward miniaturization and \
+increased capability has been remarkably consistent, from room-sized machines to pocket-sized devices millions of \
+times \
+more powerful. Second, each major advance in hardware has enabled new categories of software applications that were \
+previously impractical. Third, computing has progressively moved from being a specialized tool for scientists and \
+engineers to being an integral part of everyday life for billions of people. Finally, the pace of change continues to \
+accelerate, with each decade bringing transformations that would have seemed impossible to previous generations.
+
+As we look toward the future, the history of computing reminds us that the most impactful developments often come from \
+unexpected directions. The inventors of the transistor could not have imagined smartphones, and the creators of the \
+Internet did not foresee social media. Whatever comes next in computing will likely be equally surprising and \
+transformative.
"""
(SAMPLE_DIR / "plain_text.txt").write_text(txt)
print("Generated: plain_text.txt")
@@ -292,7 +418,8 @@ def generate_structured_notes():
txt = """\
Topic: Machine Learning Fundamentals
-Machine learning is a subset of artificial intelligence that enables systems to learn from data without being explicitly programmed. It has become one of the most important areas of computer science.
+Machine learning is a subset of artificial intelligence that enables systems to learn from data without being \
+explicitly programmed. It has become one of the most important areas of computer science.
Key Concepts:
- Supervised learning uses labeled training data to learn a mapping function
@@ -311,7 +438,8 @@ def generate_structured_notes():
Topic: Data Preprocessing
-Data preprocessing is a critical step that can significantly impact model performance. Raw data often contains noise, missing values, and inconsistencies that must be addressed before training.
+Data preprocessing is a critical step that can significantly impact model performance. Raw data often contains noise, \
+missing values, and inconsistencies that must be addressed before training.
Steps in Data Preprocessing:
- Data cleaning: handle missing values, remove duplicates, fix errors
@@ -328,7 +456,8 @@ def generate_structured_notes():
Topic: Model Evaluation
-Evaluating model performance correctly is essential for building reliable systems. Different metrics are appropriate for different types of problems.
+Evaluating model performance correctly is essential for building reliable systems. Different metrics are appropriate \
+for different types of problems.
Classification Metrics:
- Accuracy: overall proportion of correct predictions
@@ -351,7 +480,8 @@ def generate_structured_notes():
Topic: Deployment Considerations
-Moving a model from development to production requires careful planning and infrastructure. Many models that perform well in testing fail to deliver value in production.
+Moving a model from development to production requires careful planning and infrastructure. Many models that perform \
+well in testing fail to deliver value in production.
Key Challenges:
- Model serving: choosing between batch and real-time inference
diff --git a/unstructured_documents/09_epub/01_ebooklib_extraction.py b/unstructured_documents/09_epub/01_ebooklib_extraction.py
index 0d40b00..8a2790c 100644
--- a/unstructured_documents/09_epub/01_ebooklib_extraction.py
+++ b/unstructured_documents/09_epub/01_ebooklib_extraction.py
@@ -59,12 +59,14 @@ def list_items(book: epub.EpubBook) -> list[dict]:
"""
items = []
for item in book.get_items():
- items.append({
- "id": item.get_id(),
- "name": item.get_name(),
- "type": item.get_type(),
- "is_document": item.get_type() == ITEM_DOCUMENT,
- })
+ items.append(
+ {
+ "id": item.get_id(),
+ "name": item.get_name(),
+ "type": item.get_type(),
+ "is_document": item.get_type() == ITEM_DOCUMENT,
+ }
+ )
return items
diff --git a/unstructured_documents/09_epub/02_epub_to_text.py b/unstructured_documents/09_epub/02_epub_to_text.py
index e4dc5c9..4201d12 100644
--- a/unstructured_documents/09_epub/02_epub_to_text.py
+++ b/unstructured_documents/09_epub/02_epub_to_text.py
@@ -99,11 +99,13 @@ def build_table_of_contents(chapters: list[epub.EpubHtml]) -> list[dict]:
toc = []
for i, item in enumerate(chapters, 1):
heading = extract_chapter_heading(item)
- toc.append({
- "chapter_num": i,
- "title": heading,
- "file": item.get_name(),
- })
+ toc.append(
+ {
+ "chapter_num": i,
+ "title": heading,
+ "file": item.get_name(),
+ }
+ )
return toc
@@ -208,49 +210,55 @@ def prepare_for_rag(book: epub.EpubBook, strategy: str = "chapter") -> list[dict
# One chunk per chapter
for i, item in enumerate(chapters):
text = extract_clean_text(item)
- rag_chunks.append({
- "text": text,
- "metadata": {
- "source": "epub",
- "book_title": book_title,
- "author": book_author,
- "chapter": toc[i]["title"],
- "chapter_num": toc[i]["chapter_num"],
- "chunk_strategy": "chapter",
- },
- })
+ rag_chunks.append(
+ {
+ "text": text,
+ "metadata": {
+ "source": "epub",
+ "book_title": book_title,
+ "author": book_author,
+ "chapter": toc[i]["title"],
+ "chapter_num": toc[i]["chapter_num"],
+ "chunk_strategy": "chapter",
+ },
+ }
+ )
elif strategy == "recursive":
# Recursive split across entire book for uniform chunk sizes
full_text = epub_to_full_text(book)
text_chunks = chunk_by_recursive_split(full_text, chunk_size=500)
for j, chunk_text in enumerate(text_chunks):
- rag_chunks.append({
- "text": chunk_text,
- "metadata": {
- "source": "epub",
- "book_title": book_title,
- "author": book_author,
- "chunk_index": j,
- "chunk_strategy": "recursive_split",
- },
- })
+ rag_chunks.append(
+ {
+ "text": chunk_text,
+ "metadata": {
+ "source": "epub",
+ "book_title": book_title,
+ "author": book_author,
+ "chunk_index": j,
+ "chunk_strategy": "recursive_split",
+ },
+ }
+ )
elif strategy == "heading":
# Heading-aware chunking using markdown conversion
md_text = epub_to_markdown(book)
heading_chunks = chunk_by_headings(md_text)
for chunk in heading_chunks:
- rag_chunks.append({
- "text": chunk["content"],
- "metadata": {
- "source": "epub",
- "book_title": book_title,
- "author": book_author,
- "section_heading": chunk["heading"],
- "chunk_strategy": "heading_aware",
- },
- })
+ rag_chunks.append(
+ {
+ "text": chunk["content"],
+ "metadata": {
+ "source": "epub",
+ "book_title": book_title,
+ "author": book_author,
+ "section_heading": chunk["heading"],
+ "chunk_strategy": "heading_aware",
+ },
+ }
+ )
return rag_chunks
@@ -280,7 +288,7 @@ def prepare_for_rag(book: epub.EpubBook, strategy: str = "chapter") -> list[dict
print("=" * 60)
full_text = epub_to_full_text(book)
print(f"Total text length: {len(full_text)} characters")
- print(f"\nFirst 500 chars:")
+ print("\nFirst 500 chars:")
print(full_text[:500])
print("...")
@@ -290,7 +298,7 @@ def prepare_for_rag(book: epub.EpubBook, strategy: str = "chapter") -> list[dict
print("=" * 60)
md_text = epub_to_markdown(book)
print(f"Markdown length: {len(md_text)} characters")
- print(f"\nFirst 500 chars:")
+ print("\nFirst 500 chars:")
print(md_text[:500])
print("...")
@@ -305,17 +313,14 @@ def prepare_for_rag(book: epub.EpubBook, strategy: str = "chapter") -> list[dict
print(f"Chunks: {len(chapter_chunks)}")
for chunk in chapter_chunks:
meta = chunk["metadata"]
- print(f" [{meta['chapter_num']}] {meta['chapter']} "
- f"({len(chunk['text'])} chars)")
+ print(f" [{meta['chapter_num']}] {meta['chapter']} ({len(chunk['text'])} chars)")
# Strategy B: Recursive split
print("\n--- Strategy B: Recursive Split (500 chars) ---")
recursive_chunks = prepare_for_rag(book, strategy="recursive")
print(f"Chunks: {len(recursive_chunks)}")
for chunk in recursive_chunks[:5]:
- print(f" Chunk {chunk['metadata']['chunk_index']}: "
- f"{len(chunk['text'])} chars - "
- f"{chunk['text'][:80]}...")
+ print(f" Chunk {chunk['metadata']['chunk_index']}: {len(chunk['text'])} chars - {chunk['text'][:80]}...")
if len(recursive_chunks) > 5:
print(f" ... and {len(recursive_chunks) - 5} more chunks")
@@ -341,7 +346,7 @@ def prepare_for_rag(book: epub.EpubBook, strategy: str = "chapter") -> list[dict
print("\nEach chunk includes text + metadata for vector DB storage:\n")
if heading_chunks:
sample = heading_chunks[0]
- print(f" text: \"{sample['text'][:150]}...\"")
- print(f" metadata:")
+ print(f' text: "{sample["text"][:150]}..."')
+ print(" metadata:")
for key, value in sample["metadata"].items():
print(f" {key}: {value}")
diff --git a/unstructured_documents/09_epub/sample_docs/generate_samples.py b/unstructured_documents/09_epub/sample_docs/generate_samples.py
index af20688..a7670f9 100644
--- a/unstructured_documents/09_epub/sample_docs/generate_samples.py
+++ b/unstructured_documents/09_epub/sample_docs/generate_samples.py
@@ -25,12 +25,7 @@ def create_chapter(
) -> epub.EpubHtml:
"""Create an EPUB chapter with proper HTML wrapping."""
chapter = epub.EpubHtml(title=title, file_name=filename, lang=lang)
- chapter.content = (
- f""
- f"{title}
"
- f"{html_body}"
- f""
- )
+ chapter.content = f"{title}
{html_body}"
return chapter
@@ -43,7 +38,11 @@ def generate_sample_book() -> Path:
book.set_title("Introduction to Data Science")
book.set_language("en")
book.add_author("AI Research Lab")
- book.add_metadata("DC", "description", "A beginner-friendly introduction to data science concepts.")
+ book.add_metadata(
+ "DC",
+ "description",
+ "A beginner-friendly introduction to data science concepts.",
+ )
book.add_metadata("DC", "publisher", "RAG Source Publishing")
# --- Chapter 1: What is Data Science? ---
@@ -196,9 +195,9 @@ def generate_sample_book() -> Path:
epub.write_epub(str(output_path), book, {})
print(f" Created: {output_path.name}")
- print(f" Title: Introduction to Data Science")
- print(f" Author: AI Research Lab")
- print(f" Chapters: 4")
+ print(" Title: Introduction to Data Science")
+ print(" Author: AI Research Lab")
+ print(" Chapters: 4")
return output_path
diff --git a/unstructured_documents/generate_presentation.py b/unstructured_documents/generate_presentation.py
index b912d72..27a3e04 100644
--- a/unstructured_documents/generate_presentation.py
+++ b/unstructured_documents/generate_presentation.py
@@ -11,10 +11,10 @@
from pathlib import Path
from pptx import Presentation
-from pptx.util import Inches, Pt, Emu
from pptx.dml.color import RGBColor
-from pptx.enum.text import PP_ALIGN, MSO_ANCHOR
from pptx.enum.shapes import MSO_SHAPE
+from pptx.enum.text import PP_ALIGN
+from pptx.util import Inches, Pt
# ---------------------------------------------------------------------------
# Color scheme
@@ -36,11 +36,10 @@
# Helper functions
# ---------------------------------------------------------------------------
+
def add_title_bar(slide, title_text: str):
"""Add a dark blue title bar at the top of a slide."""
- shape = slide.shapes.add_shape(
- MSO_SHAPE.RECTANGLE, Inches(0), Inches(0), Inches(13.333), Inches(1.2)
- )
+ shape = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, Inches(0), Inches(0), Inches(13.333), Inches(1.2))
shape.fill.solid()
shape.fill.fore_color.rgb = DARK_BLUE
shape.line.fill.background()
@@ -59,9 +58,7 @@ def add_title_bar(slide, title_text: str):
def add_section_divider(prs, section_title: str, subtitle: str = ""):
"""Add a full-slide section divider with dark background."""
slide = prs.slides.add_slide(prs.slide_layouts[6]) # blank
- bg = slide.shapes.add_shape(
- MSO_SHAPE.RECTANGLE, Inches(0), Inches(0), Inches(13.333), Inches(7.5)
- )
+ bg = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, Inches(0), Inches(0), Inches(13.333), Inches(7.5))
bg.fill.solid()
bg.fill.fore_color.rgb = DARK_BLUE
bg.line.fill.background()
@@ -139,10 +136,7 @@ def add_table_slide(prs, title: str, headers: list[str], rows: list[list[str]],
col_width = min(Inches(12) // n_cols, Inches(3))
table_width = col_width * n_cols
- table_shape = slide.shapes.add_table(
- n_rows, n_cols,
- Inches(0.6), top, table_width, Inches(0.4) * n_rows
- )
+ table_shape = slide.shapes.add_table(n_rows, n_cols, Inches(0.6), top, table_width, Inches(0.4) * n_rows)
table = table_shape.table
# Header row
@@ -171,8 +165,14 @@ def add_table_slide(prs, title: str, headers: list[str], rows: list[list[str]],
return slide
-def add_two_column_slide(prs, title: str, left_title: str, left_bullets: list[str],
- right_title: str, right_bullets: list[str]):
+def add_two_column_slide(
+ prs,
+ title: str,
+ left_title: str,
+ left_bullets: list[str],
+ right_title: str,
+ right_bullets: list[str],
+):
"""Add a slide with two columns."""
slide = prs.slides.add_slide(prs.slide_layouts[6])
add_title_bar(slide, title)
@@ -207,6 +207,7 @@ def add_two_column_slide(prs, title: str, left_title: str, left_bullets: list[st
# BUILD THE PRESENTATION
# =========================================================================
+
def build_presentation():
prs = Presentation()
prs.slide_width = Inches(13.333)
@@ -247,73 +248,130 @@ def build_presentation():
# =================================================================
# SLIDE 2: What is RAG?
# =================================================================
- add_content_slide(prs, "What is RAG? (Retrieval-Augmented Generation)", [
- "RAG combines a Large Language Model (LLM) with an external knowledge base",
- "Instead of relying solely on training data, the LLM retrieves relevant documents at query time",
- "The quality of RAG depends entirely on the quality of your document parsing",
- "Garbage in = Garbage out: poorly parsed documents lead to irrelevant retrieval",
- "This presentation covers HOW to extract text from every major document type",
- ], {
- 0: ["User asks a question → System retrieves relevant chunks → LLM generates answer using those chunks"],
- 3: ["If your PDF text is garbled, table data is mangled, or images aren't OCR'd — the LLM can't help"],
- })
+ add_content_slide(
+ prs,
+ "What is RAG? (Retrieval-Augmented Generation)",
+ [
+ "RAG combines a Large Language Model (LLM) with an external knowledge base",
+ "Instead of relying solely on training data, the LLM retrieves relevant documents at query time",
+ "The quality of RAG depends entirely on the quality of your document parsing",
+ "Garbage in = Garbage out: poorly parsed documents lead to irrelevant retrieval",
+ "This presentation covers HOW to extract text from every major document type",
+ ],
+ {
+ 0: ["User asks a question → System retrieves relevant chunks → LLM generates answer using those chunks"],
+ 3: ["If your PDF text is garbled, table data is mangled, or images aren't OCR'd — the LLM can't help"],
+ },
+ )
# =================================================================
# SLIDE 3: The Document Parsing Challenge
# =================================================================
- add_content_slide(prs, "The Document Parsing Challenge", [
- 'Why can\'t we just "read" a file?',
- "Each file format stores data in fundamentally different ways internally",
- "A PDF doesn't contain 'paragraphs' — it contains coordinates and character codes",
- "A DOCX isn't a text file — it's a ZIP archive containing XML",
- "An image has NO text at all — just millions of colored pixels",
- "Tables, headers, footers, and multi-column layouts add complexity",
- "The same logical content looks completely different across formats",
- "Different methods give different results — choosing the right one matters",
- ])
+ add_content_slide(
+ prs,
+ "The Document Parsing Challenge",
+ [
+ 'Why can\'t we just "read" a file?',
+ "Each file format stores data in fundamentally different ways internally",
+ "A PDF doesn't contain 'paragraphs' — it contains coordinates and character codes",
+ "A DOCX isn't a text file — it's a ZIP archive containing XML",
+ "An image has NO text at all — just millions of colored pixels",
+ "Tables, headers, footers, and multi-column layouts add complexity",
+ "The same logical content looks completely different across formats",
+ "Different methods give different results — choosing the right one matters",
+ ],
+ )
# =================================================================
# SLIDE 4: Overview of 9 Data Sources
# =================================================================
- add_table_slide(prs, "9 Unstructured Data Sources for RAG",
+ add_table_slide(
+ prs,
+ "9 Unstructured Data Sources for RAG",
["Type", "Format", "Key Challenge", "# Methods"],
[
- ["PDF", "Binary / coordinate-based", "Text encoding, tables, scanned docs", "6"],
- ["Word (DOCX)", "ZIP of XML files", "Styles, heading hierarchy, tables", "3"],
- ["PowerPoint (PPTX)", "ZIP of XML slides", "Sparse text, visual content, notes", "2"],
- ["HTML / Web", "DOM tree markup", "Boilerplate removal (nav, ads, footer)", "3"],
- ["Spreadsheets", "Cell-based (XLSX/CSV)", "Meaning from position, not text", "3"],
+ [
+ "PDF",
+ "Binary / coordinate-based",
+ "Text encoding, tables, scanned docs",
+ "6",
+ ],
+ [
+ "Word (DOCX)",
+ "ZIP of XML files",
+ "Styles, heading hierarchy, tables",
+ "3",
+ ],
+ [
+ "PowerPoint (PPTX)",
+ "ZIP of XML slides",
+ "Sparse text, visual content, notes",
+ "2",
+ ],
+ [
+ "HTML / Web",
+ "DOM tree markup",
+ "Boilerplate removal (nav, ads, footer)",
+ "3",
+ ],
+ [
+ "Spreadsheets",
+ "Cell-based (XLSX/CSV)",
+ "Meaning from position, not text",
+ "3",
+ ],
["Images", "Pixel grid (PNG/JPG)", "No inherent text — needs OCR", "2"],
- ["Email (EML)", "MIME multipart", "Encoded parts, HTML bodies, attachments", "2"],
- ["Markdown / Text", "Plain text", "No structure — chunking is the challenge", "3"],
- ["EPUB (Ebooks)", "ZIP of XHTML chapters", "Chapter extraction, HTML inside", "2"],
+ [
+ "Email (EML)",
+ "MIME multipart",
+ "Encoded parts, HTML bodies, attachments",
+ "2",
+ ],
+ [
+ "Markdown / Text",
+ "Plain text",
+ "No structure — chunking is the challenge",
+ "3",
+ ],
+ [
+ "EPUB (Ebooks)",
+ "ZIP of XHTML chapters",
+ "Chapter extraction, HTML inside",
+ "2",
+ ],
],
- intro="Each document type stores data differently and requires specialized extraction approaches."
+ intro="Each document type stores data differently and requires specialized extraction approaches.",
)
# =================================================================
# SLIDE 5: Universal RAG Pipeline
# =================================================================
- add_content_slide(prs, "The Universal RAG Document Pipeline", [
- "Every document type follows the same high-level pipeline:",
- "",
- "1. SOURCE DOCUMENT → Your PDF, DOCX, HTML, image, etc.",
- "2. PARSE / EXTRACT → Pull out raw text, tables, metadata (THIS presentation focuses here)",
- "3. CLEAN → Remove noise, normalize formatting, strip boilerplate",
- "4. CHUNK → Split into retrieval-sized pieces (200-500 tokens typical)",
- "5. EMBED → Convert each chunk to a vector using an embedding model",
- "6. STORE → Save vectors + metadata in a vector database (Pinecone, Chroma, etc.)",
- "",
- "Steps 2-4 (Parse, Clean, Chunk) are where most RAG quality is won or lost",
- "This presentation covers steps 2-4 in depth for every document type",
- ])
+ add_content_slide(
+ prs,
+ "The Universal RAG Document Pipeline",
+ [
+ "Every document type follows the same high-level pipeline:",
+ "",
+ "1. SOURCE DOCUMENT → Your PDF, DOCX, HTML, image, etc.",
+ "2. PARSE / EXTRACT → Pull out raw text, tables, metadata (THIS presentation focuses here)",
+ "3. CLEAN → Remove noise, normalize formatting, strip boilerplate",
+ "4. CHUNK → Split into retrieval-sized pieces (200-500 tokens typical)",
+ "5. EMBED → Convert each chunk to a vector using an embedding model",
+ "6. STORE → Save vectors + metadata in a vector database (Pinecone, Chroma, etc.)",
+ "",
+ "Steps 2-4 (Parse, Clean, Chunk) are where most RAG quality is won or lost",
+ "This presentation covers steps 2-4 in depth for every document type",
+ ],
+ )
# =================================================================
# SLIDE 6: Why Different Approaches
# =================================================================
- add_two_column_slide(prs,
+ add_two_column_slide(
+ prs,
"Why Different Formats Need Different Approaches",
- "What You See (Visual)", [
+ "What You See (Visual)",
+ [
"A heading in bold, large font",
"A paragraph of body text",
"A table with rows and columns",
@@ -322,7 +380,8 @@ def build_presentation():
"",
"All of these LOOK similar across formats...",
],
- "What The Computer Sees (Internal)", [
+ "What The Computer Sees (Internal)",
+ [
"PDF: BT /F1 24 Tf 72 720 Td (Heading) Tj ET",
'DOCX: ',
"PPTX: Heading inside ",
@@ -330,35 +389,63 @@ def build_presentation():
"Image: [0,0,0,255,255,255,0,0,0...] pixels",
"",
"...but are stored completely differently!",
- ]
+ ],
)
# =================================================================
# SLIDE 7: How Data Lives Inside Files
# =================================================================
- add_content_slide(prs, "How Data Actually Lives Inside Files", [
- "PDF: Binary content streams with character codes + (x,y) coordinates. The text 'Hello' might be stored as individual characters placed at specific pixel positions. No concept of 'paragraphs'.",
- "DOCX: A ZIP file containing XML. Unzip a .docx and you'll find word/document.xml with (paragraph) and (run) tags.",
- "PPTX: A ZIP file of XML slides. Each slide has shapes () containing text frames. Text isn't linear — it's in positioned boxes.",
- "HTML: A tree of nested tags. Actual content is mixed with navigation, ads, footers, and JavaScript.",
- "Spreadsheets: Cells addressed by (row, col). 'Salary: $125,000' only makes sense because the header is in row 1.",
- "Images: Just a grid of pixel colors (R, G, B values). The letter 'A' is a pattern of dark pixels on a light background — the computer has zero knowledge it's text.",
- "Email: MIME-encoded parts, potentially Base64, with headers and multipart boundaries.",
- ])
+ add_content_slide(
+ prs,
+ "How Data Actually Lives Inside Files",
+ [
+ (
+ "PDF: Binary content streams with character codes + (x,y) coordinates."
+ " The text 'Hello' might be stored as individual characters placed at"
+ " specific pixel positions. No concept of 'paragraphs'."
+ ),
+ (
+ "DOCX: A ZIP file containing XML. Unzip a .docx and you'll find"
+ " word/document.xml with (paragraph) and (run) tags."
+ ),
+ (
+ "PPTX: A ZIP file of XML slides. Each slide has shapes ()"
+ " containing text frames. Text isn't linear — it's in positioned boxes."
+ ),
+ "HTML: A tree of nested tags. Actual content is mixed with navigation, ads, footers, and JavaScript.",
+ (
+ "Spreadsheets: Cells addressed by (row, col). 'Salary: $125,000' only"
+ " makes sense because the header is in row 1."
+ ),
+ (
+ "Images: Just a grid of pixel colors (R, G, B values). The letter 'A'"
+ " is a pattern of dark pixels on a light background — the computer has"
+ " zero knowledge it's text."
+ ),
+ "Email: MIME-encoded parts, potentially Base64, with headers and multipart boundaries.",
+ ],
+ )
# =================================================================
# SLIDE 8: Key Concepts
# =================================================================
- add_content_slide(prs, "Key Concepts for This Presentation", [
- "Parsing: Reading the raw file format and extracting its contents",
- "Extraction: Pulling meaningful text, tables, and metadata from parsed content",
- "Chunking: Splitting extracted text into smaller pieces sized for embedding models",
- "Tokens: Sub-word units that LLMs process (~1 token ≈ 4 characters in English)",
- "Embeddings: Dense vector representations of text chunks (e.g., 1536 dimensions)",
- "Overlap: Including some text from the previous chunk in the next one to avoid losing context at boundaries",
- "Metadata: Information ABOUT the chunk (source file, page number, heading, date) used for filtering",
- "Boilerplate: Non-content elements (navigation, footers, ads) that add noise to embeddings",
- ])
+ add_content_slide(
+ prs,
+ "Key Concepts for This Presentation",
+ [
+ "Parsing: Reading the raw file format and extracting its contents",
+ "Extraction: Pulling meaningful text, tables, and metadata from parsed content",
+ "Chunking: Splitting extracted text into smaller pieces sized for embedding models",
+ "Tokens: Sub-word units that LLMs process (~1 token ≈ 4 characters in English)",
+ "Embeddings: Dense vector representations of text chunks (e.g., 1536 dimensions)",
+ (
+ "Overlap: Including some text from the previous chunk in the next one"
+ " to avoid losing context at boundaries"
+ ),
+ "Metadata: Information ABOUT the chunk (source file, page number, heading, date) used for filtering",
+ "Boilerplate: Non-content elements (navigation, footers, ads) that add noise to embeddings",
+ ],
+ )
# =================================================================
# SECTION 2: File Format Internals
@@ -366,117 +453,153 @@ def build_presentation():
add_section_divider(prs, "Section 2", "How Data Lives Inside Each Format")
# SLIDE 9: PDF Internals
- add_content_slide(prs, "PDF Internals: Not What You Think", [
- "PDF = Portable Document Format. Designed for DISPLAY, not data extraction.",
- "Text is NOT stored as paragraphs — it's individual characters at (x, y) coordinates",
- "Example: The word 'Hello' might be stored as:",
- " BT /F1 12 Tf 72 720 Td (H) Tj 7.2 0 Td (e) Tj 5.4 0 Td (l) Tj ...",
- "Fonts can use custom encoding — character code 65 doesn't always mean 'A'",
- "Images in PDFs are embedded binary streams — text inside images is invisible to parsers",
- "Tables are NOT tables — they're just text characters aligned in a grid visually",
- "Multi-column layouts: text flows down column 1, then column 2 — parsers may interleave them",
- "This is why PDF parsing has 6 different methods — each handles these challenges differently",
- ])
+ add_content_slide(
+ prs,
+ "PDF Internals: Not What You Think",
+ [
+ "PDF = Portable Document Format. Designed for DISPLAY, not data extraction.",
+ "Text is NOT stored as paragraphs — it's individual characters at (x, y) coordinates",
+ "Example: The word 'Hello' might be stored as:",
+ " BT /F1 12 Tf 72 720 Td (H) Tj 7.2 0 Td (e) Tj 5.4 0 Td (l) Tj ...",
+ "Fonts can use custom encoding — character code 65 doesn't always mean 'A'",
+ "Images in PDFs are embedded binary streams — text inside images is invisible to parsers",
+ "Tables are NOT tables — they're just text characters aligned in a grid visually",
+ "Multi-column layouts: text flows down column 1, then column 2 — parsers may interleave them",
+ "This is why PDF parsing has 6 different methods — each handles these challenges differently",
+ ],
+ )
# SLIDE 10: DOCX Internals
- add_content_slide(prs, "DOCX Internals: ZIP of XML Files", [
- "A .docx file is literally a ZIP archive. Rename it to .zip and extract it!",
- "Inside you'll find: word/document.xml, word/styles.xml, word/media/ (images), etc.",
- "document.xml structure: → (paragraph) → (run) → (text)",
- "Styles define Heading 1, Heading 2, Normal, etc. — essential for structured extraction",
- "Each 'run' can have different formatting (bold, italic, font) within the same paragraph",
- "Tables are → (row) → (cell) — well-structured for extraction",
- "Advantage over PDF: paragraphs and headings are explicitly marked in the XML",
- "This is why DOCX parsing is generally easier and more reliable than PDF parsing",
- ])
+ add_content_slide(
+ prs,
+ "DOCX Internals: ZIP of XML Files",
+ [
+ "A .docx file is literally a ZIP archive. Rename it to .zip and extract it!",
+ "Inside you'll find: word/document.xml, word/styles.xml, word/media/ (images), etc.",
+ "document.xml structure: → (paragraph) → (run) → (text)",
+ "Styles define Heading 1, Heading 2, Normal, etc. — essential for structured extraction",
+ "Each 'run' can have different formatting (bold, italic, font) within the same paragraph",
+ "Tables are → (row) → (cell) — well-structured for extraction",
+ "Advantage over PDF: paragraphs and headings are explicitly marked in the XML",
+ "This is why DOCX parsing is generally easier and more reliable than PDF parsing",
+ ],
+ )
# SLIDE 11: PPTX Internals
- add_content_slide(prs, "PPTX Internals: Slides, Shapes, and Text Frames", [
- "A .pptx file is also a ZIP archive, similar to DOCX",
- "Inside: ppt/slides/slide1.xml, slide2.xml, etc. + notesSlides/ for speaker notes",
- "Each slide contains shapes () — text boxes, tables, images, group shapes",
- "Text is inside text frames: → (paragraph) → (run) → (text)",
- "Title and content are in 'placeholders' — but not all shapes are placeholders",
- "Group shapes nest other shapes inside them — extractors must recurse",
- "Speaker notes live in separate notesSlide XML files — often contain the BEST text for RAG",
- "Challenge: Slides are visual — text is sparse and context depends on visual layout",
- ])
+ add_content_slide(
+ prs,
+ "PPTX Internals: Slides, Shapes, and Text Frames",
+ [
+ "A .pptx file is also a ZIP archive, similar to DOCX",
+ "Inside: ppt/slides/slide1.xml, slide2.xml, etc. + notesSlides/ for speaker notes",
+ "Each slide contains shapes () — text boxes, tables, images, group shapes",
+ "Text is inside text frames: → (paragraph) → (run) → (text)",
+ "Title and content are in 'placeholders' — but not all shapes are placeholders",
+ "Group shapes nest other shapes inside them — extractors must recurse",
+ "Speaker notes live in separate notesSlide XML files — often contain the BEST text for RAG",
+ "Challenge: Slides are visual — text is sparse and context depends on visual layout",
+ ],
+ )
# SLIDE 12: HTML Internals
- add_content_slide(prs, "HTML Internals: DOM Tree with Boilerplate", [
- "HTML is a tree of nested tags: → →