Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions advanced_methods/01_docling/01_basic_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@

uv pip install docling
"""
import sys

from pathlib import Path

# Reference sample docs from the documents folder
SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"


def convert_single_document():
"""Convert a single PDF to markdown using default settings."""
from docling.document_converter import DocumentConverter
Expand All @@ -29,7 +30,7 @@ def convert_single_document():
print("DOCLING: Basic PDF to Markdown")
print("=" * 60)
print(f"Status: {result.status}")
print(f"\n--- Markdown Output ---\n")
print("\n--- Markdown Output ---\n")
print(result.document.export_to_markdown())


Expand Down Expand Up @@ -84,7 +85,6 @@ def export_formats():
print(text[:300])

# JSON (lossless serialization)
import json
json_str = doc.model_dump_json(indent=2)
print(f"\n--- JSON ({len(json_str)} chars) ---")
print(json_str[:400] + "...")
Expand Down
23 changes: 9 additions & 14 deletions advanced_methods/01_docling/02_pdf_advanced.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,24 @@
uv pip install "docling[tesserocr]" # for Tesseract OCR
uv pip install "docling[easyocr]" # for EasyOCR
"""
import sys

from pathlib import Path

SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"


def table_extraction_modes():
"""Compare FAST vs ACCURATE table detection on a table-heavy PDF."""
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docling.document_converter import PdfFormatOption
from docling.document_converter import DocumentConverter, PdfFormatOption

pdf_path = SAMPLES_DIR / "01_pdf" / "sample_docs" / "tables.pdf"

for mode_name, mode in [("FAST", TableFormerMode.FAST), ("ACCURATE", TableFormerMode.ACCURATE)]:
for mode_name, mode in [
("FAST", TableFormerMode.FAST),
("ACCURATE", TableFormerMode.ACCURATE),
]:
print(f"\n{'=' * 60}")
print(f"TABLE DETECTION MODE: {mode_name}")
print(f"{'=' * 60}")
Expand All @@ -41,9 +43,7 @@ def table_extraction_modes():
)

converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

result = converter.convert(str(pdf_path))
Expand All @@ -54,10 +54,9 @@ def table_extraction_modes():

def ocr_configuration():
"""Configure OCR settings for scanned documents."""
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import PdfFormatOption
from docling.document_converter import DocumentConverter, PdfFormatOption

pdf_path = SAMPLES_DIR / "01_pdf" / "sample_docs" / "simple_text.pdf"

Expand All @@ -70,11 +69,7 @@ def ocr_configuration():
# ocr_options=TesseractCliOcrOptions(lang=["eng"])
)

converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
converter = DocumentConverter(format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)})

print("=" * 60)
print("PDF WITH OCR ENABLED")
Expand Down
24 changes: 13 additions & 11 deletions advanced_methods/01_docling/03_chunking.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

uv pip install docling docling-core
"""
import sys

from pathlib import Path

SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
Expand All @@ -37,9 +37,9 @@ def hierarchical_chunking():
print("=" * 60)

for i, chunk in enumerate(chunks[:5]):
print(f"\n--- Chunk {i+1} ---")
print(f"\n--- Chunk {i + 1} ---")
print(f"Text: {chunk.text[:200]}...")
if hasattr(chunk, 'meta') and chunk.meta:
if hasattr(chunk, "meta") and chunk.meta:
print(f"Metadata: {chunk.meta}")
print()

Expand All @@ -66,9 +66,9 @@ def hybrid_chunking():
print("=" * 60)

for i, chunk in enumerate(chunks[:5]):
print(f"\n--- Chunk {i+1} ---")
print(f"\n--- Chunk {i + 1} ---")
print(f"Text: {chunk.text[:200]}...")
if hasattr(chunk, 'meta') and chunk.meta:
if hasattr(chunk, "meta") and chunk.meta:
print(f"Metadata: {chunk.meta}")


Expand All @@ -83,21 +83,23 @@ def compare_chunking_strategies():

hier_chunks = list(HierarchicalChunker().chunk(result.document))

hybrid_chunks = list(HybridChunker(
tokenizer="sentence-transformers/all-MiniLM-L6-v2",
max_tokens=256,
).chunk(result.document))
hybrid_chunks = list(
HybridChunker(
tokenizer="sentence-transformers/all-MiniLM-L6-v2",
max_tokens=256,
).chunk(result.document)
)

print("=" * 60)
print("CHUNKING STRATEGY COMPARISON")
print("=" * 60)
print(f"\nHierarchical: {len(hier_chunks)} chunks")
for i, c in enumerate(hier_chunks[:3]):
print(f" [{i+1}] {len(c.text)} chars: {c.text[:80]}...")
print(f" [{i + 1}] {len(c.text)} chars: {c.text[:80]}...")

print(f"\nHybrid (256 tokens): {len(hybrid_chunks)} chunks")
for i, c in enumerate(hybrid_chunks[:3]):
print(f" [{i+1}] {len(c.text)} chars: {c.text[:80]}...")
print(f" [{i + 1}] {len(c.text)} chars: {c.text[:80]}...")


if __name__ == "__main__":
Expand Down
6 changes: 3 additions & 3 deletions advanced_methods/01_docling/04_integrations.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
uv pip install llama-index-readers-docling # for LlamaIndex
uv pip install langchain-docling # for LangChain
"""
import sys

from pathlib import Path

SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
Expand Down Expand Up @@ -48,7 +48,7 @@ def llamaindex_integration():
print("DOCLING + LLAMAINDEX")
print("=" * 60)
for i, doc in enumerate(documents):
print(f"\nDocument {i+1}:")
print(f"\nDocument {i + 1}:")
print(f" Text length: {len(doc.text)}")
print(f" Preview: {doc.text[:200]}...")

Expand Down Expand Up @@ -85,7 +85,7 @@ def langchain_integration():
print("DOCLING + LANGCHAIN")
print("=" * 60)
for i, doc in enumerate(documents):
print(f"\nDocument {i+1}:")
print(f"\nDocument {i + 1}:")
print(f" Content: {doc.page_content[:200]}...")
print(f" Metadata: {doc.metadata}")

Expand Down
17 changes: 13 additions & 4 deletions advanced_methods/02_unstructured_io/01_auto_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

uv pip install "unstructured[all-docs]"
"""
import sys

from pathlib import Path

SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"
Expand All @@ -29,7 +29,7 @@ def auto_partition_pdf():
for el in elements:
print(f"\n[{type(el).__name__}]")
print(f" Text: {str(el)[:150]}")
if hasattr(el, 'metadata'):
if hasattr(el, "metadata"):
if el.metadata.page_number:
print(f" Page: {el.metadata.page_number}")

Expand All @@ -44,7 +44,10 @@ def auto_partition_multiple():
("HTML", SAMPLES_DIR / "04_html" / "sample_docs" / "article_page.html"),
("PPTX", SAMPLES_DIR / "03_pptx" / "sample_docs" / "presentation.pptx"),
("Email", SAMPLES_DIR / "07_email" / "sample_docs" / "plain_text.eml"),
("Markdown", SAMPLES_DIR / "08_markdown_txt" / "sample_docs" / "technical_doc.md"),
(
"Markdown",
SAMPLES_DIR / "08_markdown_txt" / "sample_docs" / "technical_doc.md",
),
("EPUB", SAMPLES_DIR / "09_epub" / "sample_docs" / "sample_book.epub"),
]

Expand All @@ -59,6 +62,7 @@ def auto_partition_multiple():

# Show element type distribution
from collections import Counter

type_counts = Counter(type(el).__name__ for el in elements)
for etype, count in type_counts.most_common():
print(f" {etype}: {count}")
Expand Down Expand Up @@ -97,6 +101,7 @@ def element_types_overview():
""")

from collections import Counter

type_counts = Counter(type(el).__name__ for el in elements)
print(f"Found in this document ({len(elements)} elements):")
for etype, count in type_counts.most_common():
Expand All @@ -110,4 +115,8 @@ def element_types_overview():
print("3. Element types overview")
choice = input("Enter 1/2/3 (default=1): ").strip() or "1"

{"1": auto_partition_pdf, "2": auto_partition_multiple, "3": element_types_overview}[choice]()
{
"1": auto_partition_pdf,
"2": auto_partition_multiple,
"3": element_types_overview,
}[choice]()
19 changes: 10 additions & 9 deletions advanced_methods/02_unstructured_io/02_pdf_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,18 @@

uv pip install "unstructured[pdf]"
"""
import sys

from pathlib import Path

SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"


def compare_strategies():
"""Compare fast, hi_res, and ocr_only on the same PDF."""
from unstructured.partition.pdf import partition_pdf
from collections import Counter
import time
from collections import Counter

from unstructured.partition.pdf import partition_pdf

pdf_path = str(SAMPLES_DIR / "01_pdf" / "sample_docs" / "tables.pdf")

Expand Down Expand Up @@ -51,7 +52,7 @@ def compare_strategies():
print(f" [{type(el).__name__}] {str(el)[:120]}")
except Exception as e:
print(f" Error: {e}")
print(f" (hi_res requires: uv pip install \"unstructured[pdf]\" and model downloads)")
print(' (hi_res requires: uv pip install "unstructured[pdf]" and model downloads)')


def hi_res_with_options():
Expand All @@ -68,16 +69,16 @@ def hi_res_with_options():
elements = partition_pdf(
filename=pdf_path,
strategy="hi_res",
infer_table_structure=True, # Extract table HTML
include_page_breaks=True, # Insert PageBreak elements
languages=["eng"], # OCR language hints
infer_table_structure=True, # Extract table HTML
include_page_breaks=True, # Insert PageBreak elements
languages=["eng"], # OCR language hints
)

for el in elements:
if type(el).__name__ == "Table":
print(f"\n--- Table Found ---")
print("\n--- Table Found ---")
print(f"Text: {str(el)[:200]}")
if hasattr(el.metadata, 'text_as_html') and el.metadata.text_as_html:
if hasattr(el.metadata, "text_as_html") and el.metadata.text_as_html:
print(f"HTML: {el.metadata.text_as_html[:300]}")
break
else:
Expand Down
10 changes: 5 additions & 5 deletions advanced_methods/02_unstructured_io/03_specific_partitioners.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@

uv pip install "unstructured[all-docs]"
"""
import sys
from pathlib import Path

from collections import Counter
from pathlib import Path

SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents"

Expand Down Expand Up @@ -82,11 +82,11 @@ def partition_email_demo():
# Show email-specific metadata
for el in elements[:1]:
meta = el.metadata
if hasattr(meta, 'sent_from') and meta.sent_from:
if hasattr(meta, "sent_from") and meta.sent_from:
print(f" From: {meta.sent_from}")
if hasattr(meta, 'sent_to') and meta.sent_to:
if hasattr(meta, "sent_to") and meta.sent_to:
print(f" To: {meta.sent_to}")
if hasattr(meta, 'subject') and meta.subject:
if hasattr(meta, "subject") and meta.subject:
print(f" Subject: {meta.subject}")


Expand Down
Loading