diff --git a/advanced_methods/01_docling/01_basic_conversion.py b/advanced_methods/01_docling/01_basic_conversion.py index f4ae90f..de88a7e 100644 --- a/advanced_methods/01_docling/01_basic_conversion.py +++ b/advanced_methods/01_docling/01_basic_conversion.py @@ -10,12 +10,13 @@ uv pip install docling """ -import sys + from pathlib import Path # Reference sample docs from the documents folder SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" + def convert_single_document(): """Convert a single PDF to markdown using default settings.""" from docling.document_converter import DocumentConverter @@ -29,7 +30,7 @@ def convert_single_document(): print("DOCLING: Basic PDF to Markdown") print("=" * 60) print(f"Status: {result.status}") - print(f"\n--- Markdown Output ---\n") + print("\n--- Markdown Output ---\n") print(result.document.export_to_markdown()) @@ -84,7 +85,6 @@ def export_formats(): print(text[:300]) # JSON (lossless serialization) - import json json_str = doc.model_dump_json(indent=2) print(f"\n--- JSON ({len(json_str)} chars) ---") print(json_str[:400] + "...") diff --git a/advanced_methods/01_docling/02_pdf_advanced.py b/advanced_methods/01_docling/02_pdf_advanced.py index 55b48fd..9bf33dd 100644 --- a/advanced_methods/01_docling/02_pdf_advanced.py +++ b/advanced_methods/01_docling/02_pdf_advanced.py @@ -15,7 +15,7 @@ uv pip install "docling[tesserocr]" # for Tesseract OCR uv pip install "docling[easyocr]" # for EasyOCR """ -import sys + from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -23,14 +23,16 @@ def table_extraction_modes(): """Compare FAST vs ACCURATE table detection on a table-heavy PDF.""" - from docling.document_converter import DocumentConverter from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode - from docling.document_converter import PdfFormatOption + from 
docling.document_converter import DocumentConverter, PdfFormatOption pdf_path = SAMPLES_DIR / "01_pdf" / "sample_docs" / "tables.pdf" - for mode_name, mode in [("FAST", TableFormerMode.FAST), ("ACCURATE", TableFormerMode.ACCURATE)]: + for mode_name, mode in [ + ("FAST", TableFormerMode.FAST), + ("ACCURATE", TableFormerMode.ACCURATE), + ]: print(f"\n{'=' * 60}") print(f"TABLE DETECTION MODE: {mode_name}") print(f"{'=' * 60}") @@ -41,9 +43,7 @@ def table_extraction_modes(): ) converter = DocumentConverter( - format_options={ - InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) - } + format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)} ) result = converter.convert(str(pdf_path)) @@ -54,10 +54,9 @@ def table_extraction_modes(): def ocr_configuration(): """Configure OCR settings for scanned documents.""" - from docling.document_converter import DocumentConverter from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions - from docling.document_converter import PdfFormatOption + from docling.document_converter import DocumentConverter, PdfFormatOption pdf_path = SAMPLES_DIR / "01_pdf" / "sample_docs" / "simple_text.pdf" @@ -70,11 +69,7 @@ def ocr_configuration(): # ocr_options=TesseractCliOcrOptions(lang=["eng"]) ) - converter = DocumentConverter( - format_options={ - InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) - } - ) + converter = DocumentConverter(format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}) print("=" * 60) print("PDF WITH OCR ENABLED") diff --git a/advanced_methods/01_docling/03_chunking.py b/advanced_methods/01_docling/03_chunking.py index df6fa4e..210aa53 100644 --- a/advanced_methods/01_docling/03_chunking.py +++ b/advanced_methods/01_docling/03_chunking.py @@ -14,7 +14,7 @@ uv pip install docling docling-core """ -import sys + from pathlib import Path SAMPLES_DIR = 
Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -37,9 +37,9 @@ def hierarchical_chunking(): print("=" * 60) for i, chunk in enumerate(chunks[:5]): - print(f"\n--- Chunk {i+1} ---") + print(f"\n--- Chunk {i + 1} ---") print(f"Text: {chunk.text[:200]}...") - if hasattr(chunk, 'meta') and chunk.meta: + if hasattr(chunk, "meta") and chunk.meta: print(f"Metadata: {chunk.meta}") print() @@ -66,9 +66,9 @@ def hybrid_chunking(): print("=" * 60) for i, chunk in enumerate(chunks[:5]): - print(f"\n--- Chunk {i+1} ---") + print(f"\n--- Chunk {i + 1} ---") print(f"Text: {chunk.text[:200]}...") - if hasattr(chunk, 'meta') and chunk.meta: + if hasattr(chunk, "meta") and chunk.meta: print(f"Metadata: {chunk.meta}") @@ -83,21 +83,23 @@ def compare_chunking_strategies(): hier_chunks = list(HierarchicalChunker().chunk(result.document)) - hybrid_chunks = list(HybridChunker( - tokenizer="sentence-transformers/all-MiniLM-L6-v2", - max_tokens=256, - ).chunk(result.document)) + hybrid_chunks = list( + HybridChunker( + tokenizer="sentence-transformers/all-MiniLM-L6-v2", + max_tokens=256, + ).chunk(result.document) + ) print("=" * 60) print("CHUNKING STRATEGY COMPARISON") print("=" * 60) print(f"\nHierarchical: {len(hier_chunks)} chunks") for i, c in enumerate(hier_chunks[:3]): - print(f" [{i+1}] {len(c.text)} chars: {c.text[:80]}...") + print(f" [{i + 1}] {len(c.text)} chars: {c.text[:80]}...") print(f"\nHybrid (256 tokens): {len(hybrid_chunks)} chunks") for i, c in enumerate(hybrid_chunks[:3]): - print(f" [{i+1}] {len(c.text)} chars: {c.text[:80]}...") + print(f" [{i + 1}] {len(c.text)} chars: {c.text[:80]}...") if __name__ == "__main__": diff --git a/advanced_methods/01_docling/04_integrations.py b/advanced_methods/01_docling/04_integrations.py index 6e5c748..1ff6a3f 100644 --- a/advanced_methods/01_docling/04_integrations.py +++ b/advanced_methods/01_docling/04_integrations.py @@ -12,7 +12,7 @@ uv pip install llama-index-readers-docling # for LlamaIndex uv 
pip install langchain-docling # for LangChain """ -import sys + from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -48,7 +48,7 @@ def llamaindex_integration(): print("DOCLING + LLAMAINDEX") print("=" * 60) for i, doc in enumerate(documents): - print(f"\nDocument {i+1}:") + print(f"\nDocument {i + 1}:") print(f" Text length: {len(doc.text)}") print(f" Preview: {doc.text[:200]}...") @@ -85,7 +85,7 @@ def langchain_integration(): print("DOCLING + LANGCHAIN") print("=" * 60) for i, doc in enumerate(documents): - print(f"\nDocument {i+1}:") + print(f"\nDocument {i + 1}:") print(f" Content: {doc.page_content[:200]}...") print(f" Metadata: {doc.metadata}") diff --git a/advanced_methods/02_unstructured_io/01_auto_partition.py b/advanced_methods/02_unstructured_io/01_auto_partition.py index 5573eda..93daf04 100644 --- a/advanced_methods/02_unstructured_io/01_auto_partition.py +++ b/advanced_methods/02_unstructured_io/01_auto_partition.py @@ -9,7 +9,7 @@ uv pip install "unstructured[all-docs]" """ -import sys + from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -29,7 +29,7 @@ def auto_partition_pdf(): for el in elements: print(f"\n[{type(el).__name__}]") print(f" Text: {str(el)[:150]}") - if hasattr(el, 'metadata'): + if hasattr(el, "metadata"): if el.metadata.page_number: print(f" Page: {el.metadata.page_number}") @@ -44,7 +44,10 @@ def auto_partition_multiple(): ("HTML", SAMPLES_DIR / "04_html" / "sample_docs" / "article_page.html"), ("PPTX", SAMPLES_DIR / "03_pptx" / "sample_docs" / "presentation.pptx"), ("Email", SAMPLES_DIR / "07_email" / "sample_docs" / "plain_text.eml"), - ("Markdown", SAMPLES_DIR / "08_markdown_txt" / "sample_docs" / "technical_doc.md"), + ( + "Markdown", + SAMPLES_DIR / "08_markdown_txt" / "sample_docs" / "technical_doc.md", + ), ("EPUB", SAMPLES_DIR / "09_epub" / "sample_docs" / "sample_book.epub"), ] @@ -59,6 +62,7 @@ def 
auto_partition_multiple(): # Show element type distribution from collections import Counter + type_counts = Counter(type(el).__name__ for el in elements) for etype, count in type_counts.most_common(): print(f" {etype}: {count}") @@ -97,6 +101,7 @@ def element_types_overview(): """) from collections import Counter + type_counts = Counter(type(el).__name__ for el in elements) print(f"Found in this document ({len(elements)} elements):") for etype, count in type_counts.most_common(): @@ -110,4 +115,8 @@ def element_types_overview(): print("3. Element types overview") choice = input("Enter 1/2/3 (default=1): ").strip() or "1" - {"1": auto_partition_pdf, "2": auto_partition_multiple, "3": element_types_overview}[choice]() + { + "1": auto_partition_pdf, + "2": auto_partition_multiple, + "3": element_types_overview, + }[choice]() diff --git a/advanced_methods/02_unstructured_io/02_pdf_strategies.py b/advanced_methods/02_unstructured_io/02_pdf_strategies.py index a7ce819..2842a81 100644 --- a/advanced_methods/02_unstructured_io/02_pdf_strategies.py +++ b/advanced_methods/02_unstructured_io/02_pdf_strategies.py @@ -12,7 +12,7 @@ uv pip install "unstructured[pdf]" """ -import sys + from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -20,9 +20,10 @@ def compare_strategies(): """Compare fast, hi_res, and ocr_only on the same PDF.""" - from unstructured.partition.pdf import partition_pdf - from collections import Counter import time + from collections import Counter + + from unstructured.partition.pdf import partition_pdf pdf_path = str(SAMPLES_DIR / "01_pdf" / "sample_docs" / "tables.pdf") @@ -51,7 +52,7 @@ def compare_strategies(): print(f" [{type(el).__name__}] {str(el)[:120]}") except Exception as e: print(f" Error: {e}") - print(f" (hi_res requires: uv pip install \"unstructured[pdf]\" and model downloads)") + print(' (hi_res requires: uv pip install "unstructured[pdf]" and model downloads)') def 
hi_res_with_options(): @@ -68,16 +69,16 @@ def hi_res_with_options(): elements = partition_pdf( filename=pdf_path, strategy="hi_res", - infer_table_structure=True, # Extract table HTML - include_page_breaks=True, # Insert PageBreak elements - languages=["eng"], # OCR language hints + infer_table_structure=True, # Extract table HTML + include_page_breaks=True, # Insert PageBreak elements + languages=["eng"], # OCR language hints ) for el in elements: if type(el).__name__ == "Table": - print(f"\n--- Table Found ---") + print("\n--- Table Found ---") print(f"Text: {str(el)[:200]}") - if hasattr(el.metadata, 'text_as_html') and el.metadata.text_as_html: + if hasattr(el.metadata, "text_as_html") and el.metadata.text_as_html: print(f"HTML: {el.metadata.text_as_html[:300]}") break else: diff --git a/advanced_methods/02_unstructured_io/03_specific_partitioners.py b/advanced_methods/02_unstructured_io/03_specific_partitioners.py index 97366be..7d82b4e 100644 --- a/advanced_methods/02_unstructured_io/03_specific_partitioners.py +++ b/advanced_methods/02_unstructured_io/03_specific_partitioners.py @@ -17,9 +17,9 @@ uv pip install "unstructured[all-docs]" """ -import sys -from pathlib import Path + from collections import Counter +from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -82,11 +82,11 @@ def partition_email_demo(): # Show email-specific metadata for el in elements[:1]: meta = el.metadata - if hasattr(meta, 'sent_from') and meta.sent_from: + if hasattr(meta, "sent_from") and meta.sent_from: print(f" From: {meta.sent_from}") - if hasattr(meta, 'sent_to') and meta.sent_to: + if hasattr(meta, "sent_to") and meta.sent_to: print(f" To: {meta.sent_to}") - if hasattr(meta, 'subject') and meta.subject: + if hasattr(meta, "subject") and meta.subject: print(f" Subject: {meta.subject}") diff --git a/advanced_methods/02_unstructured_io/04_chunking_and_export.py 
b/advanced_methods/02_unstructured_io/04_chunking_and_export.py index 0250785..17db313 100644 --- a/advanced_methods/02_unstructured_io/04_chunking_and_export.py +++ b/advanced_methods/02_unstructured_io/04_chunking_and_export.py @@ -16,7 +16,7 @@ uv pip install "unstructured[all-docs]" """ -import sys + from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -24,8 +24,8 @@ def chunk_by_title_demo(): """Chunk elements by title for semantic RAG chunks.""" - from unstructured.partition.auto import partition from unstructured.chunking.title import chunk_by_title + from unstructured.partition.auto import partition pdf_path = str(SAMPLES_DIR / "01_pdf" / "sample_docs" / "mixed_content.pdf") elements = partition(filename=pdf_path) @@ -33,8 +33,8 @@ def chunk_by_title_demo(): # Chunk by title with size constraints chunks = chunk_by_title( elements, - max_characters=1000, # Max chunk size - new_after_n_chars=500, # Soft limit to start new chunk + max_characters=1000, # Max chunk size + new_after_n_chars=500, # Soft limit to start new chunk combine_text_under_n_chars=200, # Merge small chunks ) @@ -43,7 +43,7 @@ def chunk_by_title_demo(): print("=" * 60) for i, chunk in enumerate(chunks[:5]): - print(f"\n--- Chunk {i+1} [{type(chunk).__name__}] ---") + print(f"\n--- Chunk {i + 1} [{type(chunk).__name__}] ---") text = str(chunk) print(f" Length: {len(text)} chars") print(f" Text: {text[:200]}...") @@ -66,8 +66,10 @@ def export_formats_demo(): print(text[:300]) # 2. JSON - from unstructured.staging.base import elements_to_json import json + + from unstructured.staging.base import elements_to_json + json_str = elements_to_json(elements) print(f"\n--- JSON ({len(json_str)} chars) ---") parsed = json.loads(json_str) @@ -75,6 +77,7 @@ def export_formats_demo(): # 3. 
Dict list from unstructured.staging.base import elements_to_dicts + dicts = elements_to_dicts(elements) print(f"\n--- Dicts ({len(dicts)} items) ---") if dicts: @@ -84,9 +87,10 @@ def export_formats_demo(): # 4. DataFrame try: from unstructured.staging.base import convert_to_dataframe + df = convert_to_dataframe(elements) print(f"\n--- DataFrame ({len(df)} rows) ---") - print(df[['type', 'text']].head().to_string()) + print(df[["type", "text"]].head().to_string()) except Exception as e: print(f"\n--- DataFrame: {e} ---") @@ -105,8 +109,15 @@ def metadata_exploration(): for el in elements[:5]: print(f"\n[{type(el).__name__}] {str(el)[:80]}") meta = el.metadata - attrs = ['filename', 'file_directory', 'page_number', 'coordinates', - 'text_as_html', 'languages', 'detection_class_prob'] + attrs = [ + "filename", + "file_directory", + "page_number", + "coordinates", + "text_as_html", + "languages", + "detection_class_prob", + ] for attr in attrs: val = getattr(meta, attr, None) if val is not None: diff --git a/advanced_methods/03_azure_doc_intelligence/01_layout_extraction.py b/advanced_methods/03_azure_doc_intelligence/01_layout_extraction.py index 34bbd4e..1652061 100644 --- a/advanced_methods/03_azure_doc_intelligence/01_layout_extraction.py +++ b/advanced_methods/03_azure_doc_intelligence/01_layout_extraction.py @@ -18,8 +18,8 @@ Free tier: 500 pages/month """ + import os -import sys from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -28,9 +28,8 @@ def layout_extraction(): """Extract layout from a PDF using prebuilt-layout model.""" try: - from azure.core.credentials import AzureKeyCredential from azure.ai.documentintelligence import DocumentIntelligenceClient - from azure.ai.documentintelligence.models import AnalyzeDocumentRequest + from azure.core.credentials import AzureKeyCredential except ImportError: print("Install: uv pip install azure-ai-documentintelligence") _show_example_code() @@ -81,7 
+80,7 @@ def layout_extraction(): if result.tables: print(f"\n--- Tables: {len(result.tables)} found ---") for i, table in enumerate(result.tables): - print(f"\n Table {i+1}: {table.row_count} rows x {table.column_count} cols") + print(f"\n Table {i + 1}: {table.row_count} rows x {table.column_count} cols") for cell in table.cells[:6]: print(f" [{cell.row_index},{cell.column_index}] = {cell.content}") @@ -126,8 +125,8 @@ def _show_example_code(): def markdown_output(): """Get document content as Markdown (Azure's built-in conversion).""" try: - from azure.core.credentials import AzureKeyCredential from azure.ai.documentintelligence import DocumentIntelligenceClient + from azure.core.credentials import AzureKeyCredential except ImportError: print("Install: uv pip install azure-ai-documentintelligence") return diff --git a/advanced_methods/03_azure_doc_intelligence/02_prebuilt_models.py b/advanced_methods/03_azure_doc_intelligence/02_prebuilt_models.py index cd2d6bd..77fd483 100644 --- a/advanced_methods/03_azure_doc_intelligence/02_prebuilt_models.py +++ b/advanced_methods/03_azure_doc_intelligence/02_prebuilt_models.py @@ -16,8 +16,8 @@ uv pip install azure-ai-documentintelligence """ + import os -import sys from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -26,8 +26,8 @@ def prebuilt_read(): """Use prebuilt-read for pure text extraction (OCR optimized).""" try: - from azure.core.credentials import AzureKeyCredential from azure.ai.documentintelligence import DocumentIntelligenceClient + from azure.core.credentials import AzureKeyCredential except ImportError: print("Install: uv pip install azure-ai-documentintelligence") _show_setup_message("PREBUILT-READ MODEL") @@ -56,7 +56,7 @@ def prebuilt_read(): print(result.content[:500]) if result.languages: - print(f"\nDetected languages: {[l.locale for l in result.languages]}") + print(f"\nDetected languages: {[lang.locale for lang in result.languages]}") 
def prebuilt_document(): @@ -181,4 +181,9 @@ def _show_read_example(): print("4. Model comparison overview") choice = input("Enter 1/2/3/4 (default=4): ").strip() or "4" - {"1": prebuilt_read, "2": prebuilt_document, "3": prebuilt_invoice, "4": model_comparison}[choice]() + { + "1": prebuilt_read, + "2": prebuilt_document, + "3": prebuilt_invoice, + "4": model_comparison, + }[choice]() diff --git a/advanced_methods/03_azure_doc_intelligence/03_table_and_figure_extraction.py b/advanced_methods/03_azure_doc_intelligence/03_table_and_figure_extraction.py index d678e28..96d38a6 100644 --- a/advanced_methods/03_azure_doc_intelligence/03_table_and_figure_extraction.py +++ b/advanced_methods/03_azure_doc_intelligence/03_table_and_figure_extraction.py @@ -7,8 +7,8 @@ uv pip install azure-ai-documentintelligence """ + import os -import sys from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -17,8 +17,8 @@ def table_extraction(): """Extract tables with full structure from PDF.""" try: - from azure.core.credentials import AzureKeyCredential from azure.ai.documentintelligence import DocumentIntelligenceClient + from azure.core.credentials import AzureKeyCredential except ImportError: print("Install: uv pip install azure-ai-documentintelligence") _show_table_example() @@ -50,7 +50,7 @@ def table_extraction(): return for i, table in enumerate(result.tables): - print(f"\n--- Table {i+1} ---") + print(f"\n--- Table {i + 1} ---") print(f" Rows: {table.row_count}, Columns: {table.column_count}") print(f" Page: {table.bounding_regions[0].page_number if table.bounding_regions else 'N/A'}") diff --git a/advanced_methods/03_azure_doc_intelligence/04_rag_pipeline_example.py b/advanced_methods/03_azure_doc_intelligence/04_rag_pipeline_example.py index df7d511..7537dbc 100644 --- a/advanced_methods/03_azure_doc_intelligence/04_rag_pipeline_example.py +++ b/advanced_methods/03_azure_doc_intelligence/04_rag_pipeline_example.py @@ 
-14,8 +14,8 @@ uv pip install azure-ai-documentintelligence """ + import os -import sys from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -49,8 +49,8 @@ def rag_pipeline(): _show_pipeline_code() return - from azure.core.credentials import AzureKeyCredential from azure.ai.documentintelligence import DocumentIntelligenceClient + from azure.core.credentials import AzureKeyCredential client = DocumentIntelligenceClient(endpoint=endpoint, credential=AzureKeyCredential(key)) pdf_path = SAMPLES_DIR / "01_pdf" / "sample_docs" / "mixed_content.pdf" @@ -76,7 +76,10 @@ def rag_pipeline(): if line.startswith("# ") or line.startswith("## "): if current_chunk["text"].strip(): chunks.append(current_chunk.copy()) - current_chunk = {"text": line + "\n", "metadata": {"section": line.strip("# ")}} + current_chunk = { + "text": line + "\n", + "metadata": {"section": line.strip("# ")}, + } else: current_chunk["text"] += line + "\n" @@ -86,7 +89,7 @@ def rag_pipeline(): print(f"Step 3: Created {len(chunks)} semantic chunks") for i, chunk in enumerate(chunks[:5]): - print(f"\n Chunk {i+1} ({len(chunk['text'])} chars):") + print(f"\n Chunk {i + 1} ({len(chunk['text'])} chars):") print(f" Section: {chunk['metadata']['section']}") print(f" Preview: {chunk['text'][:150].strip()}...") diff --git a/advanced_methods/04_llamaparse/01_basic_parsing.py b/advanced_methods/04_llamaparse/01_basic_parsing.py index 0077642..1934e1d 100644 --- a/advanced_methods/04_llamaparse/01_basic_parsing.py +++ b/advanced_methods/04_llamaparse/01_basic_parsing.py @@ -19,8 +19,8 @@ # OR newer: uv pip install llama-cloud>=1.0 """ + import os -import sys from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -62,7 +62,7 @@ def basic_parse_pdf(): documents = parser.load_data(pdf_path) for i, doc in enumerate(documents): - print(f"\n--- Document {i+1} ---") + print(f"\n--- Document {i + 1} ---") 
print(f"Text length: {len(doc.text)}") print(f"Preview:\n{doc.text[:500]}") diff --git a/advanced_methods/04_llamaparse/02_llamaindex_integration.py b/advanced_methods/04_llamaparse/02_llamaindex_integration.py index a1adb84..c58f4f9 100644 --- a/advanced_methods/04_llamaparse/02_llamaindex_integration.py +++ b/advanced_methods/04_llamaparse/02_llamaindex_integration.py @@ -6,8 +6,8 @@ uv pip install llama-parse llama-index """ + import os -import sys from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -28,8 +28,8 @@ def llamaindex_rag_pipeline(): return try: + from llama_index.core import VectorStoreIndex from llama_parse import LlamaParse - from llama_index.core import VectorStoreIndex, SimpleDirectoryReader except ImportError: print("Install: uv pip install llama-parse llama-index") _show_pipeline_code() diff --git a/advanced_methods/04_llamaparse/03_parsing_tiers.py b/advanced_methods/04_llamaparse/03_parsing_tiers.py index 6e373f7..7a3395f 100644 --- a/advanced_methods/04_llamaparse/03_parsing_tiers.py +++ b/advanced_methods/04_llamaparse/03_parsing_tiers.py @@ -12,8 +12,8 @@ uv pip install llama-parse """ + import os -import sys from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" diff --git a/advanced_methods/05_marker/01_basic_conversion.py b/advanced_methods/05_marker/01_basic_conversion.py index 3988db1..45cd395 100644 --- a/advanced_methods/05_marker/01_basic_conversion.py +++ b/advanced_methods/05_marker/01_basic_conversion.py @@ -15,7 +15,7 @@ uv pip install marker-pdf uv pip install marker-pdf[full] # for DOCX, PPTX, XLSX support """ -import sys + from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -50,7 +50,7 @@ def basic_pdf_to_markdown(): print(f"\n--- Markdown Output ({len(text)} chars) ---") print(text[:800]) - print(f"\n--- Metadata ---") + print("\n--- Metadata 
---") for key, val in metadata.items(): print(f" {key}: {val}") @@ -61,9 +61,9 @@ def basic_pdf_to_markdown(): def convert_with_config(): """Convert with custom configuration: output format, page range, etc.""" try: + from marker.config.parser import ConfigParser from marker.converters.pdf import PdfConverter from marker.models import create_model_dict - from marker.config.parser import ConfigParser from marker.output import text_from_rendered except ImportError: print("Install: uv pip install marker-pdf") @@ -75,7 +75,7 @@ def convert_with_config(): # Custom configuration config = { "output_format": "markdown", - "page_range": "0-2", # First 3 pages only + "page_range": "0-2", # First 3 pages only "force_ocr": False, "disable_image_extraction": True, } diff --git a/advanced_methods/05_marker/02_output_formats.py b/advanced_methods/05_marker/02_output_formats.py index 2175836..3e37eda 100644 --- a/advanced_methods/05_marker/02_output_formats.py +++ b/advanced_methods/05_marker/02_output_formats.py @@ -10,8 +10,7 @@ uv pip install marker-pdf """ -import sys -import json + from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -46,9 +45,9 @@ def output_formats_overview(): # Try live demo if marker is installed try: + from marker.config.parser import ConfigParser from marker.converters.pdf import PdfConverter from marker.models import create_model_dict - from marker.config.parser import ConfigParser from marker.output import text_from_rendered pdf_path = str(SAMPLES_DIR / "01_pdf" / "sample_docs" / "simple_text.pdf") diff --git a/advanced_methods/05_marker/03_specialized_converters.py b/advanced_methods/05_marker/03_specialized_converters.py index 0233658..82fb8da 100644 --- a/advanced_methods/05_marker/03_specialized_converters.py +++ b/advanced_methods/05_marker/03_specialized_converters.py @@ -9,7 +9,7 @@ uv pip install marker-pdf """ -import sys + from pathlib import Path SAMPLES_DIR = 
Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -132,4 +132,8 @@ def _show_table_example(): print("3. Structured extraction") choice = input("Enter 1/2/3 (default=1): ").strip() or "1" - {"1": table_converter_demo, "2": ocr_converter_demo, "3": extraction_converter_demo}[choice]() + { + "1": table_converter_demo, + "2": ocr_converter_demo, + "3": extraction_converter_demo, + }[choice]() diff --git a/advanced_methods/06_megaparse/01_basic_parsing.py b/advanced_methods/06_megaparse/01_basic_parsing.py index 18489c7..b705ba3 100644 --- a/advanced_methods/06_megaparse/01_basic_parsing.py +++ b/advanced_methods/06_megaparse/01_basic_parsing.py @@ -16,8 +16,7 @@ Requirements: Python >= 3.11, poppler, tesseract-ocr uv pip install megaparse """ -import os -import sys + from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" diff --git a/advanced_methods/06_megaparse/02_vision_parsing.py b/advanced_methods/06_megaparse/02_vision_parsing.py index b263da1..0c9e0e6 100644 --- a/advanced_methods/06_megaparse/02_vision_parsing.py +++ b/advanced_methods/06_megaparse/02_vision_parsing.py @@ -17,8 +17,8 @@ # OR uv pip install megaparse langchain-anthropic """ + import os -import sys from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -37,8 +37,8 @@ def vision_parse_openai(): return try: - from megaparse.parser.megaparse_vision import MegaParseVision from langchain_openai import ChatOpenAI + from megaparse.parser.megaparse_vision import MegaParseVision except ImportError: print("Install: uv pip install megaparse langchain-openai") return @@ -142,4 +142,8 @@ def _show_vision_example(): print("3. 
Standard vs Vision comparison") choice = input("Enter 1/2/3 (default=3): ").strip() or "3" - {"1": vision_parse_openai, "2": vision_parse_anthropic, "3": compare_standard_vs_vision}[choice]() + { + "1": vision_parse_openai, + "2": vision_parse_anthropic, + "3": compare_standard_vs_vision, + }[choice]() diff --git a/advanced_methods/06_megaparse/03_rag_preparation.py b/advanced_methods/06_megaparse/03_rag_preparation.py index 8f48e57..a1ea80d 100644 --- a/advanced_methods/06_megaparse/03_rag_preparation.py +++ b/advanced_methods/06_megaparse/03_rag_preparation.py @@ -13,9 +13,9 @@ uv pip install megaparse """ -import os -import sys + import re +import sys from pathlib import Path SAMPLES_DIR = Path(__file__).resolve().parent.parent.parent / "unstructured_documents" @@ -42,18 +42,20 @@ def rag_preparation_pipeline(): # Step 3: Add metadata enriched_chunks = [] for i, chunk in enumerate(chunks): - enriched_chunks.append({ - "id": f"chunk_{i}", - "text": chunk["text"], - "metadata": { - "heading": chunk.get("heading", ""), - "char_count": len(chunk["text"]), - "chunk_index": i, - "source": "megaparse", + enriched_chunks.append( + { + "id": f"chunk_{i}", + "text": chunk["text"], + "metadata": { + "heading": chunk.get("heading", ""), + "char_count": len(chunk["text"]), + "chunk_index": i, + "source": "megaparse", + }, } - }) + ) - print(f"Step 3: Enriched with metadata") + print("Step 3: Enriched with metadata") # Display results print(f"\n{'=' * 60}") @@ -117,6 +119,7 @@ def _get_parsed_content(): """Get parsed content from MegaParse or use sample Markdown.""" try: from megaparse import MegaParse + pdf_path = str(SAMPLES_DIR / "01_pdf" / "sample_docs" / "mixed_content.pdf") megaparse = MegaParse() return megaparse.load(pdf_path) diff --git a/pyproject.toml b/pyproject.toml index 6858d9f..f5f169f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,3 +58,12 @@ llamaparse = [ marker = [ "marker-pdf>=1.0", ] + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select 
= ["E", "W", "F", "I"] + +[tool.ruff.format] +line-ending = "auto" diff --git a/unstructured_documents/01_pdf/01_pypdf_extraction.py b/unstructured_documents/01_pdf/01_pypdf_extraction.py index 3580fb5..a1a0d68 100644 --- a/unstructured_documents/01_pdf/01_pypdf_extraction.py +++ b/unstructured_documents/01_pdf/01_pypdf_extraction.py @@ -125,7 +125,7 @@ def demo_chunking(): # Summary comparison print("\n--- Chunking Strategy Comparison ---") print(f" {'Strategy':<25s} {'Chunks':>8s} {'Avg Size':>10s} {'Min':>6s} {'Max':>6s}") - print(f" {'-'*25} {'-'*8} {'-'*10} {'-'*6} {'-'*6}") + print(f" {'-' * 25} {'-' * 8} {'-' * 10} {'-' * 6} {'-' * 6}") for name, chunks in [ ("Character (500)", char_chunks), ("Sentence (5/chunk)", sent_chunks), diff --git a/unstructured_documents/01_pdf/02_pdfplumber_extraction.py b/unstructured_documents/01_pdf/02_pdfplumber_extraction.py index 85f1821..b969aea 100644 --- a/unstructured_documents/01_pdf/02_pdfplumber_extraction.py +++ b/unstructured_documents/01_pdf/02_pdfplumber_extraction.py @@ -29,6 +29,7 @@ # 1. Text extraction with better formatting # --------------------------------------------------------------------------- + def extract_text_with_layout(pdf_path: Path) -> list[dict]: """ Extract text page by page, preserving layout spacing. 
@@ -42,18 +43,23 @@ def extract_text_with_layout(pdf_path: Path) -> list[dict]: for i, page in enumerate(pdf.pages): text = page.extract_text() or "" # extract_text with layout settings for better column handling - text_layout = page.extract_text( - layout=True, # Use layout-aware extraction - x_density=7.25, # Horizontal character density - y_density=13, # Vertical line density - ) or "" - results.append({ - "page": i + 1, - "text_default": text, - "text_layout": text_layout, - "width": float(page.width), - "height": float(page.height), - }) + text_layout = ( + page.extract_text( + layout=True, # Use layout-aware extraction + x_density=7.25, # Horizontal character density + y_density=13, # Vertical line density + ) + or "" + ) + results.append( + { + "page": i + 1, + "text_default": text, + "text_layout": text_layout, + "width": float(page.width), + "height": float(page.height), + } + ) return results @@ -90,6 +96,7 @@ def demo_text_extraction(): # 2. Character-level and word-level extraction # --------------------------------------------------------------------------- + def demo_character_level(): """Show character-level and word-level data available via pdfplumber.""" print("\n" + "=" * 70) @@ -104,28 +111,30 @@ def demo_character_level(): print(f"\n Total characters on page 1: {len(chars)}") print("\n First 5 characters with metadata:") print(f" {'Char':<6s} {'x0':>8s} {'y0':>8s} {'x1':>8s} {'y1':>8s} {'Font':<30s} {'Size':>6s}") - print(f" {'-'*6} {'-'*8} {'-'*8} {'-'*8} {'-'*8} {'-'*30} {'-'*6}") + print(f" {'-' * 6} {'-' * 8} {'-' * 8} {'-' * 8} {'-' * 8} {'-' * 30} {'-' * 6}") for c in chars[:5]: - print(f" {repr(c['text']):<6s} {c['x0']:>8.2f} {c['top']:>8.2f} " - f"{c['x1']:>8.2f} {c['bottom']:>8.2f} " - f"{c.get('fontname', 'N/A'):<30s} {c.get('size', 0):>6.1f}") + print( + f" {repr(c['text']):<6s} {c['x0']:>8.2f} {c['top']:>8.2f} " + f"{c['x1']:>8.2f} {c['bottom']:>8.2f} " + f"{c.get('fontname', 'N/A'):<30s} {c.get('size', 0):>6.1f}" + ) # Word-level 
data words = page.extract_words() print(f"\n Total words on page 1: {len(words)}") print("\n First 10 words with bounding boxes:") print(f" {'Word':<20s} {'x0':>8s} {'top':>8s} {'x1':>8s} {'bottom':>8s}") - print(f" {'-'*20} {'-'*8} {'-'*8} {'-'*8} {'-'*8}") + print(f" {'-' * 20} {'-' * 8} {'-' * 8} {'-' * 8} {'-' * 8}") for w in words[:10]: text = w["text"][:20] - print(f" {text:<20s} {w['x0']:>8.2f} {w['top']:>8.2f} " - f"{w['x1']:>8.2f} {w['bottom']:>8.2f}") + print(f" {text:<20s} {w['x0']:>8.2f} {w['top']:>8.2f} {w['x1']:>8.2f} {w['bottom']:>8.2f}") # --------------------------------------------------------------------------- # 3. Table detection and extraction # --------------------------------------------------------------------------- + def extract_tables_from_pdf(pdf_path: Path) -> list[dict]: """ Detect and extract all tables from a PDF. @@ -138,13 +147,15 @@ def extract_tables_from_pdf(pdf_path: Path) -> list[dict]: for i, page in enumerate(pdf.pages): page_tables = page.extract_tables() for j, table in enumerate(page_tables): - tables.append({ - "page": i + 1, - "table_index": j, - "data": table, # list of rows, each row is a list of cells - "num_rows": len(table), - "num_cols": len(table[0]) if table else 0, - }) + tables.append( + { + "page": i + 1, + "table_index": j, + "data": table, # list of rows, each row is a list of cells + "num_rows": len(table), + "num_cols": len(table[0]) if table else 0, + } + ) return tables @@ -203,13 +214,13 @@ def demo_table_settings(): # Show table finder debug info table_finder = page.debug_tablefinder() - print(f"\n Table finder debug:") + print("\n Table finder debug:") print(f" Tables detected: {len(table_finder.tables)}") for idx, tbl in enumerate(table_finder.tables): bbox = tbl.bbox - print(f" Table {idx + 1} bbox: " - f"x0={bbox[0]:.1f}, top={bbox[1]:.1f}, " - f"x1={bbox[2]:.1f}, bottom={bbox[3]:.1f}") + print( + f" Table {idx + 1} bbox: x0={bbox[0]:.1f}, top={bbox[1]:.1f}, x1={bbox[2]:.1f}, 
bottom={bbox[3]:.1f}" + ) if __name__ == "__main__": diff --git a/unstructured_documents/01_pdf/03_pymupdf_extraction.py b/unstructured_documents/01_pdf/03_pymupdf_extraction.py index c6e72a8..bd9d276 100644 --- a/unstructured_documents/01_pdf/03_pymupdf_extraction.py +++ b/unstructured_documents/01_pdf/03_pymupdf_extraction.py @@ -29,6 +29,7 @@ # 1. Basic text extraction # --------------------------------------------------------------------------- + def extract_text_basic(pdf_path: Path) -> list[str]: """Extract plain text page by page using get_text().""" doc = fitz.open(str(pdf_path)) @@ -62,6 +63,7 @@ def demo_basic_extraction(): # 2. Layout-preserved extraction using "blocks" mode # --------------------------------------------------------------------------- + def extract_text_with_layout(pdf_path: Path) -> list[str]: """ Extract text preserving the original page layout. @@ -100,6 +102,7 @@ def demo_layout_extraction(): # 3. Block-level extraction with bounding boxes # --------------------------------------------------------------------------- + def extract_blocks(pdf_path: Path, page_num: int = 0) -> list[dict]: """ Extract text as blocks with position information. @@ -146,15 +149,17 @@ def demo_block_extraction(): blocks = extract_blocks(SIMPLE_TEXT_PDF, page_num=0) print(f"\n Total blocks on page 1: {len(blocks)}") print(f"\n {'Block':>5s} {'Type':<6s} {'x0':>7s} {'y0':>7s} {'x1':>7s} {'y1':>7s} Text Preview") - print(f" {'-'*5} {'-'*6} {'-'*7} {'-'*7} {'-'*7} {'-'*7} {'-'*30}") + print(f" {'-' * 5} {'-' * 6} {'-' * 7} {'-' * 7} {'-' * 7} {'-' * 7} {'-' * 30}") for b in blocks[:10]: text_preview = b["text"][:40].replace("\n", " ").strip() if len(b["text"]) > 40: text_preview += "..." 
- print(f" {b['block_no']:>5d} {b['block_type']:<6s} " - f"{b['x0']:>7.1f} {b['y0']:>7.1f} " - f"{b['x1']:>7.1f} {b['y1']:>7.1f} {text_preview}") + print( + f" {b['block_no']:>5d} {b['block_type']:<6s} " + f"{b['x0']:>7.1f} {b['y0']:>7.1f} " + f"{b['x1']:>7.1f} {b['y1']:>7.1f} {text_preview}" + ) if len(blocks) > 10: print(f"\n ... and {len(blocks) - 10} more blocks") @@ -164,6 +169,7 @@ def demo_block_extraction(): # 4. Structured dict extraction (spans with font info) # --------------------------------------------------------------------------- + def extract_structured_dict(pdf_path: Path, page_num: int = 0) -> dict: """ Extract page content as a structured dictionary with full font info. @@ -198,14 +204,14 @@ def demo_structured_extraction(): for span in line["spans"]: fonts_seen.add((span["font"], round(span["size"], 1))) - print(f"\n Unique font/size combinations found:") + print("\n Unique font/size combinations found:") for font, size in sorted(fonts_seen, key=lambda x: (-x[1], x[0])): print(f" {font:<35s} size={size}") # Show first few spans with full detail - print(f"\n First 8 text spans with details:") + print("\n First 8 text spans with details:") print(f" {'Text':<35s} {'Font':<25s} {'Size':>5s} {'Bold':>5s}") - print(f" {'-'*35} {'-'*25} {'-'*5} {'-'*5}") + print(f" {'-' * 35} {'-' * 25} {'-' * 5} {'-' * 5}") count = 0 for block in data["blocks"]: if block["type"] != 0: @@ -232,6 +238,7 @@ def demo_structured_extraction(): # 5. 
Fast batch extraction # --------------------------------------------------------------------------- + def demo_batch_extraction(): """Demonstrate fast batch extraction of all pages at once.""" print("\n" + "=" * 70) @@ -262,12 +269,9 @@ def demo_batch_extraction(): texts_c = [page.get_text("rawdict") for page in doc3] time_c = time.perf_counter() - start - print(f"\n Page-by-page (text mode): {time_a*1000:>8.2f} ms " - f"({sum(len(t) for t in texts_a):,} chars)") - print(f" Page-by-page (sorted text): {time_b*1000:>8.2f} ms " - f"({sum(len(t) for t in texts_b):,} chars)") - print(f" Page-by-page (rawdict mode): {time_c*1000:>8.2f} ms " - f"({len(texts_c)} page dicts)") + print(f"\n Page-by-page (text mode): {time_a * 1000:>8.2f} ms ({sum(len(t) for t in texts_a):,} chars)") + print(f" Page-by-page (sorted text): {time_b * 1000:>8.2f} ms ({sum(len(t) for t in texts_b):,} chars)") + print(f" Page-by-page (rawdict mode): {time_c * 1000:>8.2f} ms ({len(texts_c)} page dicts)") doc.close() doc2.close() @@ -282,6 +286,7 @@ def demo_batch_extraction(): # 6. Practical: Detect headings by font size # --------------------------------------------------------------------------- + def detect_headings(pdf_path: Path) -> list[dict]: """ Use font-size analysis to detect headings automatically. 
@@ -315,13 +320,15 @@ def detect_headings(pdf_path: Path) -> list[dict]: for line in block["lines"]: for span in line["spans"]: if span["size"] > body_size + 1 and span["text"].strip(): - headings.append({ - "page": page_num + 1, - "text": span["text"].strip(), - "font_size": round(span["size"], 1), - "font": span["font"], - "y_position": round(span["origin"][1], 1), - }) + headings.append( + { + "page": page_num + 1, + "text": span["text"].strip(), + "font_size": round(span["size"], 1), + "font": span["font"], + "y_position": round(span["origin"][1], 1), + } + ) doc.close() return headings @@ -336,7 +343,7 @@ def demo_heading_detection(): headings = detect_headings(SIMPLE_TEXT_PDF) print(f"\n Headings detected: {len(headings)}") print(f"\n {'Page':>4s} {'Size':>5s} Text") - print(f" {'-'*4} {'-'*5} {'-'*50}") + print(f" {'-' * 4} {'-' * 5} {'-' * 50}") for h in headings: print(f" {h['page']:>4d} {h['font_size']:>5.1f} {h['text'][:60]}") diff --git a/unstructured_documents/01_pdf/04_table_extraction.py b/unstructured_documents/01_pdf/04_table_extraction.py index bac9552..38d2083 100644 --- a/unstructured_documents/01_pdf/04_table_extraction.py +++ b/unstructured_documents/01_pdf/04_table_extraction.py @@ -33,6 +33,7 @@ # Core extraction # --------------------------------------------------------------------------- + def extract_all_tables(pdf_path: Path) -> list[dict]: """ Extract every table from every page of a PDF. 
@@ -54,18 +55,18 @@ def extract_all_tables(pdf_path: Path) -> list[dict]: # Clean cell values: replace None with empty string, strip whitespace cleaned = [] for row in table: - cleaned.append([ - (cell.strip() if cell else "") for cell in row - ]) + cleaned.append([(cell.strip() if cell else "") for cell in row]) header = cleaned[0] rows = cleaned[1:] - results.append({ - "page": page_num + 1, - "table_index": table_idx, - "header": header, - "rows": rows, - "raw": table, - }) + results.append( + { + "page": page_num + 1, + "table_index": table_idx, + "header": header, + "rows": rows, + "raw": table, + } + ) return results @@ -73,6 +74,7 @@ def extract_all_tables(pdf_path: Path) -> list[dict]: # Format converters # --------------------------------------------------------------------------- + def table_to_list_of_dicts(header: list[str], rows: list[list[str]]) -> list[dict]: """Convert a table to a list of dictionaries (one per row).""" return [dict(zip(header, row)) for row in rows] @@ -145,6 +147,7 @@ def table_to_natural_language( # Demonstrations # --------------------------------------------------------------------------- + def demo_extract_tables(): """Extract and display all tables.""" print("=" * 70) @@ -266,12 +269,14 @@ def demo_rag_preparation(): # Strategy 1: One passage per table (markdown format) md_passage = f"## {title}\n\n" + table_to_markdown(t["header"], t["rows"]) - all_passages.append({ - "type": "table_markdown", - "source": f"tables.pdf, page {t['page']}", - "title": title, - "content": md_passage, - }) + all_passages.append( + { + "type": "table_markdown", + "source": f"tables.pdf, page {t['page']}", + "title": title, + "content": md_passage, + } + ) # Strategy 2: One passage per row (natural language) for row_idx, row in enumerate(t["rows"]): @@ -280,12 +285,14 @@ def demo_rag_preparation(): if val: parts.append(f"{col} is {val}") sentence = f"In the {title} table: " + ", ".join(parts) + "." 
- all_passages.append({ - "type": "table_row_nl", - "source": f"tables.pdf, page {t['page']}, row {row_idx + 1}", - "title": title, - "content": sentence, - }) + all_passages.append( + { + "type": "table_row_nl", + "source": f"tables.pdf, page {t['page']}, row {row_idx + 1}", + "title": title, + "content": sentence, + } + ) print(f"\n Total RAG passages generated: {len(all_passages)}") diff --git a/unstructured_documents/01_pdf/05_ocr_extraction.py b/unstructured_documents/01_pdf/05_ocr_extraction.py index 9b31e1f..de2d50f 100644 --- a/unstructured_documents/01_pdf/05_ocr_extraction.py +++ b/unstructured_documents/01_pdf/05_ocr_extraction.py @@ -60,6 +60,7 @@ def check_dependencies() -> tuple[bool, list[str]]: if "pytesseract" not in [m.split()[0] for m in missing]: try: import pytesseract + pytesseract.get_tesseract_version() except Exception: missing.append("tesseract-ocr system binary (brew install tesseract / apt install tesseract-ocr)") @@ -67,7 +68,7 @@ def check_dependencies() -> tuple[bool, list[str]]: # Check poppler (needed by pdf2image) if "pdf2image" not in [m.split()[0] for m in missing]: try: - from pdf2image import convert_from_path + pass # Try a quick conversion to check poppler is available # We do not actually convert here; just check the import path except Exception: @@ -103,6 +104,7 @@ def print_installation_guide(missing: list[str]): # OCR extraction functions # --------------------------------------------------------------------------- + def ocr_extract_text(pdf_path: Path, dpi: int = 300) -> list[dict]: """ Extract text from a PDF using OCR. @@ -117,8 +119,8 @@ def ocr_extract_text(pdf_path: Path, dpi: int = 300) -> list[dict]: dpi: Resolution for page-to-image conversion (higher = better accuracy but slower). 300 DPI is a good default. 
""" - from pdf2image import convert_from_path import pytesseract + from pdf2image import convert_from_path # Convert PDF pages to images images = convert_from_path(str(pdf_path), dpi=dpi) @@ -132,19 +134,18 @@ def ocr_extract_text(pdf_path: Path, dpi: int = 300) -> list[dict]: data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT) # Calculate average confidence (excluding empty/low-confidence entries) - confidences = [ - int(c) for c, t in zip(data["conf"], data["text"]) - if int(c) > 0 and t.strip() - ] + confidences = [int(c) for c, t in zip(data["conf"], data["text"]) if int(c) > 0 and t.strip()] avg_confidence = sum(confidences) / len(confidences) if confidences else 0 - results.append({ - "page": i + 1, - "text": text, - "image_size": image.size, - "avg_confidence": round(avg_confidence, 1), - "word_count": len([t for t in data["text"] if t.strip()]), - }) + results.append( + { + "page": i + 1, + "text": text, + "image_size": image.size, + "avg_confidence": round(avg_confidence, 1), + "word_count": len([t for t in data["text"] if t.strip()]), + } + ) return results @@ -161,9 +162,9 @@ def ocr_extract_with_preprocessing(pdf_path: Path, dpi: int = 300) -> list[dict] These steps are especially helpful for low-quality scans. 
""" - from pdf2image import convert_from_path - from PIL import Image, ImageFilter import pytesseract + from pdf2image import convert_from_path + from PIL import ImageFilter images = convert_from_path(str(pdf_path), dpi=dpi) @@ -184,11 +185,13 @@ def ocr_extract_with_preprocessing(pdf_path: Path, dpi: int = 300) -> list[dict] # Extract text from preprocessed image text = pytesseract.image_to_string(binarized) - results.append({ - "page": i + 1, - "text": text, - "preprocessing": "grayscale -> sharpen -> binarize", - }) + results.append( + { + "page": i + 1, + "text": text, + "preprocessing": "grayscale -> sharpen -> binarize", + } + ) return results @@ -201,8 +204,8 @@ def ocr_extract_layout(pdf_path: Path, dpi: int = 300) -> list[dict]: word, line, and paragraph. This is useful for preserving document layout from scanned PDFs. """ - from pdf2image import convert_from_path import pytesseract + from pdf2image import convert_from_path images = convert_from_path(str(pdf_path), dpi=dpi) @@ -218,7 +221,11 @@ def ocr_extract_layout(pdf_path: Path, dpi: int = 300) -> list[dict]: lines = {} for j in range(len(data["text"])): if data["text"][j].strip(): - line_key = (data["block_num"][j], data["par_num"][j], data["line_num"][j]) + line_key = ( + data["block_num"][j], + data["par_num"][j], + data["line_num"][j], + ) if line_key not in lines: lines[line_key] = { "words": [], @@ -227,18 +234,20 @@ def ocr_extract_layout(pdf_path: Path, dpi: int = 300) -> list[dict]: } lines[line_key]["words"].append(data["text"][j]) - results.append({ - "page": i + 1, - "num_lines": len(lines), - "lines": [ - { - "text": " ".join(line["words"]), - "position": {"left": line["left"], "top": line["top"]}, - } - for line in lines.values() - ], - "hocr_size": len(hocr), - }) + results.append( + { + "page": i + 1, + "num_lines": len(lines), + "lines": [ + { + "text": " ".join(line["words"]), + "position": {"left": line["left"], "top": line["top"]}, + } + for line in lines.values() + ], + 
"hocr_size": len(hocr), + } + ) return results @@ -247,6 +256,7 @@ def ocr_extract_layout(pdf_path: Path, dpi: int = 300) -> list[dict]: # Demonstrations # --------------------------------------------------------------------------- + def demo_basic_ocr(): """Demonstrate basic OCR extraction.""" print("\n" + "=" * 70) @@ -297,7 +307,7 @@ def demo_layout_ocr(): print(f"\n --- Page {r['page']} ---") print(f" Lines detected: {r['num_lines']}") print(f" hOCR output size: {r['hocr_size']:,} bytes") - print(f"\n First 10 lines with positions:") + print("\n First 10 lines with positions:") for line in r["lines"][:10]: pos = line["position"] print(f" [{pos['left']:>4d}, {pos['top']:>4d}] {line['text'][:60]}") @@ -318,6 +328,7 @@ def demo_ocr_vs_text(): # Direct extraction with PyMuPDF try: import fitz + doc = fitz.open(str(SIMPLE_TEXT_PDF)) direct_text = "\n".join(page.get_text() for page in doc) doc.close() diff --git a/unstructured_documents/01_pdf/06_comparison.py b/unstructured_documents/01_pdf/06_comparison.py index 59e8b0f..8c1afdd 100644 --- a/unstructured_documents/01_pdf/06_comparison.py +++ b/unstructured_documents/01_pdf/06_comparison.py @@ -34,6 +34,7 @@ # Extraction wrappers # --------------------------------------------------------------------------- + def extract_pypdf(pdf_path: Path) -> str: """Extract text using pypdf.""" reader = PdfReader(str(pdf_path)) @@ -90,6 +91,7 @@ def extract_pymupdf_sorted(pdf_path: Path) -> str: # Timing utility # --------------------------------------------------------------------------- + def time_extraction(func, pdf_path: Path, runs: int = 5) -> tuple[str, float]: """Run an extraction function multiple times and return text + avg time.""" times = [] @@ -107,6 +109,7 @@ def time_extraction(func, pdf_path: Path, runs: int = 5) -> tuple[str, float]: # 1. 
Text extraction comparison # --------------------------------------------------------------------------- + def compare_text_extraction(): """Compare all methods on simple_text.pdf.""" print("=" * 70) @@ -127,13 +130,15 @@ def compare_text_extraction(): text, avg_ms = time_extraction(func, SIMPLE_TEXT_PDF, runs=5) word_count = len(text.split()) line_count = len(text.strip().split("\n")) - results.append({ - "Method": name, - "Chars": f"{len(text):,}", - "Words": f"{word_count:,}", - "Lines": f"{line_count:,}", - "Avg Time (ms)": f"{avg_ms:.1f}", - }) + results.append( + { + "Method": name, + "Chars": f"{len(text):,}", + "Words": f"{word_count:,}", + "Lines": f"{line_count:,}", + "Avg Time (ms)": f"{avg_ms:.1f}", + } + ) texts[name] = text # Print comparison table @@ -156,6 +161,7 @@ def compare_text_extraction(): # 2. Table extraction comparison # --------------------------------------------------------------------------- + def compare_table_extraction(): """Compare table extraction on tables.pdf.""" print("\n" + "=" * 70) @@ -209,18 +215,30 @@ def compare_table_extraction(): print(f" Text extracted: {len(mupdf_text):,} chars") print(f" Time: {mupdf_time:.1f} ms") - print(f" Note: PyMuPDF extracts table content as text but does not") - print(f" detect table structure (rows/columns). Use pdfplumber for that.") + print(" Note: PyMuPDF extracts table content as text but does not") + print(" detect table structure (rows/columns). 
Use pdfplumber for that.") # Summary table print("\n --- Table Extraction Summary ---") summary = [ - {"Method": "pdfplumber (default)", "Tables Found": len(plumber_tables), - "Time (ms)": f"{plumber_time:.1f}", "Structured": "Yes"}, - {"Method": "pdfplumber (text strategy)", "Tables Found": len(custom_tables), - "Time (ms)": f"{custom_time:.1f}", "Structured": "Yes"}, - {"Method": "PyMuPDF (text only)", "Tables Found": "N/A", - "Time (ms)": f"{mupdf_time:.1f}", "Structured": "No"}, + { + "Method": "pdfplumber (default)", + "Tables Found": len(plumber_tables), + "Time (ms)": f"{plumber_time:.1f}", + "Structured": "Yes", + }, + { + "Method": "pdfplumber (text strategy)", + "Tables Found": len(custom_tables), + "Time (ms)": f"{custom_time:.1f}", + "Structured": "Yes", + }, + { + "Method": "PyMuPDF (text only)", + "Tables Found": "N/A", + "Time (ms)": f"{mupdf_time:.1f}", + "Structured": "No", + }, ] print(f"\n{tabulate(summary, headers='keys', tablefmt='grid')}") @@ -229,6 +247,7 @@ def compare_table_extraction(): # 3. Mixed content comparison # --------------------------------------------------------------------------- + def compare_mixed_content(): """Compare methods on mixed_content.pdf (text + tables + bullets).""" print("\n" + "=" * 70) @@ -248,12 +267,14 @@ def compare_mixed_content(): results = [] for name, func in methods: text, avg_ms = time_extraction(func, MIXED_CONTENT_PDF, runs=5) - results.append({ - "Method": name, - "Chars": f"{len(text):,}", - "Words": f"{len(text.split()):,}", - "Time (ms)": f"{avg_ms:.1f}", - }) + results.append( + { + "Method": name, + "Chars": f"{len(text):,}", + "Words": f"{len(text.split()):,}", + "Time (ms)": f"{avg_ms:.1f}", + } + ) print(f"\n{tabulate(results, headers='keys', tablefmt='grid')}") @@ -267,6 +288,7 @@ def compare_mixed_content(): # 4. 
Recommendation summary # --------------------------------------------------------------------------- + def print_recommendations(): """Print a summary of when to use each method.""" print("\n" + "=" * 70) diff --git a/unstructured_documents/01_pdf/sample_docs/generate_samples.py b/unstructured_documents/01_pdf/sample_docs/generate_samples.py index 34fe06c..a207240 100644 --- a/unstructured_documents/01_pdf/sample_docs/generate_samples.py +++ b/unstructured_documents/01_pdf/sample_docs/generate_samples.py @@ -14,10 +14,9 @@ from pathlib import Path from reportlab.lib import colors -from reportlab.lib.enums import TA_CENTER, TA_JUSTIFY, TA_LEFT +from reportlab.lib.enums import TA_CENTER, TA_JUSTIFY from reportlab.lib.pagesizes import letter from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet -from reportlab.lib.units import inch from reportlab.platypus import ( Flowable, Frame, @@ -37,29 +36,48 @@ # 1) simple_text.pdf # --------------------------------------------------------------------------- + def generate_simple_text(): """Create a multi-page document about Artificial Intelligence.""" path = SAMPLE_DIR / "simple_text.pdf" - doc = SimpleDocTemplate(str(path), pagesize=letter, - topMargin=72, bottomMargin=72, - leftMargin=72, rightMargin=72) + doc = SimpleDocTemplate( + str(path), + pagesize=letter, + topMargin=72, + bottomMargin=72, + leftMargin=72, + rightMargin=72, + ) styles = getSampleStyleSheet() title_style = ParagraphStyle( - "CustomTitle", parent=styles["Title"], fontSize=24, - spaceAfter=20, alignment=TA_CENTER, + "CustomTitle", + parent=styles["Title"], + fontSize=24, + spaceAfter=20, + alignment=TA_CENTER, ) heading_style = ParagraphStyle( - "CustomHeading", parent=styles["Heading1"], fontSize=16, - spaceBefore=18, spaceAfter=10, + "CustomHeading", + parent=styles["Heading1"], + fontSize=16, + spaceBefore=18, + spaceAfter=10, ) subheading_style = ParagraphStyle( - "CustomSubheading", parent=styles["Heading2"], fontSize=13, - 
spaceBefore=14, spaceAfter=8, + "CustomSubheading", + parent=styles["Heading2"], + fontSize=13, + spaceBefore=14, + spaceAfter=8, ) body_style = ParagraphStyle( - "CustomBody", parent=styles["BodyText"], fontSize=11, - leading=16, spaceAfter=10, alignment=TA_JUSTIFY, + "CustomBody", + parent=styles["BodyText"], + fontSize=11, + leading=16, + spaceAfter=10, + alignment=TA_JUSTIFY, ) story = [] @@ -70,245 +88,293 @@ def generate_simple_text(): # ---- Section 1 ---- story.append(Paragraph("1. Introduction to Artificial Intelligence", heading_style)) - story.append(Paragraph( - "Artificial Intelligence (AI) is a branch of computer science that aims to create " - "systems capable of performing tasks that normally require human intelligence. These " - "tasks include visual perception, speech recognition, decision-making, and language " - "translation. The field was founded on the claim that human intelligence can be so " - "precisely described that a machine can be made to simulate it.", - body_style, - )) - story.append(Paragraph( - "The concept of artificial intelligence has been part of human imagination for " - "centuries, but the formal field of AI research was established in 1956 at the " - "Dartmouth Conference. Since then, AI has experienced several cycles of optimism " - "and disappointment, known as AI winters, followed by renewed enthusiasm and " - "funding. The current era of AI, driven by deep learning and big data, has " - "achieved remarkable breakthroughs across numerous domains.", - body_style, - )) - story.append(Paragraph( - "Modern AI systems are powered by machine learning algorithms that learn patterns " - "from vast amounts of data. Unlike traditional software that follows explicit " - "rules written by programmers, machine learning systems improve their performance " - "through experience. 
This paradigm shift has enabled applications that were " - "previously thought impossible, from self-driving cars to protein structure " - "prediction.", - body_style, - )) + story.append( + Paragraph( + "Artificial Intelligence (AI) is a branch of computer science that aims to create " + "systems capable of performing tasks that normally require human intelligence. These " + "tasks include visual perception, speech recognition, decision-making, and language " + "translation. The field was founded on the claim that human intelligence can be so " + "precisely described that a machine can be made to simulate it.", + body_style, + ) + ) + story.append( + Paragraph( + "The concept of artificial intelligence has been part of human imagination for " + "centuries, but the formal field of AI research was established in 1956 at the " + "Dartmouth Conference. Since then, AI has experienced several cycles of optimism " + "and disappointment, known as AI winters, followed by renewed enthusiasm and " + "funding. The current era of AI, driven by deep learning and big data, has " + "achieved remarkable breakthroughs across numerous domains.", + body_style, + ) + ) + story.append( + Paragraph( + "Modern AI systems are powered by machine learning algorithms that learn patterns " + "from vast amounts of data. Unlike traditional software that follows explicit " + "rules written by programmers, machine learning systems improve their performance " + "through experience. This paradigm shift has enabled applications that were " + "previously thought impossible, from self-driving cars to protein structure " + "prediction.", + body_style, + ) + ) # ---- Section 2 ---- story.append(Paragraph("2. Machine Learning Fundamentals", heading_style)) story.append(Paragraph("2.1 Supervised Learning", subheading_style)) - story.append(Paragraph( - "Supervised learning is the most common form of machine learning. 
In this " - "paradigm, the algorithm is trained on labeled data, where each input example " - "is paired with the correct output. The model learns to map inputs to outputs " - "by minimizing the difference between its predictions and the actual labels. " - "Common supervised learning tasks include classification, where the goal is to " - "assign inputs to discrete categories, and regression, where the goal is to " - "predict continuous values.", - body_style, - )) - story.append(Paragraph( - "Popular supervised learning algorithms include linear regression, logistic " - "regression, support vector machines, decision trees, random forests, and " - "neural networks. The choice of algorithm depends on the nature of the data, " - "the complexity of the relationship between inputs and outputs, and the amount " - "of available training data. Cross-validation and regularization techniques " - "help prevent overfitting, where the model memorizes training data rather than " - "learning generalizable patterns.", - body_style, - )) + story.append( + Paragraph( + "Supervised learning is the most common form of machine learning. In this " + "paradigm, the algorithm is trained on labeled data, where each input example " + "is paired with the correct output. The model learns to map inputs to outputs " + "by minimizing the difference between its predictions and the actual labels. " + "Common supervised learning tasks include classification, where the goal is to " + "assign inputs to discrete categories, and regression, where the goal is to " + "predict continuous values.", + body_style, + ) + ) + story.append( + Paragraph( + "Popular supervised learning algorithms include linear regression, logistic " + "regression, support vector machines, decision trees, random forests, and " + "neural networks. The choice of algorithm depends on the nature of the data, " + "the complexity of the relationship between inputs and outputs, and the amount " + "of available training data. 
Cross-validation and regularization techniques " + "help prevent overfitting, where the model memorizes training data rather than " + "learning generalizable patterns.", + body_style, + ) + ) story.append(Paragraph("2.2 Unsupervised Learning", subheading_style)) - story.append(Paragraph( - "Unsupervised learning works with unlabeled data, seeking to discover hidden " - "patterns and structures. Clustering algorithms such as K-means, hierarchical " - "clustering, and DBSCAN group similar data points together. Dimensionality " - "reduction techniques like PCA and t-SNE help visualize high-dimensional data. " - "Generative models learn the underlying distribution of data and can create " - "new samples that resemble the training data.", - body_style, - )) - story.append(Paragraph( - "Unsupervised learning is particularly valuable when labeled data is scarce or " - "expensive to obtain. It is used extensively in customer segmentation, anomaly " - "detection, topic modeling, and feature extraction. Auto-encoders and variational " - "auto-encoders are neural network architectures commonly used for unsupervised " - "representation learning.", - body_style, - )) + story.append( + Paragraph( + "Unsupervised learning works with unlabeled data, seeking to discover hidden " + "patterns and structures. Clustering algorithms such as K-means, hierarchical " + "clustering, and DBSCAN group similar data points together. Dimensionality " + "reduction techniques like PCA and t-SNE help visualize high-dimensional data. " + "Generative models learn the underlying distribution of data and can create " + "new samples that resemble the training data.", + body_style, + ) + ) + story.append( + Paragraph( + "Unsupervised learning is particularly valuable when labeled data is scarce or " + "expensive to obtain. It is used extensively in customer segmentation, anomaly " + "detection, topic modeling, and feature extraction. 
Auto-encoders and variational " + "auto-encoders are neural network architectures commonly used for unsupervised " + "representation learning.", + body_style, + ) + ) story.append(Paragraph("2.3 Reinforcement Learning", subheading_style)) - story.append(Paragraph( - "Reinforcement learning (RL) involves an agent that learns to make decisions by " - "interacting with an environment. The agent receives rewards or penalties based on " - "its actions and aims to maximize cumulative reward over time. RL has achieved " - "impressive results in game playing, with systems like AlphaGo defeating world " - "champions in Go, and AlphaStar reaching grandmaster level in StarCraft II.", - body_style, - )) - story.append(Paragraph( - "Key concepts in reinforcement learning include the Markov Decision Process (MDP), " - "value functions, policy gradients, and the exploration-exploitation trade-off. " - "Deep reinforcement learning combines neural networks with RL algorithms, enabling " - "agents to handle high-dimensional state spaces. Applications include robotics " - "control, recommendation systems, resource management, and autonomous driving.", - body_style, - )) + story.append( + Paragraph( + "Reinforcement learning (RL) involves an agent that learns to make decisions by " + "interacting with an environment. The agent receives rewards or penalties based on " + "its actions and aims to maximize cumulative reward over time. RL has achieved " + "impressive results in game playing, with systems like AlphaGo defeating world " + "champions in Go, and AlphaStar reaching grandmaster level in StarCraft II.", + body_style, + ) + ) + story.append( + Paragraph( + "Key concepts in reinforcement learning include the Markov Decision Process (MDP), " + "value functions, policy gradients, and the exploration-exploitation trade-off. " + "Deep reinforcement learning combines neural networks with RL algorithms, enabling " + "agents to handle high-dimensional state spaces. 
Applications include robotics " + "control, recommendation systems, resource management, and autonomous driving.", + body_style, + ) + ) # ---- Section 3 ---- story.append(Paragraph("3. Deep Learning and Neural Networks", heading_style)) - story.append(Paragraph( - "Deep learning is a subset of machine learning based on artificial neural networks " - "with multiple layers. These deep networks can learn hierarchical representations " - "of data, with each layer capturing increasingly abstract features. The input layer " - "receives raw data, hidden layers process and transform it, and the output layer " - "produces the final result. Backpropagation is the primary algorithm for training " - "deep neural networks, computing gradients of the loss function with respect to " - "each weight through the chain rule of calculus.", - body_style, - )) - story.append(Paragraph( - "Convolutional Neural Networks (CNNs) are specialized for processing grid-like data " - "such as images. They use convolutional layers to automatically learn spatial " - "hierarchies of features, from edges and textures to objects and scenes. Recurrent " - "Neural Networks (RNNs) and their variants, Long Short-Term Memory (LSTM) and " - "Gated Recurrent Unit (GRU), are designed for sequential data such as text and " - "time series.", - body_style, - )) - story.append(Paragraph( - "The Transformer architecture, introduced in 2017, has revolutionized natural " - "language processing and is increasingly used in computer vision and other domains. " - "Transformers rely on self-attention mechanisms to capture long-range dependencies " - "in data without the sequential processing limitations of RNNs. 
Large language " - "models (LLMs) like GPT and BERT are based on the Transformer architecture and " - "have demonstrated remarkable capabilities in text generation, translation, " - "summarization, and question answering.", - body_style, - )) + story.append( + Paragraph( + "Deep learning is a subset of machine learning based on artificial neural networks " + "with multiple layers. These deep networks can learn hierarchical representations " + "of data, with each layer capturing increasingly abstract features. The input layer " + "receives raw data, hidden layers process and transform it, and the output layer " + "produces the final result. Backpropagation is the primary algorithm for training " + "deep neural networks, computing gradients of the loss function with respect to " + "each weight through the chain rule of calculus.", + body_style, + ) + ) + story.append( + Paragraph( + "Convolutional Neural Networks (CNNs) are specialized for processing grid-like data " + "such as images. They use convolutional layers to automatically learn spatial " + "hierarchies of features, from edges and textures to objects and scenes. Recurrent " + "Neural Networks (RNNs) and their variants, Long Short-Term Memory (LSTM) and " + "Gated Recurrent Unit (GRU), are designed for sequential data such as text and " + "time series.", + body_style, + ) + ) + story.append( + Paragraph( + "The Transformer architecture, introduced in 2017, has revolutionized natural " + "language processing and is increasingly used in computer vision and other domains. " + "Transformers rely on self-attention mechanisms to capture long-range dependencies " + "in data without the sequential processing limitations of RNNs. 
Large language " + "models (LLMs) like GPT and BERT are based on the Transformer architecture and " + "have demonstrated remarkable capabilities in text generation, translation, " + "summarization, and question answering.", + body_style, + ) + ) story.append(PageBreak()) # ---- Section 4 ---- story.append(Paragraph("4. Natural Language Processing", heading_style)) - story.append(Paragraph( - "Natural Language Processing (NLP) is a field at the intersection of computer " - "science, artificial intelligence, and linguistics. It focuses on enabling computers " - "to understand, interpret, and generate human language. NLP encompasses a wide range " - "of tasks, including text classification, named entity recognition, sentiment " - "analysis, machine translation, text summarization, and question answering.", - body_style, - )) - story.append(Paragraph( - "The evolution of NLP has progressed from rule-based systems to statistical methods " - "and finally to deep learning approaches. Word embeddings like Word2Vec, GloVe, and " - "FastText represent words as dense vectors in a continuous space, capturing semantic " - "relationships. Contextual embeddings from models like ELMo, BERT, and GPT provide " - "word representations that vary based on context, significantly improving performance " - "on downstream tasks.", - body_style, - )) - story.append(Paragraph( - "Retrieval-Augmented Generation (RAG) is an emerging paradigm that combines the " - "strengths of retrieval systems and generative models. In RAG, a retrieval component " - "first finds relevant documents from a knowledge base, and then a generative model " - "uses those documents as context to produce accurate, grounded responses. 
This " - "approach helps mitigate hallucination in language models and enables them to access " - "up-to-date information without retraining.", - body_style, - )) + story.append( + Paragraph( + "Natural Language Processing (NLP) is a field at the intersection of computer " + "science, artificial intelligence, and linguistics. It focuses on enabling computers " + "to understand, interpret, and generate human language. NLP encompasses a wide range " + "of tasks, including text classification, named entity recognition, sentiment " + "analysis, machine translation, text summarization, and question answering.", + body_style, + ) + ) + story.append( + Paragraph( + "The evolution of NLP has progressed from rule-based systems to statistical methods " + "and finally to deep learning approaches. Word embeddings like Word2Vec, GloVe, and " + "FastText represent words as dense vectors in a continuous space, capturing semantic " + "relationships. Contextual embeddings from models like ELMo, BERT, and GPT provide " + "word representations that vary based on context, significantly improving performance " + "on downstream tasks.", + body_style, + ) + ) + story.append( + Paragraph( + "Retrieval-Augmented Generation (RAG) is an emerging paradigm that combines the " + "strengths of retrieval systems and generative models. In RAG, a retrieval component " + "first finds relevant documents from a knowledge base, and then a generative model " + "uses those documents as context to produce accurate, grounded responses. This " + "approach helps mitigate hallucination in language models and enables them to access " + "up-to-date information without retraining.", + body_style, + ) + ) # ---- Section 5 ---- story.append(Paragraph("5. Computer Vision", heading_style)) - story.append(Paragraph( - "Computer vision is a field of AI that enables computers to interpret and understand " - "visual information from the world. 
Key tasks include image classification, object " - "detection, semantic segmentation, instance segmentation, and image generation. " - "The field has been transformed by deep learning, particularly convolutional neural " - "networks, which have achieved human-level or superhuman performance on many " - "benchmark tasks.", - body_style, - )) - story.append(Paragraph( - "Object detection algorithms such as YOLO, SSD, and Faster R-CNN can identify and " - "localize multiple objects in an image in real time. Image segmentation models like " - "U-Net and Mask R-CNN assign labels to every pixel in an image, enabling precise " - "understanding of scene composition. Generative Adversarial Networks (GANs) and " - "diffusion models can create photorealistic images from text descriptions, " - "opening up new possibilities in creative applications.", - body_style, - )) - story.append(Paragraph( - "Transfer learning has been crucial for computer vision, allowing models pre-trained " - "on large datasets like ImageNet to be fine-tuned for specific tasks with limited " - "data. Vision Transformers (ViTs) apply the Transformer architecture to images by " - "treating image patches as tokens, achieving competitive or superior results " - "compared to CNNs. Multi-modal models that combine vision and language understanding " - "can perform tasks like visual question answering and image captioning.", - body_style, - )) + story.append( + Paragraph( + "Computer vision is a field of AI that enables computers to interpret and understand " + "visual information from the world. Key tasks include image classification, object " + "detection, semantic segmentation, instance segmentation, and image generation. 
" + "The field has been transformed by deep learning, particularly convolutional neural " + "networks, which have achieved human-level or superhuman performance on many " + "benchmark tasks.", + body_style, + ) + ) + story.append( + Paragraph( + "Object detection algorithms such as YOLO, SSD, and Faster R-CNN can identify and " + "localize multiple objects in an image in real time. Image segmentation models like " + "U-Net and Mask R-CNN assign labels to every pixel in an image, enabling precise " + "understanding of scene composition. Generative Adversarial Networks (GANs) and " + "diffusion models can create photorealistic images from text descriptions, " + "opening up new possibilities in creative applications.", + body_style, + ) + ) + story.append( + Paragraph( + "Transfer learning has been crucial for computer vision, allowing models pre-trained " + "on large datasets like ImageNet to be fine-tuned for specific tasks with limited " + "data. Vision Transformers (ViTs) apply the Transformer architecture to images by " + "treating image patches as tokens, achieving competitive or superior results " + "compared to CNNs. Multi-modal models that combine vision and language understanding " + "can perform tasks like visual question answering and image captioning.", + body_style, + ) + ) # ---- Section 6 ---- story.append(Paragraph("6. Ethics and Societal Impact", heading_style)) - story.append(Paragraph( - "As AI systems become more powerful and pervasive, ethical considerations become " - "increasingly important. Key concerns include algorithmic bias, where AI systems " - "can perpetuate or amplify existing societal biases present in training data. " - "Fairness, accountability, and transparency (FAT) are essential principles for " - "responsible AI development. 
Privacy concerns arise from the massive data " - "collection required to train AI models, and the potential for surveillance " - "and tracking.", - body_style, - )) - story.append(Paragraph( - "The impact of AI on employment is a subject of ongoing debate. While AI automates " - "certain tasks, it also creates new jobs and augments human capabilities. The " - "challenge lies in managing the transition and ensuring that the benefits of AI " - "are distributed equitably. Education and workforce development programs are " - "essential to prepare workers for an AI-driven economy.", - body_style, - )) - story.append(Paragraph( - "AI safety research focuses on ensuring that advanced AI systems remain aligned " - "with human values and intentions. This includes work on interpretability, making " - "AI decisions understandable to humans; robustness, ensuring AI systems work " - "reliably under various conditions; and alignment, ensuring AI goals match human " - "goals. International cooperation and governance frameworks are being developed " - "to address the global implications of AI technology.", - body_style, - )) + story.append( + Paragraph( + "As AI systems become more powerful and pervasive, ethical considerations become " + "increasingly important. Key concerns include algorithmic bias, where AI systems " + "can perpetuate or amplify existing societal biases present in training data. " + "Fairness, accountability, and transparency (FAT) are essential principles for " + "responsible AI development. Privacy concerns arise from the massive data " + "collection required to train AI models, and the potential for surveillance " + "and tracking.", + body_style, + ) + ) + story.append( + Paragraph( + "The impact of AI on employment is a subject of ongoing debate. While AI automates " + "certain tasks, it also creates new jobs and augments human capabilities. 
The " + "challenge lies in managing the transition and ensuring that the benefits of AI " + "are distributed equitably. Education and workforce development programs are " + "essential to prepare workers for an AI-driven economy.", + body_style, + ) + ) + story.append( + Paragraph( + "AI safety research focuses on ensuring that advanced AI systems remain aligned " + "with human values and intentions. This includes work on interpretability, making " + "AI decisions understandable to humans; robustness, ensuring AI systems work " + "reliably under various conditions; and alignment, ensuring AI goals match human " + "goals. International cooperation and governance frameworks are being developed " + "to address the global implications of AI technology.", + body_style, + ) + ) # ---- Section 7 ---- story.append(Paragraph("7. Future Directions", heading_style)) - story.append(Paragraph( - "The future of AI holds tremendous promise across many fronts. Artificial General " - "Intelligence (AGI), which would match or exceed human intelligence across all " - "cognitive tasks, remains a long-term goal of the field. Neuro-symbolic AI aims " - "to combine the pattern recognition strengths of neural networks with the " - "reasoning capabilities of symbolic AI systems.", - body_style, - )) - story.append(Paragraph( - "Edge AI brings intelligence to resource-constrained devices, enabling real-time " - "processing without cloud connectivity. Quantum machine learning explores the " - "intersection of quantum computing and AI, potentially offering exponential " - "speedups for certain types of computations. Federated learning enables " - "collaborative model training while keeping data decentralized, addressing " - "privacy concerns in healthcare, finance, and other sensitive domains.", - body_style, - )) - story.append(Paragraph( - "AI for science is accelerating discoveries in physics, chemistry, biology, and " - "materials science. 
Protein structure prediction by AlphaFold has transformed " - "structural biology. Climate modeling, drug discovery, and mathematical reasoning " - "are all areas where AI is making significant contributions. As these technologies " - "mature, the integration of AI into every aspect of human activity will continue " - "to deepen, making responsible development more important than ever.", - body_style, - )) + story.append( + Paragraph( + "The future of AI holds tremendous promise across many fronts. Artificial General " + "Intelligence (AGI), which would match or exceed human intelligence across all " + "cognitive tasks, remains a long-term goal of the field. Neuro-symbolic AI aims " + "to combine the pattern recognition strengths of neural networks with the " + "reasoning capabilities of symbolic AI systems.", + body_style, + ) + ) + story.append( + Paragraph( + "Edge AI brings intelligence to resource-constrained devices, enabling real-time " + "processing without cloud connectivity. Quantum machine learning explores the " + "intersection of quantum computing and AI, potentially offering exponential " + "speedups for certain types of computations. Federated learning enables " + "collaborative model training while keeping data decentralized, addressing " + "privacy concerns in healthcare, finance, and other sensitive domains.", + body_style, + ) + ) + story.append( + Paragraph( + "AI for science is accelerating discoveries in physics, chemistry, biology, and " + "materials science. Protein structure prediction by AlphaFold has transformed " + "structural biology. Climate modeling, drug discovery, and mathematical reasoning " + "are all areas where AI is making significant contributions. 
As these technologies " + "mature, the integration of AI into every aspect of human activity will continue " + "to deepen, making responsible development more important than ever.", + body_style, + ) + ) doc.build(story) print(f" Created: {path}") @@ -318,44 +384,66 @@ def generate_simple_text(): # 2) tables.pdf # --------------------------------------------------------------------------- + def generate_tables(): """Create a document with multiple data tables.""" path = SAMPLE_DIR / "tables.pdf" - doc = SimpleDocTemplate(str(path), pagesize=letter, - topMargin=72, bottomMargin=72, - leftMargin=54, rightMargin=54) + doc = SimpleDocTemplate( + str(path), + pagesize=letter, + topMargin=72, + bottomMargin=72, + leftMargin=54, + rightMargin=54, + ) styles = getSampleStyleSheet() title_style = ParagraphStyle( - "TblTitle", parent=styles["Title"], fontSize=20, - spaceAfter=16, alignment=TA_CENTER, + "TblTitle", + parent=styles["Title"], + fontSize=20, + spaceAfter=16, + alignment=TA_CENTER, ) heading_style = ParagraphStyle( - "TblHeading", parent=styles["Heading2"], fontSize=14, - spaceBefore=20, spaceAfter=10, + "TblHeading", + parent=styles["Heading2"], + fontSize=14, + spaceBefore=20, + spaceAfter=10, ) body_style = ParagraphStyle( - "TblBody", parent=styles["BodyText"], fontSize=10, - leading=14, spaceAfter=8, + "TblBody", + parent=styles["BodyText"], + fontSize=10, + leading=14, + spaceAfter=8, ) # Common table style def make_table_style(): - return TableStyle([ - ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#4472C4")), - ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), - ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), - ("FONTSIZE", (0, 0), (-1, 0), 10), - ("BOTTOMPADDING", (0, 0), (-1, 0), 8), - ("TOPPADDING", (0, 0), (-1, 0), 8), - ("BACKGROUND", (0, 1), (-1, -1), colors.HexColor("#D9E2F3")), - ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.HexColor("#D9E2F3"), colors.white]), - ("FONTSIZE", (0, 1), (-1, -1), 9), - ("GRID", (0, 0), (-1, -1), 0.5, colors.grey), - 
("ALIGN", (0, 0), (-1, -1), "CENTER"), - ("VALIGN", (0, 0), (-1, -1), "MIDDLE"), - ("TOPPADDING", (0, 1), (-1, -1), 5), - ("BOTTOMPADDING", (0, 1), (-1, -1), 5), - ]) + return TableStyle( + [ + ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#4472C4")), + ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), + ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), + ("FONTSIZE", (0, 0), (-1, 0), 10), + ("BOTTOMPADDING", (0, 0), (-1, 0), 8), + ("TOPPADDING", (0, 0), (-1, 0), 8), + ("BACKGROUND", (0, 1), (-1, -1), colors.HexColor("#D9E2F3")), + ( + "ROWBACKGROUNDS", + (0, 1), + (-1, -1), + [colors.HexColor("#D9E2F3"), colors.white], + ), + ("FONTSIZE", (0, 1), (-1, -1), 9), + ("GRID", (0, 0), (-1, -1), 0.5, colors.grey), + ("ALIGN", (0, 0), (-1, -1), "CENTER"), + ("VALIGN", (0, 0), (-1, -1), "MIDDLE"), + ("TOPPADDING", (0, 1), (-1, -1), 5), + ("BOTTOMPADDING", (0, 1), (-1, -1), 5), + ] + ) story = [] @@ -364,23 +452,53 @@ def make_table_style(): # --- Table 1: Product Inventory --- story.append(Paragraph("Table 1: Product Inventory", heading_style)) - story.append(Paragraph( - "Current inventory levels across warehouse locations as of Q4 2024.", - body_style, - )) + story.append( + Paragraph( + "Current inventory levels across warehouse locations as of Q4 2024.", + body_style, + ) + ) inventory_data = [ - ["Product ID", "Product Name", "Category", "Quantity", "Unit Price", "Warehouse"], + [ + "Product ID", + "Product Name", + "Category", + "Quantity", + "Unit Price", + "Warehouse", + ], ["PRD-001", "Wireless Mouse", "Electronics", "1,250", "$29.99", "Warehouse A"], - ["PRD-002", "Mechanical Keyboard", "Electronics", "843", "$79.99", "Warehouse A"], + [ + "PRD-002", + "Mechanical Keyboard", + "Electronics", + "843", + "$79.99", + "Warehouse A", + ], ["PRD-003", "USB-C Hub", "Accessories", "2,100", "$44.99", "Warehouse B"], ["PRD-004", "Monitor Stand", "Furniture", "567", "$54.99", "Warehouse C"], ["PRD-005", "Webcam HD", "Electronics", "1,890", "$64.99", "Warehouse A"], 
["PRD-006", "Desk Lamp LED", "Lighting", "3,210", "$34.99", "Warehouse B"], ["PRD-007", "Ergonomic Chair", "Furniture", "245", "$349.99", "Warehouse C"], ["PRD-008", "Laptop Stand", "Accessories", "1,670", "$39.99", "Warehouse B"], - ["PRD-009", "Noise-Canceling Headphones", "Electronics", "920", "$149.99", "Warehouse A"], - ["PRD-010", "Power Strip Surge Protector", "Accessories", "4,500", "$24.99", "Warehouse C"], + [ + "PRD-009", + "Noise-Canceling Headphones", + "Electronics", + "920", + "$149.99", + "Warehouse A", + ], + [ + "PRD-010", + "Power Strip Surge Protector", + "Accessories", + "4,500", + "$24.99", + "Warehouse C", + ], ] t1 = Table(inventory_data, repeatRows=1) t1.setStyle(make_table_style()) @@ -389,10 +507,12 @@ def make_table_style(): # --- Table 2: Quarterly Revenue --- story.append(Paragraph("Table 2: Quarterly Revenue by Region (in thousands USD)", heading_style)) - story.append(Paragraph( - "Revenue figures for fiscal year 2024 across all operating regions.", - body_style, - )) + story.append( + Paragraph( + "Revenue figures for fiscal year 2024 across all operating regions.", + body_style, + ) + ) revenue_data = [ ["Region", "Q1 2024", "Q2 2024", "Q3 2024", "Q4 2024", "Annual Total"], @@ -411,23 +531,95 @@ def make_table_style(): # --- Table 3: Employee Records --- story.append(Paragraph("Table 3: Employee Directory", heading_style)) - story.append(Paragraph( - "Key personnel across departments with their roles and contact information.", - body_style, - )) + story.append( + Paragraph( + "Key personnel across departments with their roles and contact information.", + body_style, + ) + ) employee_data = [ ["Emp ID", "Name", "Department", "Title", "Start Date", "Email"], - ["E-101", "Sarah Johnson", "Engineering", "Senior Developer", "2019-03-15", "s.johnson@example.com"], - ["E-102", "Michael Chen", "Engineering", "Tech Lead", "2018-07-22", "m.chen@example.com"], - ["E-103", "Emily Rodriguez", "Marketing", "Marketing Manager", "2020-01-10", 
"e.rodriguez@example.com"], - ["E-104", "David Kim", "Data Science", "ML Engineer", "2021-05-18", "d.kim@example.com"], - ["E-105", "Jessica Patel", "Product", "Product Manager", "2019-11-03", "j.patel@example.com"], - ["E-106", "Robert Taylor", "Engineering", "DevOps Engineer", "2020-08-25", "r.taylor@example.com"], - ["E-107", "Amanda White", "HR", "HR Director", "2017-02-14", "a.white@example.com"], - ["E-108", "James Wilson", "Finance", "Financial Analyst", "2022-04-01", "j.wilson@example.com"], - ["E-109", "Lisa Brown", "Data Science", "Data Analyst", "2021-09-12", "l.brown@example.com"], - ["E-110", "Thomas Lee", "Engineering", "Frontend Developer", "2023-01-30", "t.lee@example.com"], + [ + "E-101", + "Sarah Johnson", + "Engineering", + "Senior Developer", + "2019-03-15", + "s.johnson@example.com", + ], + [ + "E-102", + "Michael Chen", + "Engineering", + "Tech Lead", + "2018-07-22", + "m.chen@example.com", + ], + [ + "E-103", + "Emily Rodriguez", + "Marketing", + "Marketing Manager", + "2020-01-10", + "e.rodriguez@example.com", + ], + [ + "E-104", + "David Kim", + "Data Science", + "ML Engineer", + "2021-05-18", + "d.kim@example.com", + ], + [ + "E-105", + "Jessica Patel", + "Product", + "Product Manager", + "2019-11-03", + "j.patel@example.com", + ], + [ + "E-106", + "Robert Taylor", + "Engineering", + "DevOps Engineer", + "2020-08-25", + "r.taylor@example.com", + ], + [ + "E-107", + "Amanda White", + "HR", + "HR Director", + "2017-02-14", + "a.white@example.com", + ], + [ + "E-108", + "James Wilson", + "Finance", + "Financial Analyst", + "2022-04-01", + "j.wilson@example.com", + ], + [ + "E-109", + "Lisa Brown", + "Data Science", + "Data Analyst", + "2021-09-12", + "l.brown@example.com", + ], + [ + "E-110", + "Thomas Lee", + "Engineering", + "Frontend Developer", + "2023-01-30", + "t.lee@example.com", + ], ] t3 = Table(employee_data, repeatRows=1) t3.setStyle(make_table_style()) @@ -436,10 +628,12 @@ def make_table_style(): # --- Table 4: Project Status --- 
story.append(Paragraph("Table 4: Project Status Overview", heading_style)) - story.append(Paragraph( - "Active and upcoming projects with timeline and budget information.", - body_style, - )) + story.append( + Paragraph( + "Active and upcoming projects with timeline and budget information.", + body_style, + ) + ) project_data = [ ["Project", "Lead", "Status", "Start", "Deadline", "Budget"], @@ -462,8 +656,10 @@ def make_table_style(): # 3) multi_column.pdf # --------------------------------------------------------------------------- + class ColumnBreak(Flowable): """Force a break to the next column / frame.""" + def __init__(self): super().__init__() self.width = 0 @@ -491,201 +687,262 @@ def generate_multi_column(): right_frame = Frame(margin + col_w + gutter, margin, col_w, frame_h, id="right") two_col_template = PageTemplate( - id="TwoCol", frames=[left_frame, right_frame], + id="TwoCol", + frames=[left_frame, right_frame], ) - doc = SimpleDocTemplate(str(path), pagesize=letter, - topMargin=margin, bottomMargin=margin, - leftMargin=margin, rightMargin=margin) + doc = SimpleDocTemplate( + str(path), + pagesize=letter, + topMargin=margin, + bottomMargin=margin, + leftMargin=margin, + rightMargin=margin, + ) doc.addPageTemplates([two_col_template]) styles = getSampleStyleSheet() title_style = ParagraphStyle( - "ColTitle", parent=styles["Title"], fontSize=16, - spaceAfter=8, alignment=TA_CENTER, + "ColTitle", + parent=styles["Title"], + fontSize=16, + spaceAfter=8, + alignment=TA_CENTER, ) author_style = ParagraphStyle( - "ColAuthor", parent=styles["Normal"], fontSize=10, - spaceAfter=12, alignment=TA_CENTER, textColor=colors.grey, + "ColAuthor", + parent=styles["Normal"], + fontSize=10, + spaceAfter=12, + alignment=TA_CENTER, + textColor=colors.grey, ) heading_style = ParagraphStyle( - "ColHeading", parent=styles["Heading2"], fontSize=12, - spaceBefore=12, spaceAfter=6, + "ColHeading", + parent=styles["Heading2"], + fontSize=12, + spaceBefore=12, + spaceAfter=6, ) 
body_style = ParagraphStyle( - "ColBody", parent=styles["BodyText"], fontSize=9, - leading=13, spaceAfter=6, alignment=TA_JUSTIFY, + "ColBody", + parent=styles["BodyText"], + fontSize=9, + leading=13, + spaceAfter=6, + alignment=TA_JUSTIFY, ) abstract_style = ParagraphStyle( - "ColAbstract", parent=styles["BodyText"], fontSize=9, - leading=13, spaceAfter=6, alignment=TA_JUSTIFY, - leftIndent=12, rightIndent=12, + "ColAbstract", + parent=styles["BodyText"], + fontSize=9, + leading=13, + spaceAfter=6, + alignment=TA_JUSTIFY, + leftIndent=12, + rightIndent=12, ) story = [] # Title and authors (span both columns via the left frame, will reflow) - story.append(Paragraph( - "Advances in Document Understanding for Retrieval-Augmented Generation Systems", - title_style, - )) - story.append(Paragraph( - "J. Smith, A. Kumar, L. Zhang — Institute of AI Research, 2024", - author_style, - )) + story.append( + Paragraph( + "Advances in Document Understanding for Retrieval-Augmented Generation Systems", + title_style, + ) + ) + story.append( + Paragraph( + "J. Smith, A. Kumar, L. Zhang — Institute of AI Research, 2024", + author_style, + ) + ) story.append(Spacer(1, 6)) # Abstract story.append(Paragraph("Abstract", heading_style)) - story.append(Paragraph( - "This paper surveys recent advances in document understanding techniques that " - "underpin modern Retrieval-Augmented Generation (RAG) pipelines. We examine " - "methods for parsing unstructured documents including PDFs, web pages, and " - "scanned images, and evaluate their effectiveness for downstream retrieval " - "and generation tasks. Our analysis covers text extraction, layout analysis, " - "table recognition, and multimodal approaches that combine vision and language " - "models. 
We find that hybrid methods combining rule-based extraction with " - "learned representations achieve the best results across diverse document types.", - abstract_style, - )) + story.append( + Paragraph( + "This paper surveys recent advances in document understanding techniques that " + "underpin modern Retrieval-Augmented Generation (RAG) pipelines. We examine " + "methods for parsing unstructured documents including PDFs, web pages, and " + "scanned images, and evaluate their effectiveness for downstream retrieval " + "and generation tasks. Our analysis covers text extraction, layout analysis, " + "table recognition, and multimodal approaches that combine vision and language " + "models. We find that hybrid methods combining rule-based extraction with " + "learned representations achieve the best results across diverse document types.", + abstract_style, + ) + ) # Section 1 story.append(Paragraph("1. Introduction", heading_style)) - story.append(Paragraph( - "The explosion of digital documents in enterprise and academic settings has " - "created an urgent need for robust document understanding systems. Organizations " - "store vast quantities of knowledge in unstructured formats such as PDFs, Word " - "documents, presentations, and scanned images. Unlocking this knowledge for " - "AI-powered applications, particularly Retrieval-Augmented Generation (RAG), " - "requires sophisticated parsing and extraction pipelines.", - body_style, - )) - story.append(Paragraph( - "RAG systems combine retrieval from a document corpus with generative language " - "models to produce accurate, grounded responses. The quality of the retrieval " - "step depends critically on how well source documents have been parsed, chunked, " - "and indexed. 
Poor extraction leads to noisy passages that degrade both retrieval " - "precision and generation quality.", - body_style, - )) - story.append(Paragraph( - "In this paper, we provide a comprehensive analysis of document understanding " - "techniques relevant to RAG pipelines. We focus on PDF documents, which remain " - "the most prevalent format for sharing structured knowledge in business and " - "academia. We evaluate multiple extraction libraries and approaches, measuring " - "their fidelity in preserving text content, layout structure, and tabular data.", - body_style, - )) + story.append( + Paragraph( + "The explosion of digital documents in enterprise and academic settings has " + "created an urgent need for robust document understanding systems. Organizations " + "store vast quantities of knowledge in unstructured formats such as PDFs, Word " + "documents, presentations, and scanned images. Unlocking this knowledge for " + "AI-powered applications, particularly Retrieval-Augmented Generation (RAG), " + "requires sophisticated parsing and extraction pipelines.", + body_style, + ) + ) + story.append( + Paragraph( + "RAG systems combine retrieval from a document corpus with generative language " + "models to produce accurate, grounded responses. The quality of the retrieval " + "step depends critically on how well source documents have been parsed, chunked, " + "and indexed. Poor extraction leads to noisy passages that degrade both retrieval " + "precision and generation quality.", + body_style, + ) + ) + story.append( + Paragraph( + "In this paper, we provide a comprehensive analysis of document understanding " + "techniques relevant to RAG pipelines. We focus on PDF documents, which remain " + "the most prevalent format for sharing structured knowledge in business and " + "academia. 
We evaluate multiple extraction libraries and approaches, measuring " + "their fidelity in preserving text content, layout structure, and tabular data.", + body_style, + ) + ) # Section 2 story.append(Paragraph("2. Related Work", heading_style)) - story.append(Paragraph( - "Document understanding has a rich history in the document analysis and " - "recognition community. Early systems relied on rule-based approaches with " - "hand-crafted heuristics for layout segmentation. The introduction of deep " - "learning brought significant improvements, with models like LayoutLM and " - "DocFormer learning joint representations of text and layout.", - body_style, - )) - story.append(Paragraph( - "Table extraction has received particular attention due to the structured " - "nature of tabular data. Methods range from heuristic line detection to " - "deep learning approaches such as TableNet and DETR-based table detectors. " - "Recent multimodal models like Donut and Nougat can parse documents end-to-end " - "without relying on OCR as an intermediate step.", - body_style, - )) - story.append(Paragraph( - "The RAG paradigm was introduced by Lewis et al. (2020) and has since become " - "a standard approach for knowledge-intensive NLP tasks. Subsequent work has " - "explored various aspects of RAG including retrieval strategies, chunk size " - "optimization, and re-ranking methods. However, relatively little attention " - "has been paid to the document parsing stage that precedes retrieval.", - body_style, - )) + story.append( + Paragraph( + "Document understanding has a rich history in the document analysis and " + "recognition community. Early systems relied on rule-based approaches with " + "hand-crafted heuristics for layout segmentation. 
The introduction of deep " + "learning brought significant improvements, with models like LayoutLM and " + "DocFormer learning joint representations of text and layout.", + body_style, + ) + ) + story.append( + Paragraph( + "Table extraction has received particular attention due to the structured " + "nature of tabular data. Methods range from heuristic line detection to " + "deep learning approaches such as TableNet and DETR-based table detectors. " + "Recent multimodal models like Donut and Nougat can parse documents end-to-end " + "without relying on OCR as an intermediate step.", + body_style, + ) + ) + story.append( + Paragraph( + "The RAG paradigm was introduced by Lewis et al. (2020) and has since become " + "a standard approach for knowledge-intensive NLP tasks. Subsequent work has " + "explored various aspects of RAG including retrieval strategies, chunk size " + "optimization, and re-ranking methods. However, relatively little attention " + "has been paid to the document parsing stage that precedes retrieval.", + body_style, + ) + ) # Section 3 story.append(Paragraph("3. Methodology", heading_style)) - story.append(Paragraph( - "We evaluate five PDF extraction approaches: (1) PyPDF for basic text extraction, " - "(2) pdfplumber for layout-aware extraction and table detection, (3) PyMuPDF for " - "high-performance extraction with position information, (4) Tesseract OCR for " - "scanned documents, and (5) a hybrid pipeline combining multiple methods.", - body_style, - )) - story.append(Paragraph( - "Our evaluation corpus consists of 500 documents spanning four categories: " - "academic papers, technical reports, financial statements, and product manuals. " - "Each document was manually annotated with ground-truth text, table structures, " - "and layout regions. 
We measure extraction quality using character error rate " - "(CER), table structure recognition (TSR) accuracy, and reading order accuracy.", - body_style, - )) - story.append(Paragraph( - "For the RAG evaluation, we chunk extracted text using four strategies: " - "fixed-size character windows, sentence-based splitting, recursive splitting " - "with semantic boundaries, and heading-aware chunking. We then embed chunks " - "using a sentence transformer model and evaluate retrieval quality on a set " - "of 200 question-answer pairs derived from the document corpus.", - body_style, - )) + story.append( + Paragraph( + "We evaluate five PDF extraction approaches: (1) PyPDF for basic text extraction, " + "(2) pdfplumber for layout-aware extraction and table detection, (3) PyMuPDF for " + "high-performance extraction with position information, (4) Tesseract OCR for " + "scanned documents, and (5) a hybrid pipeline combining multiple methods.", + body_style, + ) + ) + story.append( + Paragraph( + "Our evaluation corpus consists of 500 documents spanning four categories: " + "academic papers, technical reports, financial statements, and product manuals. " + "Each document was manually annotated with ground-truth text, table structures, " + "and layout regions. We measure extraction quality using character error rate " + "(CER), table structure recognition (TSR) accuracy, and reading order accuracy.", + body_style, + ) + ) + story.append( + Paragraph( + "For the RAG evaluation, we chunk extracted text using four strategies: " + "fixed-size character windows, sentence-based splitting, recursive splitting " + "with semantic boundaries, and heading-aware chunking. We then embed chunks " + "using a sentence transformer model and evaluate retrieval quality on a set " + "of 200 question-answer pairs derived from the document corpus.", + body_style, + ) + ) # Section 4 story.append(Paragraph("4. 
Results", heading_style)) - story.append(Paragraph( - "Our experiments reveal significant differences between extraction methods " - "across document categories. PyMuPDF consistently achieves the lowest character " - "error rate, averaging 2.3% across all documents. pdfplumber provides the best " - "table extraction with 87% TSR accuracy, compared to 45% for PyPDF and 76% for " - "PyMuPDF. Tesseract OCR, while essential for scanned documents, introduces " - "higher error rates of 5-8% on born-digital PDFs.", - body_style, - )) - story.append(Paragraph( - "The hybrid pipeline, which selects the best extraction method based on document " - "characteristics, achieves the highest overall quality with 1.8% CER and 89% TSR " - "accuracy. Documents are first classified as born-digital or scanned using image " - "analysis, and then processed accordingly.", - body_style, - )) - story.append(Paragraph( - "For RAG retrieval quality, heading-aware chunking combined with PyMuPDF extraction " - "yields the best results, with a mean reciprocal rank (MRR) of 0.82 and recall@5 " - "of 0.91. Fixed-size chunking performs worst at MRR 0.68, while recursive splitting " - "achieves MRR 0.79. These results underscore the importance of respecting document " - "structure during chunking.", - body_style, - )) + story.append( + Paragraph( + "Our experiments reveal significant differences between extraction methods " + "across document categories. PyMuPDF consistently achieves the lowest character " + "error rate, averaging 2.3% across all documents. pdfplumber provides the best " + "table extraction with 87% TSR accuracy, compared to 45% for PyPDF and 76% for " + "PyMuPDF. 
Tesseract OCR, while essential for scanned documents, introduces " + "higher error rates of 5-8% on born-digital PDFs.", + body_style, + ) + ) + story.append( + Paragraph( + "The hybrid pipeline, which selects the best extraction method based on document " + "characteristics, achieves the highest overall quality with 1.8% CER and 89% TSR " + "accuracy. Documents are first classified as born-digital or scanned using image " + "analysis, and then processed accordingly.", + body_style, + ) + ) + story.append( + Paragraph( + "For RAG retrieval quality, heading-aware chunking combined with PyMuPDF extraction " + "yields the best results, with a mean reciprocal rank (MRR) of 0.82 and recall@5 " + "of 0.91. Fixed-size chunking performs worst at MRR 0.68, while recursive splitting " + "achieves MRR 0.79. These results underscore the importance of respecting document " + "structure during chunking.", + body_style, + ) + ) # Section 5 story.append(Paragraph("5. Discussion", heading_style)) - story.append(Paragraph( - "Our findings highlight several key insights for practitioners building RAG systems. " - "First, no single extraction method dominates across all document types, suggesting " - "that adaptive pipelines are necessary for production systems. Second, table " - "extraction remains a significant challenge, and converting tables to natural " - "language descriptions substantially improves retrieval performance.", - body_style, - )) - story.append(Paragraph( - "Third, the choice of chunking strategy has a measurable impact on RAG quality, " - "with structure-aware approaches outperforming naive splitting. Fourth, multi-column " - "layouts require special handling to preserve reading order; without layout analysis, " - "text from different columns may be interleaved, producing incoherent passages.", - body_style, - )) + story.append( + Paragraph( + "Our findings highlight several key insights for practitioners building RAG systems. 
" + "First, no single extraction method dominates across all document types, suggesting " + "that adaptive pipelines are necessary for production systems. Second, table " + "extraction remains a significant challenge, and converting tables to natural " + "language descriptions substantially improves retrieval performance.", + body_style, + ) + ) + story.append( + Paragraph( + "Third, the choice of chunking strategy has a measurable impact on RAG quality, " + "with structure-aware approaches outperforming naive splitting. Fourth, multi-column " + "layouts require special handling to preserve reading order; without layout analysis, " + "text from different columns may be interleaved, producing incoherent passages.", + body_style, + ) + ) # Section 6 story.append(Paragraph("6. Conclusion", heading_style)) - story.append(Paragraph( - "We have presented a comprehensive evaluation of document understanding techniques " - "for RAG systems. Our results demonstrate that careful attention to the parsing " - "stage significantly impacts downstream retrieval and generation quality. We " - "recommend a hybrid approach that combines multiple extraction methods with " - "structure-aware chunking for optimal results. Future work should explore " - "end-to-end learned parsing systems and their integration with RAG pipelines.", - body_style, - )) + story.append( + Paragraph( + "We have presented a comprehensive evaluation of document understanding techniques " + "for RAG systems. Our results demonstrate that careful attention to the parsing " + "stage significantly impacts downstream retrieval and generation quality. We " + "recommend a hybrid approach that combines multiple extraction methods with " + "structure-aware chunking for optimal results. 
Future work should explore " + "end-to-end learned parsing systems and their integration with RAG pipelines.", + body_style, + ) + ) # References story.append(Paragraph("References", heading_style)) @@ -697,9 +954,18 @@ def generate_multi_column(): "[5] Blecher, L. et al. (2023). Nougat: Neural Optical Understanding for Academic Documents. arXiv.", ] for ref in refs: - story.append(Paragraph(ref, ParagraphStyle( - "Ref", parent=body_style, fontSize=8, leading=11, spaceAfter=3, - ))) + story.append( + Paragraph( + ref, + ParagraphStyle( + "Ref", + parent=body_style, + fontSize=8, + leading=11, + spaceAfter=3, + ), + ) + ) doc.build(story) print(f" Created: {path}") @@ -709,51 +975,82 @@ def generate_multi_column(): # 4) mixed_content.pdf # --------------------------------------------------------------------------- + def generate_mixed_content(): """Create a document mixing text, tables, bullet points, and headers.""" path = SAMPLE_DIR / "mixed_content.pdf" - doc = SimpleDocTemplate(str(path), pagesize=letter, - topMargin=72, bottomMargin=72, - leftMargin=72, rightMargin=72) + doc = SimpleDocTemplate( + str(path), + pagesize=letter, + topMargin=72, + bottomMargin=72, + leftMargin=72, + rightMargin=72, + ) styles = getSampleStyleSheet() title_style = ParagraphStyle( - "MixTitle", parent=styles["Title"], fontSize=22, - spaceAfter=16, alignment=TA_CENTER, + "MixTitle", + parent=styles["Title"], + fontSize=22, + spaceAfter=16, + alignment=TA_CENTER, ) heading_style = ParagraphStyle( - "MixHeading", parent=styles["Heading1"], fontSize=15, - spaceBefore=18, spaceAfter=10, + "MixHeading", + parent=styles["Heading1"], + fontSize=15, + spaceBefore=18, + spaceAfter=10, ) subheading_style = ParagraphStyle( - "MixSubheading", parent=styles["Heading2"], fontSize=12, - spaceBefore=12, spaceAfter=6, + "MixSubheading", + parent=styles["Heading2"], + fontSize=12, + spaceBefore=12, + spaceAfter=6, ) body_style = ParagraphStyle( - "MixBody", parent=styles["BodyText"], fontSize=11, - 
leading=16, spaceAfter=10, alignment=TA_JUSTIFY, + "MixBody", + parent=styles["BodyText"], + fontSize=11, + leading=16, + spaceAfter=10, + alignment=TA_JUSTIFY, ) bullet_style = ParagraphStyle( - "MixBullet", parent=styles["BodyText"], fontSize=11, - leading=16, spaceAfter=4, leftIndent=36, - bulletIndent=18, bulletFontSize=11, + "MixBullet", + parent=styles["BodyText"], + fontSize=11, + leading=16, + spaceAfter=4, + leftIndent=36, + bulletIndent=18, + bulletFontSize=11, ) def make_table_style(): - return TableStyle([ - ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#2E75B6")), - ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), - ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), - ("FONTSIZE", (0, 0), (-1, 0), 10), - ("BOTTOMPADDING", (0, 0), (-1, 0), 8), - ("TOPPADDING", (0, 0), (-1, 0), 8), - ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.HexColor("#DAEEF3"), colors.white]), - ("GRID", (0, 0), (-1, -1), 0.5, colors.grey), - ("ALIGN", (0, 0), (-1, -1), "CENTER"), - ("VALIGN", (0, 0), (-1, -1), "MIDDLE"), - ("TOPPADDING", (0, 1), (-1, -1), 5), - ("BOTTOMPADDING", (0, 1), (-1, -1), 5), - ]) + return TableStyle( + [ + ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#2E75B6")), + ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), + ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), + ("FONTSIZE", (0, 0), (-1, 0), 10), + ("BOTTOMPADDING", (0, 0), (-1, 0), 8), + ("TOPPADDING", (0, 0), (-1, 0), 8), + ( + "ROWBACKGROUNDS", + (0, 1), + (-1, -1), + [colors.HexColor("#DAEEF3"), colors.white], + ), + ("GRID", (0, 0), (-1, -1), 0.5, colors.grey), + ("ALIGN", (0, 0), (-1, -1), "CENTER"), + ("VALIGN", (0, 0), (-1, -1), "MIDDLE"), + ("TOPPADDING", (0, 1), (-1, -1), 5), + ("BOTTOMPADDING", (0, 1), (-1, -1), 5), + ] + ) story = [] @@ -763,14 +1060,16 @@ def make_table_style(): # --- Introduction --- story.append(Paragraph("Executive Summary", heading_style)) - story.append(Paragraph( - "This report provides an analysis of the most significant technology trends " - "shaping the industry in 
2024. From the rapid adoption of generative AI to " - "advances in quantum computing, these trends are reshaping how businesses " - "operate and compete. Understanding these trends is essential for strategic " - "planning and technology investment decisions.", - body_style, - )) + story.append( + Paragraph( + "This report provides an analysis of the most significant technology trends " + "shaping the industry in 2024. From the rapid adoption of generative AI to " + "advances in quantum computing, these trends are reshaping how businesses " + "operate and compete. Understanding these trends is essential for strategic " + "planning and technology investment decisions.", + body_style, + ) + ) # --- Key Findings (bullet points) --- story.append(Paragraph("Key Findings", heading_style)) @@ -789,23 +1088,27 @@ def make_table_style(): # --- Section with text --- story.append(Paragraph("Generative AI in the Enterprise", heading_style)) - story.append(Paragraph( - "Generative AI has emerged as the defining technology of 2024. Large language " - "models (LLMs) are being deployed across industries for tasks ranging from " - "customer support automation to code generation and content creation. " - "Retrieval-Augmented Generation (RAG) has become the preferred architecture " - "for enterprise AI applications, combining the fluency of generative models " - "with the accuracy of information retrieval.", - body_style, - )) - story.append(Paragraph( - "Organizations are investing heavily in data infrastructure to support their " - "AI initiatives. Vector databases, embedding pipelines, and document processing " - "systems form the backbone of modern RAG deployments. The ability to accurately " - "extract and structure information from unstructured documents is a critical " - "capability that directly impacts AI application quality.", - body_style, - )) + story.append( + Paragraph( + "Generative AI has emerged as the defining technology of 2024. 
Large language " + "models (LLMs) are being deployed across industries for tasks ranging from " + "customer support automation to code generation and content creation. " + "Retrieval-Augmented Generation (RAG) has become the preferred architecture " + "for enterprise AI applications, combining the fluency of generative models " + "with the accuracy of information retrieval.", + body_style, + ) + ) + story.append( + Paragraph( + "Organizations are investing heavily in data infrastructure to support their " + "AI initiatives. Vector databases, embedding pipelines, and document processing " + "systems form the backbone of modern RAG deployments. The ability to accurately " + "extract and structure information from unstructured documents is a critical " + "capability that directly impacts AI application quality.", + body_style, + ) + ) # --- Table: AI Adoption --- story.append(Paragraph("AI Adoption by Industry", subheading_style)) @@ -825,14 +1128,16 @@ def make_table_style(): # --- Another text section --- story.append(Paragraph("Cloud and Infrastructure Trends", heading_style)) - story.append(Paragraph( - "The cloud computing landscape continues to evolve rapidly. Multi-cloud and " - "hybrid-cloud strategies have become the norm, with organizations distributing " - "workloads across multiple providers to optimize cost, performance, and " - "resilience. Kubernetes has solidified its position as the de facto standard " - "for container orchestration.", - body_style, - )) + story.append( + Paragraph( + "The cloud computing landscape continues to evolve rapidly. Multi-cloud and " + "hybrid-cloud strategies have become the norm, with organizations distributing " + "workloads across multiple providers to optimize cost, performance, and " + "resilience. 
Kubernetes has solidified its position as the de facto standard " + "for container orchestration.", + body_style, + ) + ) # --- Bullet points for cloud --- story.append(Paragraph("Top Cloud Priorities for 2024", subheading_style)) @@ -863,22 +1168,26 @@ def make_table_style(): # --- Cybersecurity section --- story.append(Paragraph("Cybersecurity Landscape", heading_style)) - story.append(Paragraph( - "The cybersecurity threat landscape has become increasingly complex. " - "Ransomware attacks continue to rise in frequency and sophistication, while " - "AI-powered threats present new challenges for defense teams. Zero-trust " - "architecture has moved from concept to implementation, with organizations " - "adopting identity-centric security models that verify every access request.", - body_style, - )) - story.append(Paragraph( - "Supply chain security has emerged as a critical concern following high-profile " - "incidents. Software Bill of Materials (SBOM) requirements are becoming " - "standard, and organizations are implementing stricter controls over " - "third-party dependencies. AI is being used both offensively and defensively, " - "creating an arms race between attackers and defenders.", - body_style, - )) + story.append( + Paragraph( + "The cybersecurity threat landscape has become increasingly complex. " + "Ransomware attacks continue to rise in frequency and sophistication, while " + "AI-powered threats present new challenges for defense teams. Zero-trust " + "architecture has moved from concept to implementation, with organizations " + "adopting identity-centric security models that verify every access request.", + body_style, + ) + ) + story.append( + Paragraph( + "Supply chain security has emerged as a critical concern following high-profile " + "incidents. Software Bill of Materials (SBOM) requirements are becoming " + "standard, and organizations are implementing stricter controls over " + "third-party dependencies. 
AI is being used both offensively and defensively, " + "creating an arms race between attackers and defenders.", + body_style, + ) + ) # --- Conclusion --- story.append(Paragraph("Recommendations", heading_style)) diff --git a/unstructured_documents/02_docx/01_python_docx_extraction.py b/unstructured_documents/02_docx/01_python_docx_extraction.py index d5c92c6..e45aba4 100644 --- a/unstructured_documents/02_docx/01_python_docx_extraction.py +++ b/unstructured_documents/02_docx/01_python_docx_extraction.py @@ -35,6 +35,7 @@ # 1. Extract paragraphs with style information # =================================================================== + def extract_paragraphs(doc_path: Path) -> list[dict]: """ Extract every paragraph together with its style name and full text. @@ -50,20 +51,22 @@ def extract_paragraphs(doc_path: Path) -> list[dict]: text = para.text.strip() if not text: continue - paragraphs.append({ - "style": para.style.name, - "text": text, - # Run-level detail: capture bold/italic spans - "runs": [ - { - "text": run.text, - "bold": run.bold, - "italic": run.italic, - } - for run in para.runs - if run.text.strip() - ], - }) + paragraphs.append( + { + "style": para.style.name, + "text": text, + # Run-level detail: capture bold/italic spans + "runs": [ + { + "text": run.text, + "bold": run.bold, + "italic": run.italic, + } + for run in para.runs + if run.text.strip() + ], + } + ) return paragraphs @@ -71,6 +74,7 @@ def extract_paragraphs(doc_path: Path) -> list[dict]: # 2. Extract tables # =================================================================== + def extract_tables(doc_path: Path) -> list[list[list[str]]]: """ Extract all tables in the document. @@ -93,6 +97,7 @@ def extract_tables(doc_path: Path) -> list[list[list[str]]]: # 3. 
Build a heading-based document hierarchy # =================================================================== + def build_heading_hierarchy(doc_path: Path) -> list[dict]: """ Walk through the document and group content under headings. @@ -131,11 +136,13 @@ def build_heading_hierarchy(doc_path: Path) -> list[dict]: "body_parts": [], } else: - current_section["body_parts"].append({ - "type": "paragraph", - "style": style_name, - "text": text, - }) + current_section["body_parts"].append( + { + "type": "paragraph", + "style": style_name, + "text": text, + } + ) # Don't forget the last section if current_section["body_parts"]: @@ -148,6 +155,7 @@ def build_heading_hierarchy(doc_path: Path) -> list[dict]: # 4. Convert to markdown text (for heading-based chunking) # =================================================================== + def docx_to_markdown(doc_path: Path) -> str: """ Convert the DOCX to a simple markdown string so we can reuse @@ -189,6 +197,7 @@ def docx_to_markdown(doc_path: Path) -> str: # Main demonstration # =================================================================== + def main() -> None: print("=" * 70) print("DOCX Extraction with python-docx") @@ -214,7 +223,7 @@ def main() -> None: flags.append("BOLD") if r["italic"]: flags.append("ITALIC") - print(f" {'':20s} ^ {', '.join(flags)}: \"{r['text'][:60]}\"") + print(f' {"":20s} ^ {", ".join(flags)}: "{r["text"][:60]}"') if len(paragraphs) > 8: print(f"\n ... and {len(paragraphs) - 8} more paragraphs") @@ -245,8 +254,10 @@ def main() -> None: for sec in sections: indent = " " * sec["heading_level"] n_parts = len(sec["body_parts"]) - print(f" {indent}H{sec['heading_level']}: {sec['heading_text']} " - f"({n_parts} body element{'s' if n_parts != 1 else ''})") + print( + f" {indent}H{sec['heading_level']}: {sec['heading_text']} " + f"({n_parts} body element{'s' if n_parts != 1 else ''})" + ) # ------------------------------------------------------------------ # 4. 
Chunking by headings (via markdown conversion) diff --git a/unstructured_documents/02_docx/02_mammoth_extraction.py b/unstructured_documents/02_docx/02_mammoth_extraction.py index f9d31c9..6496505 100644 --- a/unstructured_documents/02_docx/02_mammoth_extraction.py +++ b/unstructured_documents/02_docx/02_mammoth_extraction.py @@ -37,6 +37,7 @@ # 1. Convert DOCX to HTML # =================================================================== + def docx_to_html(doc_path: Path) -> tuple[str, list[str]]: """ Convert the DOCX file to clean, semantic HTML. @@ -60,6 +61,7 @@ def docx_to_html(doc_path: Path) -> tuple[str, list[str]]: # 2. Convert DOCX to markdown # =================================================================== + def docx_to_markdown(doc_path: Path) -> tuple[str, list[str]]: """ Convert the DOCX file to markdown using mammoth's built-in converter. @@ -79,6 +81,7 @@ def docx_to_markdown(doc_path: Path) -> tuple[str, list[str]]: # 3. Heading-aware chunking on markdown output # =================================================================== + def chunk_markdown_by_headings(markdown_text: str) -> list[dict]: """ Use the shared heading-aware chunker on mammoth's markdown output. 
@@ -93,6 +96,7 @@ def chunk_markdown_by_headings(markdown_text: str) -> list[dict]: # Main demonstration # =================================================================== + def main() -> None: print("=" * 70) print("DOCX Extraction with mammoth") @@ -112,7 +116,7 @@ def main() -> None: else: print("No conversion warnings.") - print(f"\nHTML output (first 800 chars):") + print("\nHTML output (first 800 chars):") print("-" * 50) print(html[:800]) print("-" * 50) @@ -131,7 +135,7 @@ def main() -> None: else: print("No conversion warnings.") - print(f"\nMarkdown output (first 800 chars):") + print("\nMarkdown output (first 800 chars):") print("-" * 50) print(markdown[:800]) print("-" * 50) @@ -148,17 +152,13 @@ def main() -> None: # Count headings in each format html_headings = html.count("<h1>") + html.count("<h2>") + html.count("<h3>") - md_headings = sum( - 1 for line in markdown.splitlines() - if line.strip().startswith("#") - ) + md_headings = sum(1 for line in markdown.splitlines() if line.strip().startswith("#")) print(f"{'Heading elements':<30s} {html_headings:>12d} {md_headings:>12d}") # Count list items html_list_items = html.count("
<li>") md_list_items = sum( - 1 for line in markdown.splitlines() - if line.strip().startswith("- ") or line.strip().startswith("* ") + 1 for line in markdown.splitlines() if line.strip().startswith("- ") or line.strip().startswith("* ") ) print(f"{'List items':<30s} {html_list_items:>12d} {md_list_items:>12d}") diff --git a/unstructured_documents/02_docx/03_docx2txt_extraction.py b/unstructured_documents/02_docx/03_docx2txt_extraction.py index bcab9c0..8be5566 100644 --- a/unstructured_documents/02_docx/03_docx2txt_extraction.py +++ b/unstructured_documents/02_docx/03_docx2txt_extraction.py @@ -26,8 +26,8 @@ sys.path.insert(0, str(Path(__file__).resolve().parents[2])) from unstructured_documents.shared.chunking import ( chunk_by_characters, - chunk_by_sentences, chunk_by_recursive_split, + chunk_by_sentences, preview_chunks, ) @@ -38,6 +38,7 @@ # 1. Basic text extraction # =================================================================== + def extract_text(doc_path: Path) -> str: """ Extract the full plain-text content of a DOCX file. @@ -53,20 +54,21 @@ def extract_text(doc_path: Path) -> str: # 2. Analysis: what is preserved vs. lost # =================================================================== + def analyse_extraction(text: str) -> dict: """ Run simple heuristics to illustrate what docx2txt keeps and drops.
""" - lines = [l for l in text.splitlines() if l.strip()] + lines = [line for line in text.splitlines() if line.strip()] words = text.split() return { "total_characters": len(text), "total_words": len(words), "non_empty_lines": len(lines), - "contains_tabs": "\t" in text, # tables survive as tab-separated - "contains_bullet_markers": False, # bullet symbols are lost - "heading_markers_present": False, # heading markup is lost + "contains_tabs": "\t" in text, # tables survive as tab-separated + "contains_bullet_markers": False, # bullet symbols are lost + "heading_markers_present": False, # heading markup is lost } @@ -74,6 +76,7 @@ def analyse_extraction(text: str) -> dict: # Main demonstration # =================================================================== + def main() -> None: print("=" * 70) print("DOCX Extraction with docx2txt") diff --git a/unstructured_documents/02_docx/sample_docs/generate_samples.py b/unstructured_documents/02_docx/sample_docs/generate_samples.py index bdaf1af..0674260 100644 --- a/unstructured_documents/02_docx/sample_docs/generate_samples.py +++ b/unstructured_documents/02_docx/sample_docs/generate_samples.py @@ -13,10 +13,8 @@ from pathlib import Path from docx import Document -from docx.shared import Inches, Pt, RGBColor -from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.table import WD_TABLE_ALIGNMENT - +from docx.shared import Pt, RGBColor SAMPLE_DIR = Path(__file__).resolve().parent @@ -25,6 +23,7 @@ # 1. 
simple_document.docx # --------------------------------------------------------------------------- + def create_simple_document() -> None: """Create a document with headings, paragraphs, bullet points, and rich text.""" @@ -67,17 +66,12 @@ def create_simple_document() -> None: # --- H3: Key Indicators --- doc.add_heading("Key Climate Indicators", level=3) - doc.add_paragraph( - "Scientists track several indicators to monitor the state of the climate system:" - ) + doc.add_paragraph("Scientists track several indicators to monitor the state of the climate system:") bullets = [ - "Global mean surface temperature has risen approximately 1.1 °C above " - "pre-industrial levels as of 2023.", - "Arctic sea-ice extent has declined by roughly 13 % per decade since " - "satellite records began in 1979.", - "Global mean sea level has risen about 20 cm since 1900, with the rate of " - "rise accelerating in recent decades.", + "Global mean surface temperature has risen approximately 1.1 °C above pre-industrial levels as of 2023.", + "Arctic sea-ice extent has declined by roughly 13 % per decade since satellite records began in 1979.", + "Global mean sea level has risen about 20 cm since 1900, with the rate of rise accelerating in recent decades.", "Ocean heat content has increased steadily, with the upper 2,000 metres of " "the ocean absorbing over 90 % of the excess heat.", "Atmospheric methane concentrations have more than doubled since " @@ -93,9 +87,7 @@ def create_simple_document() -> None: para = doc.add_paragraph("The effects of climate change are ") run = para.add_run("already being felt") run.italic = True - para.add_run( - " across every continent and ocean. Some of the most significant impacts include:" - ) + para.add_run(" across every continent and ocean. 
Some of the most significant impacts include:") doc.add_heading("Extreme Weather Events", level=3) @@ -148,8 +140,7 @@ def create_simple_document() -> None: "Building resilient infrastructure designed for future climate conditions.", "Developing drought-resistant crop varieties and sustainable water management.", "Strengthening early-warning systems for extreme weather events.", - "Implementing nature-based solutions such as mangrove restoration for coastal " - "protection.", + "Implementing nature-based solutions such as mangrove restoration for coastal protection.", ] for point in adaptation_points: doc.add_paragraph(point, style="List Bullet") @@ -164,8 +155,7 @@ def create_simple_document() -> None: "if rapid, far-reaching action is taken across all sectors of the economy. " ) run = para.add_run( - "The choices made in the next decade will determine the trajectory of the " - "climate for centuries to come." + "The choices made in the next decade will determine the trajectory of the climate for centuries to come." ) run.bold = True run.italic = True @@ -179,6 +169,7 @@ def create_simple_document() -> None: # 2. 
tables_document.docx # --------------------------------------------------------------------------- + def create_tables_document() -> None: """Create a document with multiple tables and explanatory text.""" @@ -203,11 +194,11 @@ def create_tables_document() -> None: financial_headers = ["Metric", "Q3 2025", "Q2 2025", "Q3 2024", "YoY Change"] financial_data = [ - ["Revenue", "$12,450,000", "$11,820,000", "$10,900,000", "+14.2 %"], - ["Cost of Goods Sold", "$7,470,000", "$7,210,000", "$6,760,000", "+10.5 %"], - ["Gross Profit", "$4,980,000", "$4,610,000", "$4,140,000", "+20.3 %"], - ["Operating Expenses", "$2,850,000", "$2,790,000", "$2,610,000", "+9.2 %"], - ["Net Income", "$2,130,000", "$1,820,000", "$1,530,000", "+39.2 %"], + ["Revenue", "$12,450,000", "$11,820,000", "$10,900,000", "+14.2 %"], + ["Cost of Goods Sold", "$7,470,000", "$7,210,000", "$6,760,000", "+10.5 %"], + ["Gross Profit", "$4,980,000", "$4,610,000", "$4,140,000", "+20.3 %"], + ["Operating Expenses", "$2,850,000", "$2,790,000", "$2,610,000", "+9.2 %"], + ["Net Income", "$2,130,000", "$1,820,000", "$1,530,000", "+39.2 %"], ] table = doc.add_table(rows=1, cols=len(financial_headers)) @@ -234,15 +225,20 @@ def create_tables_document() -> None: "current stock levels across our main product categories." 
) - inventory_headers = ["Product Category", "SKU Count", "Units in Stock", - "Reorder Point", "Status"] + inventory_headers = [ + "Product Category", + "SKU Count", + "Units in Stock", + "Reorder Point", + "Status", + ] inventory_data = [ - ["Electronics", "142", "34,500", "10,000", "Adequate"], - ["Home Appliances", "87", "12,200", "5,000", "Adequate"], - ["Office Supplies", "215", "98,000", "25,000", "Adequate"], - ["Industrial Tools", "63", "4,800", "5,000", "Low — reorder initiated"], - ["Automotive Parts", "178", "22,100", "8,000", "Adequate"], - ["Health & Safety", "54", "7,300", "7,000", "Marginal"], + ["Electronics", "142", "34,500", "10,000", "Adequate"], + ["Home Appliances", "87", "12,200", "5,000", "Adequate"], + ["Office Supplies", "215", "98,000", "25,000", "Adequate"], + ["Industrial Tools", "63", "4,800", "5,000", "Low — reorder initiated"], + ["Automotive Parts", "178", "22,100", "8,000", "Adequate"], + ["Health & Safety", "54", "7,300", "7,000", "Marginal"], ] table = doc.add_table(rows=1, cols=len(inventory_headers)) @@ -272,13 +268,25 @@ def create_tables_document() -> None: emp_headers = ["Name", "Title", "Department", "Office", "Email"] emp_data = [ - ["Sarah Chen", "CEO", "Executive", "New York", "s.chen@acme.com"], - ["James Okafor", "CFO", "Finance", "New York", "j.okafor@acme.com"], - ["Maria Gonzalez", "VP of Engineering", "Engineering", "San Francisco","m.gonzalez@acme.com"], - ["David Kim", "VP of Sales", "Sales", "Chicago", "d.kim@acme.com"], - ["Priya Patel", "VP of Operations", "Operations", "Dallas", "p.patel@acme.com"], - ["Thomas Weber", "Head of HR", "Human Resources","London", "t.weber@acme.com"], - ["Aisha Mohammed", "General Counsel", "Legal", "New York", "a.mohammed@acme.com"], + ["Sarah Chen", "CEO", "Executive", "New York", "s.chen@acme.com"], + ["James Okafor", "CFO", "Finance", "New York", "j.okafor@acme.com"], + [ + "Maria Gonzalez", + "VP of Engineering", + "Engineering", + "San Francisco", + "m.gonzalez@acme.com", + 
], + ["David Kim", "VP of Sales", "Sales", "Chicago", "d.kim@acme.com"], + ["Priya Patel", "VP of Operations", "Operations", "Dallas", "p.patel@acme.com"], + ["Thomas Weber", "Head of HR", "Human Resources", "London", "t.weber@acme.com"], + [ + "Aisha Mohammed", + "General Counsel", + "Legal", + "New York", + "a.mohammed@acme.com", + ], ] table = doc.add_table(rows=1, cols=len(emp_headers)) @@ -309,6 +317,7 @@ def create_tables_document() -> None: # 3. styled_document.docx # --------------------------------------------------------------------------- + def create_styled_document() -> None: """Create a document exercising many built-in Word styles.""" @@ -316,9 +325,7 @@ def create_styled_document() -> None: # Title & subtitle doc.add_paragraph("The Art of Software Architecture", style="Title") - doc.add_paragraph( - "A Practical Guide to Designing Maintainable Systems", style="Subtitle" - ) + doc.add_paragraph("A Practical Guide to Designing Maintainable Systems", style="Subtitle") # --- Heading 1 --- doc.add_heading("Introduction", level=1) @@ -335,15 +342,15 @@ def create_styled_document() -> None: "This guide distils decades of collective industry experience into concise, " "actionable advice. It is intended for developers who are transitioning into " "architecture roles, as well as seasoned architects looking for a refresher.", - style="Normal" + style="Normal", ) # --- Quote style --- doc.add_paragraph( '"Architecture is the decisions that you wish you could get right early in ' - 'a project, but that you are not necessarily more likely to get them right ' + "a project, but that you are not necessarily more likely to get them right " 'than any other." 
— Ralph Johnson', - style="Quote" + style="Quote", ) # --- Heading 1 --- @@ -369,20 +376,15 @@ def create_styled_document() -> None: doc.add_heading("SOLID Principles", level=2) - doc.add_paragraph( - "The SOLID principles provide a foundation for object-oriented design:" - ) + doc.add_paragraph("The SOLID principles provide a foundation for object-oriented design:") # Numbered list numbered_items = [ - "Single Responsibility Principle — a class should have one, and only one, " - "reason to change.", - "Open/Closed Principle — software entities should be open for extension but " - "closed for modification.", + "Single Responsibility Principle — a class should have one, and only one, reason to change.", + "Open/Closed Principle — software entities should be open for extension but closed for modification.", "Liskov Substitution Principle — subtypes must be substitutable for their " "base types without altering program correctness.", - "Interface Segregation Principle — clients should not be forced to depend on " - "interfaces they do not use.", + "Interface Segregation Principle — clients should not be forced to depend on interfaces they do not use.", "Dependency Inversion Principle — high-level modules should not depend on " "low-level modules; both should depend on abstractions.", ] @@ -424,22 +426,20 @@ def create_styled_document() -> None: # --- Code-style text --- doc.add_heading("Code Example: Dependency Injection", level=2) - doc.add_paragraph( - "The following pseudo-code shows constructor-based dependency injection:" - ) + doc.add_paragraph("The following pseudo-code shows constructor-based dependency injection:") # Use a monospace font for code code_para = doc.add_paragraph() code_run = code_para.add_run( - 'class OrderService:\n' - ' def __init__(self, repository: OrderRepository,\n' - ' notifier: NotificationService):\n' - ' self._repository = repository\n' - ' self._notifier = notifier\n' - '\n' - ' def place_order(self, order: Order) -> None:\n' - ' 
self._repository.save(order)\n' - ' self._notifier.send_confirmation(order)\n' + "class OrderService:\n" + " def __init__(self, repository: OrderRepository,\n" + " notifier: NotificationService):\n" + " self._repository = repository\n" + " self._notifier = notifier\n" + "\n" + " def place_order(self, order: Order) -> None:\n" + " self._repository.save(order)\n" + " self._notifier.send_confirmation(order)\n" ) code_run.font.name = "Courier New" code_run.font.size = Pt(9) @@ -452,10 +452,7 @@ def create_styled_document() -> None: ) # --- Another Quote --- - doc.add_paragraph( - '"Make each program do one thing well." — Unix Philosophy', - style="Quote" - ) + doc.add_paragraph('"Make each program do one thing well." — Unix Philosophy', style="Quote") # --- Heading 1 --- doc.add_heading("Quality Attributes", level=1) @@ -488,7 +485,7 @@ def create_styled_document() -> None: doc.add_paragraph( '"The best architectures are grown, not designed." — Adapted from Fred Brooks', - style="Quote" + style="Quote", ) path = SAMPLE_DIR / "styled_document.docx" diff --git a/unstructured_documents/03_pptx/01_python_pptx_extraction.py b/unstructured_documents/03_pptx/01_python_pptx_extraction.py index 15feee7..3959b50 100644 --- a/unstructured_documents/03_pptx/01_python_pptx_extraction.py +++ b/unstructured_documents/03_pptx/01_python_pptx_extraction.py @@ -16,15 +16,14 @@ # --- shared chunking import ------------------------------------------------ sys.path.insert(0, str(Path(__file__).resolve().parents[2])) +from pptx import Presentation + from unstructured_documents.shared.chunking import ( - chunk_by_sentences, chunk_by_recursive_split, + chunk_by_sentences, preview_chunks, ) -from pptx import Presentation - - # --------------------------------------------------------------------------- # Paths # --------------------------------------------------------------------------- @@ -36,6 +35,7 @@ # Extraction helpers # 
--------------------------------------------------------------------------- + def extract_text_from_shape(shape) -> list[dict]: """ Recursively extract text from a single shape. @@ -101,6 +101,7 @@ def extract_table_data(slide) -> list[list[list[str]]]: # Full extraction # --------------------------------------------------------------------------- + def extract_all_slides(pptx_path: Path) -> list[dict]: """ Walk every slide and extract text, tables, and notes. @@ -120,11 +121,13 @@ def extract_all_slides(pptx_path: Path) -> list[dict]: for shape in slide.shapes: shape_extracts.extend(extract_text_from_shape(shape)) - slides_data.append({ - "slide_number": idx, - "shapes": shape_extracts, - "notes": extract_notes(slide), - }) + slides_data.append( + { + "slide_number": idx, + "shapes": shape_extracts, + "notes": extract_notes(slide), + } + ) return slides_data @@ -133,12 +136,13 @@ def extract_all_slides(pptx_path: Path) -> list[dict]: # Display # --------------------------------------------------------------------------- + def print_extraction_results(slides_data: list[dict]) -> None: """Pretty-print the extraction results.""" for slide in slides_data: - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f" SLIDE {slide['slide_number']}") - print(f"{'='*60}") + print(f"{'=' * 60}") if not slide["shapes"]: print(" (no extractable text)") @@ -150,7 +154,7 @@ def print_extraction_results(slides_data: list[dict]) -> None: print(f" {line}") if slide["notes"]: - print(f"\n [SPEAKER NOTES]") + print("\n [SPEAKER NOTES]") for line in slide["notes"].split("\n"): print(f" {line}") @@ -175,9 +179,9 @@ def print_extraction_results(slides_data: list[dict]) -> None: print_extraction_results(slides_data) # ── 2. 
Table extraction detail ──────────────────────────────────────── - print(f"\n\n{'='*60}") + print(f"\n\n{'=' * 60}") print(" TABLE EXTRACTION DETAIL") - print(f"{'='*60}") + print(f"{'=' * 60}") prs = Presentation(str(PPTX_PATH)) for idx, slide in enumerate(prs.slides, start=1): tables = extract_table_data(slide) @@ -197,14 +201,14 @@ def print_extraction_results(slides_data: list[dict]) -> None: full_text = "\n\n".join(all_text_parts) - print(f"\n\n{'='*60}") + print(f"\n\n{'=' * 60}") print(" CHUNKING DEMO — Sentence-based") - print(f"{'='*60}") + print(f"{'=' * 60}") sentence_chunks = chunk_by_sentences(full_text, sentences_per_chunk=4, overlap_sentences=1) preview_chunks(sentence_chunks, max_preview=4, max_chars=300) - print(f"\n\n{'='*60}") + print(f"\n\n{'=' * 60}") print(" CHUNKING DEMO — Recursive split") - print(f"{'='*60}") + print(f"{'=' * 60}") recursive_chunks = chunk_by_recursive_split(full_text, chunk_size=400) preview_chunks(recursive_chunks, max_preview=4, max_chars=300) diff --git a/unstructured_documents/03_pptx/02_slide_structured_extraction.py b/unstructured_documents/03_pptx/02_slide_structured_extraction.py index ed77a4a..5fdc327 100644 --- a/unstructured_documents/03_pptx/02_slide_structured_extraction.py +++ b/unstructured_documents/03_pptx/02_slide_structured_extraction.py @@ -18,13 +18,13 @@ # --- shared chunking import ------------------------------------------------ sys.path.insert(0, str(Path(__file__).resolve().parents[2])) +from pptx import Presentation + from unstructured_documents.shared.chunking import ( chunk_by_sentences, preview_chunks, ) -from pptx import Presentation - # --------------------------------------------------------------------------- # Paths # --------------------------------------------------------------------------- @@ -36,6 +36,7 @@ # Structured extraction # --------------------------------------------------------------------------- + def _collect_body_text(shape) -> list[str]: """ Recursively collect body text 
from a shape (skipping titles). @@ -116,13 +117,15 @@ def extract_structured_slides(pptx_path: Path) -> list[dict]: if slide.has_notes_slide: notes = slide.notes_slide.notes_text_frame.text.strip() - structured.append({ - "slide_number": idx, - "title": title, - "body_text": body_text, - "table_data": tables, - "notes": notes, - }) + structured.append( + { + "slide_number": idx, + "title": title, + "body_text": body_text, + "table_data": tables, + "notes": notes, + } + ) return structured @@ -131,6 +134,7 @@ def extract_structured_slides(pptx_path: Path) -> list[dict]: # RAG-ready conversion # --------------------------------------------------------------------------- + def table_to_text(table: list[list[str]]) -> str: """ Convert a 2-D table into a readable text block. @@ -184,15 +188,17 @@ def slides_to_rag_chunks(slides: list[dict], include_notes: bool = True) -> list if not text: continue - chunks.append({ - "text": text, - "metadata": { - "slide_number": slide["slide_number"], - "title": slide["title"], - "has_table": len(slide["table_data"]) > 0, - "has_notes": bool(slide["notes"]), - }, - }) + chunks.append( + { + "text": text, + "metadata": { + "slide_number": slide["slide_number"], + "title": slide["title"], + "has_table": len(slide["table_data"]) > 0, + "has_notes": bool(slide["notes"]), + }, + } + ) return chunks @@ -217,15 +223,16 @@ def build_slide_summaries(slides: list[dict]) -> list[str]: # Display helpers # --------------------------------------------------------------------------- + def print_structured_slides(slides: list[dict]) -> None: """Pretty-print the structured slide data.""" for slide in slides: - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f" SLIDE {slide['slide_number']}: {slide['title'] or '(no title)'}") - print(f"{'='*60}") + print(f"{'=' * 60}") if slide["body_text"]: - print(f"\n Body text:") + print("\n Body text:") for line in slide["body_text"].split("\n"): print(f" {line}") @@ -236,7 +243,7 @@ def 
print_structured_slides(slides: list[dict]) -> None: print(f" {row}") if slide["notes"]: - print(f"\n Speaker notes:") + print("\n Speaker notes:") for line in slide["notes"].split("\n"): print(f" {line}") @@ -245,11 +252,13 @@ def print_rag_chunks(chunks: list[dict]) -> None: """Print RAG chunks with metadata.""" for i, chunk in enumerate(chunks, start=1): meta = chunk["metadata"] - print(f"\n{'- '*30}") - print(f" Chunk {i} | Slide {meta['slide_number']} | " - f"Title: {meta['title'] or 'N/A'} | " - f"Table: {meta['has_table']} | Notes: {meta['has_notes']}") - print(f"{'- '*30}") + print(f"\n{'- ' * 30}") + print( + f" Chunk {i} | Slide {meta['slide_number']} | " + f"Title: {meta['title'] or 'N/A'} | " + f"Table: {meta['has_table']} | Notes: {meta['has_notes']}" + ) + print(f"{'- ' * 30}") # Truncate for display text = chunk["text"] if len(text) > 400: @@ -280,30 +289,30 @@ def print_rag_chunks(chunks: list[dict]) -> None: # ── 2. Slide summaries ──────────────────────────────────────────────── summaries = build_slide_summaries(slides) - print(f"\n\n{'='*60}") + print(f"\n\n{'=' * 60}") print(" SLIDE SUMMARIES") - print(f"{'='*60}") + print(f"{'=' * 60}") for s in summaries: print(f" {s}") # ── 3. RAG-ready chunks (one per slide) ─────────────────────────────── rag_chunks = slides_to_rag_chunks(slides, include_notes=True) - print(f"\n\n{'='*60}") + print(f"\n\n{'=' * 60}") print(" RAG-READY CHUNKS (one per slide, notes included)") - print(f"{'='*60}") + print(f"{'=' * 60}") print_rag_chunks(rag_chunks) # ── 4. Sentence-based chunking on full text ─────────────────────────── full_text = "\n\n".join(chunk["text"] for chunk in rag_chunks) - print(f"\n\n{'='*60}") + print(f"\n\n{'=' * 60}") print(" SENTENCE-BASED CHUNKING (merged text)") - print(f"{'='*60}") + print(f"{'=' * 60}") sentence_chunks = chunk_by_sentences(full_text, sentences_per_chunk=5, overlap_sentences=1) preview_chunks(sentence_chunks, max_preview=5, max_chars=350) # ── 5. 
JSON output sample ───────────────────────────────────────────── - print(f"\n\n{'='*60}") + print(f"\n\n{'=' * 60}") print(" JSON OUTPUT (first 2 slides)") - print(f"{'='*60}") + print(f"{'=' * 60}") json_sample = json.dumps(rag_chunks[:2], indent=2, ensure_ascii=False) print(json_sample) diff --git a/unstructured_documents/03_pptx/sample_docs/generate_samples.py b/unstructured_documents/03_pptx/sample_docs/generate_samples.py index 3db6d46..6803c76 100644 --- a/unstructured_documents/03_pptx/sample_docs/generate_samples.py +++ b/unstructured_documents/03_pptx/sample_docs/generate_samples.py @@ -12,9 +12,8 @@ from pathlib import Path from pptx import Presentation -from pptx.util import Inches, Pt, Emu from pptx.enum.text import PP_ALIGN - +from pptx.util import Emu, Inches, Pt # --------------------------------------------------------------------------- # Helpers @@ -39,6 +38,7 @@ def _add_textbox(slide, left, top, width, height, text, font_size=14, bold=False # Presentation 1 – Introduction to Machine Learning (6 slides) # --------------------------------------------------------------------------- + def create_ml_presentation() -> Path: prs = Presentation() @@ -46,9 +46,7 @@ def create_ml_presentation() -> Path: slide_layout = prs.slide_layouts[0] # Title Slide layout slide = prs.slides.add_slide(slide_layout) slide.shapes.title.text = "Introduction to Machine Learning" - slide.placeholders[1].text = ( - "A practical overview of ML concepts, algorithms, and applications" - ) + slide.placeholders[1].text = "A practical overview of ML concepts, algorithms, and applications" # ── Slide 2: Bullet points – ML types ───────────────────────────────── slide_layout = prs.slide_layouts[1] # Title and Content @@ -70,13 +68,19 @@ def create_ml_presentation() -> Path: # ── Slide 3: Table – Algorithm comparison ───────────────────────────── slide_layout = prs.slide_layouts[5] # Blank layout slide = prs.slides.add_slide(slide_layout) - _add_textbox(slide, Inches(0.5), 
Inches(0.3), Inches(9), Inches(0.6), - "Comparison of ML Algorithms", font_size=24, bold=True) + _add_textbox( + slide, + Inches(0.5), + Inches(0.3), + Inches(9), + Inches(0.6), + "Comparison of ML Algorithms", + font_size=24, + bold=True, + ) rows, cols = 6, 4 - table_shape = slide.shapes.add_table(rows, cols, - Inches(0.5), Inches(1.2), - Inches(9), Inches(4)) + table_shape = slide.shapes.add_table(rows, cols, Inches(0.5), Inches(1.2), Inches(9), Inches(4)) table = table_shape.table headers = ["Algorithm", "Type", "Use Case", "Complexity"] @@ -125,19 +129,35 @@ def create_ml_presentation() -> Path: # ── Slide 5: Grouped text boxes – key takeaways ─────────────────────── slide_layout = prs.slide_layouts[5] # Blank slide = prs.slides.add_slide(slide_layout) - _add_textbox(slide, Inches(0.5), Inches(0.3), Inches(9), Inches(0.6), - "Key Takeaways", font_size=24, bold=True) + _add_textbox( + slide, + Inches(0.5), + Inches(0.3), + Inches(9), + Inches(0.6), + "Key Takeaways", + font_size=24, + bold=True, + ) # Create a group shape containing three text boxes group = slide.shapes.add_group_shape() takeaways = [ - ("Data is King", "The quality and quantity of your data matters more than the algorithm you choose."), - ("Start Simple", "Begin with simple models like linear regression before moving to complex architectures."), - ("Iterate Fast", "Rapid experimentation and iteration lead to better results than perfecting a single approach."), + ( + "Data is King", + "The quality and quantity of your data matters more than the algorithm you choose.", + ), + ( + "Start Simple", + "Begin with simple models like linear regression before moving to complex architectures.", + ), + ( + "Iterate Fast", + "Rapid experimentation and iteration lead to better results than perfecting a single approach.", + ), ] box_width = Emu(Inches(2.8).emu) - box_height = Emu(Inches(3).emu) top = Emu(Inches(1.2).emu) for idx, (title, desc) in enumerate(takeaways): @@ -149,8 +169,7 @@ def 
create_ml_presentation() -> Path: tb_title.text_frame.paragraphs[0].font.bold = True tb_title.text_frame.paragraphs[0].alignment = PP_ALIGN.CENTER # Description box - tb_desc = group.shapes.add_textbox(left, Emu(top.emu + Inches(0.7).emu), - box_width, Emu(Inches(2).emu)) + tb_desc = group.shapes.add_textbox(left, Emu(top.emu + Inches(0.7).emu), box_width, Emu(Inches(2).emu)) tb_desc.text_frame.word_wrap = True tb_desc.text_frame.paragraphs[0].text = desc tb_desc.text_frame.paragraphs[0].font.size = Pt(14) @@ -186,6 +205,7 @@ def create_ml_presentation() -> Path: # Presentation 2 – Q4 Financial Review (4 slides) # --------------------------------------------------------------------------- + def create_data_presentation() -> Path: prs = Presentation() @@ -198,13 +218,19 @@ def create_data_presentation() -> Path: # ── Slide 2: Revenue table ──────────────────────────────────────────── slide_layout = prs.slide_layouts[5] # Blank slide = prs.slides.add_slide(slide_layout) - _add_textbox(slide, Inches(0.5), Inches(0.3), Inches(9), Inches(0.6), - "Quarterly Revenue Breakdown", font_size=24, bold=True) + _add_textbox( + slide, + Inches(0.5), + Inches(0.3), + Inches(9), + Inches(0.6), + "Quarterly Revenue Breakdown", + font_size=24, + bold=True, + ) rows, cols = 5, 4 - table_shape = slide.shapes.add_table(rows, cols, - Inches(0.5), Inches(1.2), - Inches(9), Inches(3.5)) + table_shape = slide.shapes.add_table(rows, cols, Inches(0.5), Inches(1.2), Inches(9), Inches(3.5)) table = table_shape.table headers = ["Region", "Q1 ($M)", "Q2 ($M)", "Q3 ($M)"] @@ -223,8 +249,16 @@ def create_data_presentation() -> Path: # ── Slide 3: Key metrics text boxes ─────────────────────────────────── slide_layout = prs.slide_layouts[5] slide = prs.slides.add_slide(slide_layout) - _add_textbox(slide, Inches(0.5), Inches(0.3), Inches(9), Inches(0.6), - "Key Financial Metrics", font_size=24, bold=True) + _add_textbox( + slide, + Inches(0.5), + Inches(0.3), + Inches(9), + Inches(0.6), + "Key 
Financial Metrics", + font_size=24, + bold=True, + ) metrics = [ ("Total Revenue", "$140.4M", "Up 12% YoY"), @@ -234,12 +268,36 @@ def create_data_presentation() -> Path: ] for idx, (label, value, note) in enumerate(metrics): row_top = Inches(1.3 + idx * 1.1) - _add_textbox(slide, Inches(0.8), row_top, Inches(3), Inches(0.5), - label, font_size=16, bold=True) - _add_textbox(slide, Inches(4.0), row_top, Inches(2), Inches(0.5), - value, font_size=16, bold=False) - _add_textbox(slide, Inches(6.2), row_top, Inches(3), Inches(0.5), - note, font_size=14, bold=False) + _add_textbox( + slide, + Inches(0.8), + row_top, + Inches(3), + Inches(0.5), + label, + font_size=16, + bold=True, + ) + _add_textbox( + slide, + Inches(4.0), + row_top, + Inches(2), + Inches(0.5), + value, + font_size=16, + bold=False, + ) + _add_textbox( + slide, + Inches(6.2), + row_top, + Inches(3), + Inches(0.5), + note, + font_size=14, + bold=False, + ) # ── Slide 4: Conclusion with bullets ────────────────────────────────── slide_layout = prs.slide_layouts[1] diff --git a/unstructured_documents/04_html/01_beautifulsoup_extraction.py b/unstructured_documents/04_html/01_beautifulsoup_extraction.py index 85b9e06..9dd29ee 100644 --- a/unstructured_documents/04_html/01_beautifulsoup_extraction.py +++ b/unstructured_documents/04_html/01_beautifulsoup_extraction.py @@ -63,10 +63,12 @@ def extract_article_content(html_path: Path) -> dict: # Extract headings for heading in content.find_all(["h1", "h2", "h3", "h4"]): - result["headings"].append({ - "level": heading.name, - "text": heading.get_text(strip=True), - }) + result["headings"].append( + { + "level": heading.name, + "text": heading.get_text(strip=True), + } + ) # Extract paragraphs for p in content.find_all("p"): @@ -170,7 +172,7 @@ def html_to_markdown_like(html_path: Path) -> str: print("=" * 60) tables = extract_tables_only(table_path) for i, table in enumerate(tables): - print(f"\nTable {i+1} ({len(table)} rows):") + print(f"\nTable {i + 1} 
({len(table)} rows):") for row in table[:3]: print(f" {row}") if len(table) > 3: diff --git a/unstructured_documents/04_html/02_html2text_extraction.py b/unstructured_documents/04_html/02_html2text_extraction.py index ae122cf..442c850 100644 --- a/unstructured_documents/04_html/02_html2text_extraction.py +++ b/unstructured_documents/04_html/02_html2text_extraction.py @@ -42,12 +42,12 @@ def extract_clean_text(html_path: Path) -> str: def extract_with_custom_settings(html_path: Path) -> str: """Convert HTML to markdown with RAG-optimized settings.""" converter = html2text.HTML2Text() - converter.body_width = 0 # No line wrapping (better for chunking) - converter.ignore_links = True # Links add noise for RAG - converter.ignore_images = True # Can't embed images in text chunks + converter.body_width = 0 # No line wrapping (better for chunking) + converter.ignore_links = True # Links add noise for RAG + converter.ignore_images = True # Can't embed images in text chunks converter.ignore_emphasis = False # Keep bold/italic for context converter.protect_links = False - converter.unicode_snob = True # Use unicode instead of ASCII approximations + converter.unicode_snob = True # Use unicode instead of ASCII approximations converter.skip_internal_links = True return converter.handle(html_path.read_text()) diff --git a/unstructured_documents/04_html/03_trafilatura_extraction.py b/unstructured_documents/04_html/03_trafilatura_extraction.py index 4e0f1e3..7ca5a4d 100644 --- a/unstructured_documents/04_html/03_trafilatura_extraction.py +++ b/unstructured_documents/04_html/03_trafilatura_extraction.py @@ -16,7 +16,6 @@ import trafilatura from unstructured_documents.shared.chunking import ( - chunk_by_headings, chunk_by_recursive_split, preview_chunks, ) @@ -42,6 +41,7 @@ def extract_with_metadata(html_path: Path) -> dict | None: ) if result: import json + return json.loads(result) return None diff --git a/unstructured_documents/04_html/sample_docs/generate_samples.py 
b/unstructured_documents/04_html/sample_docs/generate_samples.py index a812983..14b1fff 100644 --- a/unstructured_documents/04_html/sample_docs/generate_samples.py +++ b/unstructured_documents/04_html/sample_docs/generate_samples.py @@ -43,7 +43,8 @@ def generate_article_page():

    Key NLP Tasks