# Source: Document_Summarizer_Using_Langchain/create_sample_documents.py (master branch, sumanth0095/Document_Summarizer_Using_Langchain on GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# create_sample_documents.py
"""
Create sample documents for testing the Document Loader
"""

from pathlib import Path
import pandas as pd
from docx import Document
from pptx import Presentation
from pptx.util import Inches

# Announce the run before any file is touched.
print("📄 Creating Sample Documents for Testing...\n")

# Output folder for every generated sample, relative to the current
# working directory. exist_ok=True lets the script be re-run without
# failing when the folder is already present.
samples_dir = Path("sample_documents")
samples_dir.mkdir(parents=False, exist_ok=True)

# 1. Word sample: a title, an intro paragraph, a "Features" section with
#    three bullet-style paragraphs, and a 3x2 status table.
print("📝 Creating sample Word document...")
doc = Document()
doc.add_heading("Sample Document for Testing", 0)
doc.add_paragraph(
    "This is a sample Word document created for testing the Document Loader."
)
doc.add_heading("Features", level=1)

# The bullets are plain paragraphs that start with a literal "•" character.
feature_bullets = (
    "• Text extraction from paragraphs",
    "• Table data processing",
    "• Heading recognition",
)
for bullet_text in feature_bullets:
    doc.add_paragraph(bullet_text)

# Table contents, row by row; the first row serves as the header.
table_contents = (
    ("Feature", "Status"),
    ("PDF Support", "Working"),
    ("Word Support", "Working"),
)
table = doc.add_table(rows=3, cols=2)
for row_index, row_values in enumerate(table_contents):
    for col_index, cell_text in enumerate(row_values):
        table.cell(row_index, col_index).text = cell_text

docx_path = samples_dir / "sample_document.docx"
doc.save(docx_path)
print("  ✅ Created: sample_document.docx")

# 2. Excel sample: a four-row product table written without the index column.
print("📊 Creating sample Excel file...")

# One tuple per spreadsheet row, in the same order as `columns`.
columns = ("Product", "Price", "Quantity", "Total")
rows = [
    ("Laptop", 1200, 10, 12000),
    ("Mouse", 25, 50, 1250),
    ("Keyboard", 75, 30, 2250),
    ("Monitor", 300, 15, 4500),
]

# Transpose the rows into the column -> list-of-values mapping that the
# DataFrame constructor receives.
data = {
    column: list(values) for column, values in zip(columns, zip(*rows))
}
df = pd.DataFrame(data)

xlsx_path = samples_dir / "sample_data.xlsx"
df.to_excel(xlsx_path, index=False)
print("  ✅ Created: sample_data.xlsx")

# 3. PowerPoint sample: two slides, each with a title and one text placeholder.
print("🎥 Creating sample PowerPoint...")
prs = Presentation()

# Slide 1 uses layout index 0; its title and placeholder 1 carry the text.
first_layout = prs.slide_layouts[0]
slide1 = prs.slides.add_slide(first_layout)
slide1_shapes = slide1.shapes
slide1_shapes.title.text = "Document Loader Test"
subtitle_box = slide1_shapes.placeholders[1]
subtitle_box.text = "Sample presentation for testing text extraction"

# Slide 2 uses layout index 1; the body text is three "•"-prefixed lines.
second_layout = prs.slide_layouts[1]
slide2 = prs.slides.add_slide(second_layout)
slide2_shapes = slide2.shapes
slide2_shapes.title.text = "Features"
features_text = "• Multi-format support\n• Text extraction\n• Error handling"
body_box = slide2_shapes.placeholders[1]
body_box.text = features_text

pptx_path = samples_dir / "sample_presentation.pptx"
prs.save(pptx_path)
print("  ✅ Created: sample_presentation.pptx")

# 4. Plain-text and HTML samples with more structure than the basic ones
print("📋 Creating additional text samples...")

# Plain-text overview with headings, bullet lists and a numbered list.
# The literal starts with a newline, so the written file does as well.
rich_text = """
DOCUMENT SUMMARIZER - FEATURE OVERVIEW
=====================================

Introduction
------------
The Document Summarizer is an advanced AI-powered tool designed to extract
and summarize content from various document formats.

Supported Formats:
• PDF documents (.pdf)
• Microsoft Word (.docx)
• Excel spreadsheets (.xlsx)
• PowerPoint presentations (.pptx)
• HTML web pages (.html)
• Markdown files (.md)
• Plain text (.txt)

Key Features:
1. Multi-format document loading
2. Intelligent text extraction
3. AI-powered summarization
4. User-friendly web interface
5. Comprehensive error handling

Technical Details:
- Built with Python and Streamlit
- Uses transformer models for AI summarization
- Supports files up to 10MB
- Real-time processing and feedback

For more information, visit our documentation.
"""

# UTF-8 is requested explicitly because the text contains "•" characters.
rich_text_path = samples_dir / "rich_text_sample.txt"
rich_text_path.write_text(rich_text, encoding="utf-8")
print("  ✅ Created: rich_text_sample.txt")

# HTML sample with header/nav, two sections (a list and a table) and a
# footer. The literal starts with a newline, so the written file does too.
html_content = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Document Summarizer - Advanced Test</title>
</head>
<body>
    <header>
        <h1>Document Summarizer</h1>
        <nav>
            <ul>
                <li><a href="#features">Features</a></li>
                <li><a href="#formats">Supported Formats</a></li>
                <li><a href="#usage">Usage</a></li>
            </ul>
        </nav>
    </header>

    <main>
        <section id="features">
            <h2>Key Features</h2>
            <p>Our document summarizer offers several powerful features:</p>
            <ul>
                <li><strong>Multi-format Support:</strong> Process PDF, Word, Excel, and more</li>
                <li><strong>AI-Powered:</strong> Uses advanced transformer models</li>
                <li><strong>Fast Processing:</strong> Quick text extraction and summarization</li>
            </ul>
        </section>

        <section id="formats">
            <h2>Supported File Formats</h2>
            <table>
                <tr><th>Format</th><th>Extension</th><th>Description</th></tr>
                <tr><td>PDF</td><td>.pdf</td><td>Portable Document Format</td></tr>
                <tr><td>Word</td><td>.docx</td><td>Microsoft Word Document</td></tr>
                <tr><td>Excel</td><td>.xlsx</td><td>Microsoft Excel Spreadsheet</td></tr>
            </table>
        </section>
    </main>

    <footer>
        <p>&copy; 2024 Document Summarizer Project</p>
    </footer>
</body>
</html>
"""

# Written as UTF-8 to match the charset declared in the document's <meta> tag.
html_path = samples_dir / "advanced_sample.html"
html_path.write_text(html_content, encoding="utf-8")
print("  ✅ Created: advanced_sample.html")

# Final summary for the user.
# Plain string literal: nothing is interpolated here, so no f-prefix is needed.
print("\n🎉 All sample documents created successfully!")
print(f"📁 Location: {samples_dir.absolute()}")

# Count regular files only, so that sub-directories inside the samples
# folder are not reported as "files". The generator avoids building a
# throwaway list just to take its length.
# NOTE: this counts everything currently in the folder, including files
# left over from earlier runs, not only the ones written by this run.
file_count = sum(1 for entry in samples_dir.iterdir() if entry.is_file())
print(f"📊 Total files created: {file_count}")
print("\n💡 You can now use these files to test the Document Loader!")