-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocument_processor.py
More file actions
160 lines (130 loc) · 5.26 KB
/
document_processor.py
File metadata and controls
160 lines (130 loc) · 5.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
"""Document processing utilities for PDF and DOCX files."""
import os
from typing import List, Optional
from pathlib import Path
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from loguru import logger
from config.settings import get_settings
class DocumentProcessor:
"""Handles document loading, parsing, and chunking."""
def __init__(self):
self.settings = get_settings()
self.text_splitter = RecursiveCharacterTextSplitter(
chunk_size=self.settings.CHUNK_SIZE,
chunk_overlap=self.settings.CHUNK_OVERLAP,
length_function=len,
separators=["\n\n", "\n", " ", ""]
)
self._supported_extensions = {'.pdf', '.docx'}
@property
def supported_extensions(self) -> set:
"""Get supported file extensions."""
return self._supported_extensions.copy()
def is_supported(self, file_path: str) -> bool:
"""Check if file extension is supported."""
return Path(file_path).suffix.lower() in self._supported_extensions
def validate_file(self, file_path: str) -> bool:
"""Validate file exists and is accessible."""
path = Path(file_path)
if not path.exists():
logger.error(f"File not found: {file_path}")
return False
if path.stat().st_size > self.settings.MAX_FILE_SIZE_MB * 1024 * 1024:
logger.error(f"File exceeds max size: {file_path}")
return False
return True
def load_document(self, file_path: str) -> List[Document]:
"""
Load a document from file path.
Args:
file_path: Path to the document file
Returns:
List of LangChain Document objects
"""
if not self.validate_file(file_path):
raise ValueError(f"Invalid file: {file_path}")
file_ext = Path(file_path).suffix.lower()
try:
if file_ext == '.pdf':
return self._load_pdf(file_path)
elif file_ext == '.docx':
return self._load_docx(file_path)
else:
raise ValueError(f"Unsupported file type: {file_ext}")
except Exception as e:
logger.error(f"Error loading document {file_path}: {e}")
raise
def _load_pdf(self, file_path: str) -> List[Document]:
"""Load and extract text from PDF file."""
logger.info(f"Loading PDF: {file_path}")
loader = PyPDFLoader(file_path)
documents = loader.load()
logger.info(f"Loaded {len(documents)} pages from PDF")
return documents
def _load_docx(self, file_path: str) -> List[Document]:
"""Load and extract text from DOCX file."""
logger.info(f"Loading DOCX: {file_path}")
loader = Docx2txtLoader(file_path)
documents = loader.load()
logger.info(f"Loaded {len(documents)} sections from DOCX")
return documents
def split_documents(self, documents: List[Document]) -> List[Document]:
"""
Split documents into smaller chunks.
Args:
documents: List of Document objects to split
Returns:
List of chunked Document objects
"""
logger.info(f"Splitting {len(documents)} documents into chunks...")
chunks = self.text_splitter.split_documents(documents)
logger.info(f"Created {len(chunks)} chunks")
return chunks
def process_file(self, file_path: str, metadata: Optional[dict] = None) -> List[Document]:
"""
Complete pipeline: load -> split -> add metadata.
Args:
file_path: Path to document file
metadata: Optional metadata to add to chunks
Returns:
List of processed Document chunks
"""
logger.info(f"Processing file: {file_path}")
# Load document
documents = self.load_document(file_path)
# Split into chunks
chunks = self.split_documents(documents)
# Add file metadata to each chunk
file_name = Path(file_path).name
for chunk in chunks:
if metadata:
chunk.metadata.update(metadata)
chunk.metadata['source_file'] = file_name
chunk.metadata['file_path'] = str(file_path)
logger.info(f"Successfully processed {len(chunks)} chunks from {file_name}")
return chunks
def sanitize_input(text: str) -> str:
"""
Sanitize user input to prevent prompt injection.
Args:
text: Raw user input
Returns:
Sanitized text safe for use in prompts
"""
# Remove potential prompt injection patterns
dangerous_patterns = [
"ignore previous instructions",
"ignore all previous",
"disregard your instructions",
"system prompt",
"you are now",
"pretend you are",
"ignore system",
]
sanitized = text
for pattern in dangerous_patterns:
sanitized = sanitized.replace(pattern, "")
# Strip extra whitespace
return " ".join(sanitized.split())