Skip to content

Commit 433b284

Browse files
authored
Add s3 source (#67)
* Adding S3 source Signed-off-by: Denis Jannot <denis.jannot@solo.io> * Small refactor Signed-off-by: Denis Jannot <denis.jannot@solo.io> * Recover config file Signed-off-by: Denis Jannot <denis.jannot@solo.io> --------- Signed-off-by: Denis Jannot <denis.jannot@solo.io>
1 parent 7618dc4 commit 433b284

9 files changed

Lines changed: 3405 additions & 1040 deletions

File tree

README.md

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ Configuration is managed through two files:
176176
**Structure:**
177177

178178
* `sources`: An array of source configurations.
179-
* `type`: Either `'website'`, `'github'`, `'local_directory'`, `'code'`, or `'zendesk'`
179+
* `type`: One of `'website'`, `'github'`, `'local_directory'`, `'code'`, `'zendesk'`, or `'s3'`
180180

181181
For websites (`type: 'website'`):
182182
* `url`: The starting URL for crawling the documentation site.
@@ -219,6 +219,20 @@ Configuration is managed through two files:
219219
* `ticket_status`: (Optional) Filter tickets by status (defaults to `['new', 'open', 'pending', 'hold', 'solved']`).
220220
* `ticket_priority`: (Optional) Filter tickets by priority (defaults to all priorities).
221221

222+
For S3 buckets (`type: 's3'`):
223+
* `bucket`: The S3 bucket name.
224+
* `prefix`: (Optional) Key prefix to filter objects (e.g., `'docs/'`). Only objects under this prefix will be processed.
225+
* `region`: (Optional) AWS region (defaults to the `AWS_DEFAULT_REGION` environment variable, or `'us-east-1'` if that is not set).
226+
* `endpoint`: (Optional) Custom S3 endpoint for S3-compatible services (MinIO, LocalStack, etc.).
227+
* `include_extensions`: (Optional) Array of file extensions to include (e.g., `['.md', '.txt', '.pdf']`). Defaults to `['.md', '.txt', '.html', '.htm', '.pdf', '.doc', '.docx']`.
228+
* `exclude_extensions`: (Optional) Array of file extensions to exclude.
229+
* `encoding`: (Optional) Text file encoding (defaults to `'utf8'`). Does not apply to binary files (PDF, DOC, DOCX).
230+
* `url_rewrite_prefix`: (Optional) URL prefix used to rewrite `s3://` URLs (e.g., `'https://docs.example.com'`).
231+
232+
Authentication uses the AWS SDK default credential chain: environment variables (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`), `~/.aws/credentials`, IAM roles, etc.
233+
234+
Incremental sync tracks object `LastModified` timestamps so only new or updated objects are processed on subsequent runs. Deleted objects are automatically cleaned up.
235+
222236
Common configuration for all types:
223237
* `product_name`: A string identifying the product (used in metadata).
224238
* `version`: A string identifying the product version (used in metadata).
@@ -349,6 +363,21 @@ Configuration is managed through two files:
349363
params:
350364
db_path: './zendesk-kb.db'
351365
366+
# S3 bucket source example
367+
- type: 's3'
368+
product_name: 'my-docs'
369+
version: 'latest'
370+
bucket: 'my-documentation-bucket'
371+
prefix: 'docs/v2/'
372+
region: 'us-west-2'
373+
include_extensions: ['.md', '.txt', '.pdf', '.html']
374+
url_rewrite_prefix: 'https://docs.example.com'
375+
max_size: 1048576
376+
database_config:
377+
type: 'sqlite'
378+
params:
379+
db_path: './s3-docs.db'
380+
352381
# Qdrant example
353382
- type: 'website'
354383
product_name: 'Istio'

content-processor.ts

Lines changed: 45 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1442,6 +1442,36 @@ export class ContentProcessor {
14421442
}
14431443
}
14441444

1445+
async convertFileToMarkdown(filePath: string, extension: string, logger: Logger): Promise<string> {
1446+
const ext = extension.toLowerCase();
1447+
if (ext === '.pdf') {
1448+
return this.convertPdfToMarkdown(filePath, logger);
1449+
} else if (ext === '.doc') {
1450+
return this.convertDocToMarkdown(filePath, logger);
1451+
} else if (ext === '.docx') {
1452+
return this.convertDocxToMarkdown(filePath, logger);
1453+
} else if (ext === '.html' || ext === '.htm') {
1454+
const content = fs.readFileSync(filePath, { encoding: 'utf8' });
1455+
const cleanHtml = sanitizeHtml(content, {
1456+
allowedTags: [
1457+
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'a', 'ul', 'ol',
1458+
'li', 'b', 'i', 'strong', 'em', 'code', 'pre',
1459+
'div', 'span', 'table', 'thead', 'tbody', 'tr', 'th', 'td'
1460+
],
1461+
allowedAttributes: {
1462+
'a': ['href'],
1463+
'pre': ['class', 'data-language'],
1464+
'code': ['class', 'data-language'],
1465+
'div': ['class'],
1466+
'span': ['class']
1467+
}
1468+
});
1469+
return this.turndownService.turndown(cleanHtml);
1470+
} else {
1471+
throw new Error(`Unsupported file extension for conversion: ${extension}`);
1472+
}
1473+
}
1474+
14451475
private async downloadAndConvertPdfFromUrl(url: string, logger: Logger): Promise<string> {
14461476
logger.debug(`Downloading and converting PDF from URL: ${url}`);
14471477

@@ -1599,49 +1629,29 @@ export class ContentProcessor {
15991629
let content: string;
16001630
let processedContent: string;
16011631

1602-
if (extension === '.pdf') {
1603-
// Handle PDF files
1604-
logger.debug(`Processing PDF file: ${filePath}`);
1605-
processedContent = await this.convertPdfToMarkdown(filePath, logger);
1606-
} else if (extension === '.doc') {
1607-
// Handle legacy Word DOC files
1608-
logger.debug(`Processing DOC file: ${filePath}`);
1609-
processedContent = await this.convertDocToMarkdown(filePath, logger);
1610-
} else if (extension === '.docx') {
1611-
// Handle modern Word DOCX files
1612-
logger.debug(`Processing DOCX file: ${filePath}`);
1613-
processedContent = await this.convertDocxToMarkdown(filePath, logger);
1632+
const convertibleExtensions = ['.pdf', '.doc', '.docx', '.html', '.htm'];
1633+
if (convertibleExtensions.includes(extension)) {
1634+
if (extension === '.html' || extension === '.htm') {
1635+
// For HTML, check raw file size before converting
1636+
content = fs.readFileSync(filePath, { encoding: encoding as BufferEncoding });
1637+
if (content.length > config.max_size) {
1638+
logger.warn(`File content (${content.length} chars) exceeds max size (${config.max_size}). Skipping ${filePath}.`);
1639+
skippedFiles++;
1640+
continue;
1641+
}
1642+
}
1643+
processedContent = await this.convertFileToMarkdown(filePath, extension, logger);
16141644
} else {
16151645
// Handle text-based files
16161646
content = fs.readFileSync(filePath, { encoding: encoding as BufferEncoding });
1617-
1647+
16181648
if (content.length > config.max_size) {
16191649
logger.warn(`File content (${content.length} chars) exceeds max size (${config.max_size}). Skipping ${filePath}.`);
16201650
skippedFiles++;
16211651
continue;
16221652
}
1623-
1624-
// Convert HTML to Markdown if needed
1625-
if (extension === '.html' || extension === '.htm') {
1626-
logger.debug(`Converting HTML to Markdown for ${filePath}`);
1627-
const cleanHtml = sanitizeHtml(content, {
1628-
allowedTags: [
1629-
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'a', 'ul', 'ol',
1630-
'li', 'b', 'i', 'strong', 'em', 'code', 'pre',
1631-
'div', 'span', 'table', 'thead', 'tbody', 'tr', 'th', 'td'
1632-
],
1633-
allowedAttributes: {
1634-
'a': ['href'],
1635-
'pre': ['class', 'data-language'],
1636-
'code': ['class', 'data-language'],
1637-
'div': ['class'],
1638-
'span': ['class']
1639-
}
1640-
});
1641-
processedContent = this.turndownService.turndown(cleanHtml);
1642-
} else {
1643-
processedContent = content;
1644-
}
1653+
1654+
processedContent = content;
16451655
}
16461656

16471657
// Check size limit for processed content

database.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -263,9 +263,10 @@ export class DatabaseManager {
263263
dbConnection: DatabaseConnection,
264264
repo: string,
265265
logger: Logger,
266-
embeddingDimension: number
266+
embeddingDimension: number,
267+
date: string
267268
): Promise<void> {
268-
const now = new Date().toISOString();
269+
const now = date;
269270

270271
try {
271272
if (dbConnection.type === 'sqlite') {

0 commit comments

Comments
 (0)