forked from phildougherty/local_tts_reader
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtextProcessor.js
More file actions
101 lines (82 loc) · 3.18 KB
/
textProcessor.js
File metadata and controls
101 lines (82 loc) · 3.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/**
 * Text processor for TTS.
 * - Removes markdown formatting (headers, list markers, emphasis, code, links)
 * - Strips HTML tags
 * - Replaces URLs with a short speakable placeholder
 * - Removes or spells out special characters that don't read well
 */
class TextProcessor {
  /**
   * Process text for TTS.
   *
   * The order of the steps matters: line-anchored rules (headers, list
   * markers) run first while the text still contains newlines, HTML is
   * stripped before URL detection so attribute values are not mistaken for
   * visible links, and whitespace is normalised last so that no earlier
   * replacement can leave double or trailing spaces behind.
   *
   * @param {string} text - The text to process. Falsy values yield ''; other
   *   non-string values are converted with String().
   * @returns {string} - The processed, single-line text
   */
  static process(text) {
    if (!text) return '';

    // Coerce so a truthy non-string (e.g. a number) is read out instead of
    // throwing on `.replace`.
    let processedText = String(text);

    // Remove markdown headers
    processedText = processedText.replace(/^#{1,6}\s+(.+)$/gm, '$1');

    // Remove markdown list markers. These patterns are anchored to line
    // starts, so they must run before newlines are collapsed.
    processedText = processedText.replace(/^[\s-]*[-*+]\s+/gm, '');
    processedText = processedText.replace(/^\s*\d+\.\s+/gm, '');

    // Remove markdown bold/italic
    processedText = processedText.replace(/(\*\*|__)(.*?)\1/g, '$2');
    processedText = processedText.replace(/(\*|_)(.*?)\1/g, '$2');

    // Remove markdown code blocks; keep the content of inline code
    processedText = processedText.replace(/```[\s\S]*?```/g, 'code block omitted');
    processedText = processedText.replace(/`([^`]+)`/g, '$1');

    // Unwrap autolinks (<https://...>) so the tag stripper below does not
    // discard them, then remove HTML tags. This has to happen before URL
    // processing: otherwise a URL inside an attribute swallows the rest of
    // the tag and the element's visible text.
    processedText = processedText.replace(/<(https?:\/\/[^\s<>]+)>/g, '$1');
    processedText = processedText.replace(/<[^>]*>/g, '');

    // Remove markdown links and images but keep the link text / alt text
    processedText = processedText.replace(/!?\[([^\]]+)\]\(([^)]+)\)/g, '$1');

    // Process URLs. Referenced through the class rather than `this` so that
    // `process` still works when passed around as an unbound callback.
    processedText = TextProcessor.processUrls(processedText);

    // Remove special characters that don't read well
    processedText = processedText.replace(/[|*~`]/g, ' ');

    // Replace common symbols with words
    processedText = processedText.replace(/&/g, ' and ');
    processedText = processedText.replace(/\$/g, ' dollars ');
    processedText = processedText.replace(/%/g, ' percent ');
    processedText = processedText.replace(/\^/g, ' ');

    // Replace multiple dots with a single period
    processedText = processedText.replace(/\.{2,}/g, '.');

    // Remove excessive whitespace. Done last because the replacements above
    // insert padding spaces of their own.
    return processedText.replace(/\s+/g, ' ').trim();
  }

  /**
   * Process URLs in text, replacing each http(s) URL with a placeholder such
   * as "[Example dot com link]".
   *
   * Punctuation that trails a URL (e.g. the period ending a sentence or a
   * closing parenthesis) is not treated as part of the URL and is kept after
   * the placeholder. Single-label hosts become "[Localhost link]"; IP
   * addresses and unparsable URLs become "[web link]".
   *
   * @param {string} text - The text containing URLs
   * @returns {string} - Text with processed URLs
   */
  static processUrls(text) {
    if (!text) return '';

    // `<` and `>` are excluded so leftover angle brackets never end up
    // inside a matched URL.
    const urlRegex = /https?:\/\/[^\s<>]+/g;
    const trailingPunctuation = /[.,;:!?'")\]}]+$/;
    const ipv4 = /^\d{1,3}(\.\d{1,3}){3}$/;
    const capitalize = (word) => word.charAt(0).toUpperCase() + word.slice(1);

    return String(text).replace(urlRegex, (match) => {
      // Split sentence punctuation off the end of the match.
      const trailingMatch = trailingPunctuation.exec(match);
      const trailing = trailingMatch ? trailingMatch[0] : '';
      const url = match.slice(0, match.length - trailing.length);

      try {
        const hostname = new URL(url).hostname;

        // IP addresses have no meaningful "name dot tld" reading.
        if (hostname.startsWith('[') || ipv4.test(hostname)) {
          return `[web link]${trailing}`;
        }

        // Drop empty labels produced by a trailing dot in the hostname.
        const labels = hostname.split('.').filter(Boolean);
        if (labels.length === 0) {
          return `[web link]${trailing}`;
        }

        const topLevel = labels[labels.length - 1];
        if (labels.length === 1) {
          // Single-label host such as "localhost"
          return `[${capitalize(topLevel)} link]${trailing}`;
        }

        // The label right before the last one is the main domain part;
        // this also skips "www" and any other subdomain.
        const domainName = capitalize(labels[labels.length - 2]);
        return `[${domainName} dot ${topLevel} link]${trailing}`;
      } catch (e) {
        // Deliberate best effort: if URL parsing fails, fall back to a
        // generic placeholder instead of aborting the whole text.
        return `[web link]${trailing}`;
      }
    });
  }
}
// Make the class available globally. Guarded because `window` is not defined
// outside a window context (e.g. in a worker or a Node-based test run), where
// the bare assignment would throw a ReferenceError while the script loads.
if (typeof window !== 'undefined') {
  window.TextProcessor = TextProcessor;
}