Skip to content

Commit c5fd732

Browse files
factorydroidechobt
authored and committed
fix(cli): exclude hidden elements from scrape command output
Fixes bounty issue #1707. The scrape command was including text from hidden DOM elements in its output. This fix adds visibility checks before processing elements, skipping content that is hidden via any of: the `hidden` attribute; `style="display:none"`; `style="visibility:hidden"`; `aria-hidden="true"`.
1 parent ca80056 commit c5fd732

1 file changed

Lines changed: 94 additions & 0 deletions

File tree

cortex-cli/src/scrape_cmd.rs

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,11 @@ fn process_node_to_markdown(
304304
scraper::Node::Element(elem) => {
305305
let tag = elem.name.local.as_ref();
306306
if let Some(element_ref) = scraper::ElementRef::wrap(child) {
307+
// Skip hidden elements
308+
if is_element_hidden(elem) {
309+
continue;
310+
}
311+
307312
match tag {
308313
// Skip unwanted elements
309314
"script" | "style" | "nav" | "footer" | "header" | "aside" | "noscript" => {
@@ -683,6 +688,11 @@ fn extract_text_content(node: scraper::ElementRef, output: &mut String) {
683688
scraper::Node::Element(elem) => {
684689
let tag = elem.name.local.as_ref();
685690
if let Some(element_ref) = scraper::ElementRef::wrap(child) {
691+
// Skip hidden elements
692+
if is_element_hidden(elem) {
693+
continue;
694+
}
695+
686696
match tag {
687697
// Skip unwanted elements
688698
"script" | "style" | "nav" | "footer" | "header" | "aside" | "noscript" => {
@@ -830,6 +840,35 @@ fn unwrap_tags(html: &str, tag: &str) -> String {
830840
output
831841
}
832842

843+
/// Check if an HTML element is hidden via various visibility attributes.
844+
/// Returns true if the element should be skipped during content extraction.
845+
fn is_element_hidden(elem: &scraper::node::Element) -> bool {
846+
// Check for hidden attribute
847+
if elem.attr("hidden").is_some() {
848+
return true;
849+
}
850+
851+
// Check for aria-hidden="true"
852+
if elem.attr("aria-hidden") == Some("true") {
853+
return true;
854+
}
855+
856+
// Check for style attribute containing visibility-hiding properties
857+
if let Some(style) = elem.attr("style") {
858+
let style_lower = style.to_lowercase();
859+
// Check for display:none (with or without spaces)
860+
if style_lower.contains("display:none") || style_lower.contains("display: none") {
861+
return true;
862+
}
863+
// Check for visibility:hidden (with or without spaces)
864+
if style_lower.contains("visibility:hidden") || style_lower.contains("visibility: hidden") {
865+
return true;
866+
}
867+
}
868+
869+
false
870+
}
871+
833872
/// Normalize whitespace in text.
834873
fn normalize_whitespace(text: &str) -> String {
835874
let mut result = String::new();
@@ -1003,4 +1042,59 @@ mod tests {
10031042
let result2 = normalize_whitespace("no\n\nextra\t\tspaces");
10041043
assert!(result2.contains("no") && result2.contains("extra") && result2.contains("spaces"));
10051044
}
1045+
1046+
#[test]
fn test_hidden_elements_excluded_from_markdown() {
    /// Render `html` to markdown, then require that the visible paragraph
    /// survives and that `hidden_text` does not appear in the output.
    /// `#[track_caller]` keeps a failure pointing at the calling case below.
    #[track_caller]
    fn expect_hidden(html: &str, hidden_text: &str) {
        let md = html_to_markdown(html, false, false);
        assert!(md.contains("Visible"));
        assert!(!md.contains(hidden_text));
    }

    // `hidden` attribute
    expect_hidden(
        r#"<p>Visible</p><p hidden>Hidden by attribute</p>"#,
        "Hidden by attribute",
    );

    // style="display:none"
    expect_hidden(
        r#"<p>Visible</p><p style="display:none">Hidden by display none</p>"#,
        "Hidden by display none",
    );

    // style="display: none" (with space)
    expect_hidden(
        r#"<p>Visible</p><p style="display: none">Hidden by display none with space</p>"#,
        "Hidden by display none with space",
    );

    // style="visibility:hidden"
    expect_hidden(
        r#"<p>Visible</p><p style="visibility:hidden">Hidden by visibility</p>"#,
        "Hidden by visibility",
    );

    // aria-hidden="true"
    expect_hidden(
        r#"<p>Visible</p><p aria-hidden="true">Hidden by aria</p>"#,
        "Hidden by aria",
    );
}
1079+
1080+
#[test]
fn test_hidden_elements_excluded_from_text() {
    // Each case pairs an HTML snippet with the text that must NOT appear in
    // the plain-text output. The list mirrors the variants exercised by the
    // markdown test so both extraction paths are checked against the same
    // hiding mechanisms.
    let cases = [
        // `hidden` attribute
        (
            r#"<p>Visible</p><p hidden>Hidden by attribute</p>"#,
            "Hidden by attribute",
        ),
        // style="display:none"
        (
            r#"<p>Visible</p><p style="display:none">Hidden by display none</p>"#,
            "Hidden by display none",
        ),
        // style="display: none" (with space)
        (
            r#"<p>Visible</p><p style="display: none">Hidden by display none with space</p>"#,
            "Hidden by display none with space",
        ),
        // style="visibility:hidden"
        (
            r#"<p>Visible</p><p style="visibility:hidden">Hidden by visibility</p>"#,
            "Hidden by visibility",
        ),
        // aria-hidden="true"
        (
            r#"<p>Visible</p><p aria-hidden="true">Hidden by aria</p>"#,
            "Hidden by aria",
        ),
    ];

    for &(html, hidden_text) in &cases {
        let text = html_to_text(html);
        // The messages name the failing snippet, since every case shares these lines.
        assert!(
            text.contains("Visible"),
            "visible text missing from output for: {}",
            html
        );
        assert!(
            !text.contains(hidden_text),
            "hidden text leaked into output for: {}",
            html
        );
    }
}
10061100
}

0 commit comments

Comments
 (0)