@@ -304,6 +304,11 @@ fn process_node_to_markdown(
304304 scraper:: Node :: Element ( elem) => {
305305 let tag = elem. name . local . as_ref ( ) ;
306306 if let Some ( element_ref) = scraper:: ElementRef :: wrap ( child) {
307+ // Skip hidden elements
308+ if is_element_hidden ( elem) {
309+ continue ;
310+ }
311+
307312 match tag {
308313 // Skip unwanted elements
309314 "script" | "style" | "nav" | "footer" | "header" | "aside" | "noscript" => {
@@ -683,6 +688,11 @@ fn extract_text_content(node: scraper::ElementRef, output: &mut String) {
683688 scraper:: Node :: Element ( elem) => {
684689 let tag = elem. name . local . as_ref ( ) ;
685690 if let Some ( element_ref) = scraper:: ElementRef :: wrap ( child) {
691+ // Skip hidden elements
692+ if is_element_hidden ( elem) {
693+ continue ;
694+ }
695+
686696 match tag {
687697 // Skip unwanted elements
688698 "script" | "style" | "nav" | "footer" | "header" | "aside" | "noscript" => {
@@ -830,6 +840,35 @@ fn unwrap_tags(html: &str, tag: &str) -> String {
830840 output
831841}
832842
843+ /// Check if an HTML element is hidden via various visibility attributes.
844+ /// Returns true if the element should be skipped during content extraction.
845+ fn is_element_hidden ( elem : & scraper:: node:: Element ) -> bool {
846+ // Check for hidden attribute
847+ if elem. attr ( "hidden" ) . is_some ( ) {
848+ return true ;
849+ }
850+
851+ // Check for aria-hidden="true"
852+ if elem. attr ( "aria-hidden" ) == Some ( "true" ) {
853+ return true ;
854+ }
855+
856+ // Check for style attribute containing visibility-hiding properties
857+ if let Some ( style) = elem. attr ( "style" ) {
858+ let style_lower = style. to_lowercase ( ) ;
859+ // Check for display:none (with or without spaces)
860+ if style_lower. contains ( "display:none" ) || style_lower. contains ( "display: none" ) {
861+ return true ;
862+ }
863+ // Check for visibility:hidden (with or without spaces)
864+ if style_lower. contains ( "visibility:hidden" ) || style_lower. contains ( "visibility: hidden" ) {
865+ return true ;
866+ }
867+ }
868+
869+ false
870+ }
871+
833872/// Normalize whitespace in text.
834873fn normalize_whitespace ( text : & str ) -> String {
835874 let mut result = String :: new ( ) ;
@@ -1003,4 +1042,59 @@ mod tests {
10031042 let result2 = normalize_whitespace ( "no\n \n extra\t \t spaces" ) ;
10041043 assert ! ( result2. contains( "no" ) && result2. contains( "extra" ) && result2. contains( "spaces" ) ) ;
10051044 }
1045+
#[test]
fn test_hidden_elements_excluded_from_markdown() {
    // Each case pairs an HTML snippet with the text of the paragraph that
    // must NOT appear in the markdown output. Every snippet also contains a
    // visible paragraph that must survive.
    let cases = [
        // `hidden` attribute
        (
            r#"<p>Visible</p><p hidden>Hidden by attribute</p>"#,
            "Hidden by attribute",
        ),
        // style="display:none"
        (
            r#"<p>Visible</p><p style="display:none">Hidden by display none</p>"#,
            "Hidden by display none",
        ),
        // style="display: none" (with space)
        (
            r#"<p>Visible</p><p style="display: none">Hidden by display none with space</p>"#,
            "Hidden by display none with space",
        ),
        // style="visibility:hidden"
        (
            r#"<p>Visible</p><p style="visibility:hidden">Hidden by visibility</p>"#,
            "Hidden by visibility",
        ),
        // aria-hidden="true"
        (
            r#"<p>Visible</p><p aria-hidden="true">Hidden by aria</p>"#,
            "Hidden by aria",
        ),
    ];

    for &(html, hidden_text) in &cases {
        let md = html_to_markdown(html, false, false);
        assert!(md.contains("Visible"), "visible text missing for: {}", html);
        assert!(!md.contains(hidden_text), "hidden text leaked for: {}", html);
    }
}
1079+
#[test]
fn test_hidden_elements_excluded_from_text() {
    // Each case pairs an HTML snippet with the text of the paragraph that
    // must NOT appear in the plain-text output. Every snippet also contains a
    // visible paragraph that must survive. The cases mirror the ones checked
    // for markdown output so both extraction paths are covered equally.
    let cases = [
        // `hidden` attribute
        (
            r#"<p>Visible</p><p hidden>Hidden by attribute</p>"#,
            "Hidden by attribute",
        ),
        // style="display:none"
        (
            r#"<p>Visible</p><p style="display:none">Hidden by display none</p>"#,
            "Hidden by display none",
        ),
        // style="display: none" (with space)
        (
            r#"<p>Visible</p><p style="display: none">Hidden by display none with space</p>"#,
            "Hidden by display none with space",
        ),
        // style="visibility:hidden"
        (
            r#"<p>Visible</p><p style="visibility:hidden">Hidden by visibility</p>"#,
            "Hidden by visibility",
        ),
        // aria-hidden="true"
        (
            r#"<p>Visible</p><p aria-hidden="true">Hidden by aria</p>"#,
            "Hidden by aria",
        ),
    ];

    for &(html, hidden_text) in &cases {
        let text = html_to_text(html);
        assert!(text.contains("Visible"), "visible text missing for: {}", html);
        assert!(!text.contains(hidden_text), "hidden text leaked for: {}", html);
    }
}
10061100}
0 commit comments