From 47369b60913186ecac2d5dbb88ceeb23d2d319ae Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 21 Mar 2026 00:15:33 +0000 Subject: [PATCH] Fix annotation projection failing on sanitized HTML with multiple roots (#110) XElement.Parse() requires valid XML with a single root element, but sanitized HTML (e.g., DOMPurify output) strips / wrappers, leaving multiple top-level elements. Also handle HTML named entities ( , –, etc.) that are invalid in XML. https://claude.ai/code/session_01B3Yy9hY9o3iUBf5KD8wXUP --- CHANGELOG.md | 3 + Docxodus.Tests/ExternalAnnotationTests.cs | 170 ++++++++++++++++++++++ Docxodus/ExternalAnnotationProjector.cs | 105 ++++++++++++- 3 files changed, 272 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 419c498..5183b10 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,9 @@ All notable changes to this project will be documented in this file. - CSS-based label filtering enables responsive toggle without any re-rendering ### Fixed +- **Annotation projection fails on sanitized HTML (Issue #110)** - `ProjectAnnotationsOntoHtml`, `AddAnnotationToHtml`, and `RemoveAnnotationFromHtml` now handle HTML fragments with multiple root elements (e.g., DOMPurify-sanitized output) and HTML named entities (` `, `–`, etc.) + - Root cause: `XElement.Parse()` requires valid XML with a single root element; sanitized HTML strips ``/`` wrappers leaving multiple roots + - Fix: Auto-wraps multi-root HTML in a synthetic container for parsing, unwraps on serialization; replaces common HTML entities with numeric XML equivalents - **Table container missing top margin (Issue #108)** - Tables preceded by paragraphs with no after-spacing now get a default `margin-top: 7.5pt` for visual separation - Also handles floating table spacing from `w:tblpPr` (`topFromText`/`bottomFromText` attributes) - Tables preceded by paragraphs with explicit after-spacing correctly skip the default margin diff --git a/Docxodus.Tests/ExternalAnnotationTests.cs b/Docxodus.Tests/ExternalAnnotationTests.cs index 0aae41b..83b94cf 100644 --- a/Docxodus.Tests/ExternalAnnotationTests.cs +++ b/Docxodus.Tests/ExternalAnnotationTests.cs @@ -748,6 +748,176 @@ public void EA025_ProjectAnnotationsOntoHtml_ThenRemove_PreservesText() } #endregion + + #region HTML Fragment Tests (Issue #110) + + [Fact] + public void EA030_ProjectAnnotationsOntoHtml_MultipleRoots_DoesNotThrow() + { + // Arrange - simulate DOMPurify-sanitized HTML with multiple root elements + var sanitizedHtml = "

Hello, world!

"; + var set = new ExternalAnnotationSet + { + DocumentId = "test", + DocumentHash = "abc", + Content = "Hello, world!", + LabelledText = new List(), + TextLabels = new Dictionary() + }; + + set.TextLabels["GREETING"] = new AnnotationLabel + { + Id = "GREETING", + Text = "Greeting", + Color = "#FFEB3B" + }; + + var annotation = new OpenContractsAnnotation + { + Id = "ann-frag", + AnnotationLabel = "GREETING", + RawText = "Hello", + Page = 0, + AnnotationJson = new TextSpan { Id = "ann-frag", Start = 0, End = 5, Text = "Hello" }, + Structural = false + }; + set.LabelledText.Add(annotation); + + // Act - should not throw Xml_MultipleRoots + var result = ExternalAnnotationProjector.ProjectAnnotationsOntoHtml(sanitizedHtml, set); + + // Assert + Assert.Contains("Hello", result); + Assert.Contains("data-annotation-id=\"ann-frag\"", result); + // Should not contain the synthetic wrapper element + Assert.DoesNotContain("docxodus-fragment-root", result); + } + + [Fact] + public void EA031_AddAnnotationToHtml_MultipleRoots_DoesNotThrow() + { + // Arrange + var sanitizedHtml = "

Test content here.

"; + var annotation = new OpenContractsAnnotation + { + Id = "ann-add", + AnnotationLabel = "CLAUSE", + RawText = "Test", + Page = 0, + AnnotationJson = new TextSpan { Id = "ann-add", Start = 0, End = 4, Text = "Test" }, + Structural = false + }; + var label = new AnnotationLabel { Id = "CLAUSE", Text = "Clause", Color = "#FF5722" }; + + // Act + var result = ExternalAnnotationProjector.AddAnnotationToHtml( + sanitizedHtml, annotation, label); + + // Assert + Assert.Contains("data-annotation-id=\"ann-add\"", result); + Assert.DoesNotContain("docxodus-fragment-root", result); + } + + [Fact] + public void EA032_RemoveAnnotationFromHtml_MultipleRoots_DoesNotThrow() + { + // Arrange - HTML fragment with an annotation already projected + var htmlWithAnnotation = "

" + + "" + + "Hello, world!

"; + + // Act + var result = ExternalAnnotationProjector.RemoveAnnotationFromHtml( + htmlWithAnnotation, "ann-rm"); + + // Assert + Assert.DoesNotContain("data-annotation-id=\"ann-rm\"", result); + Assert.Contains("Hello", result); + Assert.DoesNotContain("docxodus-fragment-root", result); + } + + [Fact] + public void EA033_ProjectAnnotationsOntoHtml_HtmlEntities_DoesNotThrow() + { + // Arrange - HTML with named entities that are invalid in XML + var htmlWithEntities = "

Price is 5 dollars – cheap!

"; + var set = new ExternalAnnotationSet + { + DocumentId = "test", + DocumentHash = "abc", + Content = "Price is 5\u00A0dollars \u2013 cheap!", + LabelledText = new List(), + TextLabels = new Dictionary() + }; + + // Act - should not throw Xml_UndeclaredEntity + var result = ExternalAnnotationProjector.ProjectAnnotationsOntoHtml(htmlWithEntities, set); + + // Assert + Assert.Contains("dollars", result); + } + + [Fact] + public void EA034_ProjectAnnotationsOntoHtml_NbspEntity_DoesNotThrow() + { + // Arrange - HTML with   entity (most common case) + var htmlWithNbsp = "

Hello world!

"; + var set = new ExternalAnnotationSet + { + DocumentId = "test", + DocumentHash = "abc", + Content = "Hello\u00A0world!", + LabelledText = new List(), + TextLabels = new Dictionary() + }; + + // Act + var result = ExternalAnnotationProjector.ProjectAnnotationsOntoHtml(htmlWithNbsp, set); + + // Assert + Assert.Contains("world", result); + } + + [Fact] + public void EA035_ProjectAnnotationsOntoHtml_SingleRoot_StillWorks() + { + // Arrange - standard single-root HTML should still work + var html = "

Hello, world!

"; + var set = new ExternalAnnotationSet + { + DocumentId = "test", + DocumentHash = "abc", + Content = "Hello, world!", + LabelledText = new List(), + TextLabels = new Dictionary() + }; + + set.TextLabels["GREETING"] = new AnnotationLabel + { + Id = "GREETING", + Text = "Greeting", + Color = "#FFEB3B" + }; + + var annotation = new OpenContractsAnnotation + { + Id = "ann-single-root", + AnnotationLabel = "GREETING", + RawText = "Hello", + Page = 0, + AnnotationJson = new TextSpan { Id = "ann-single-root", Start = 0, End = 5, Text = "Hello" }, + Structural = false + }; + set.LabelledText.Add(annotation); + + // Act + var result = ExternalAnnotationProjector.ProjectAnnotationsOntoHtml(html, set); + + // Assert + Assert.Contains("data-annotation-id=\"ann-single-root\"", result); + } + + #endregion } } diff --git a/Docxodus/ExternalAnnotationProjector.cs b/Docxodus/ExternalAnnotationProjector.cs index fcae55b..816e980 100644 --- a/Docxodus/ExternalAnnotationProjector.cs +++ b/Docxodus/ExternalAnnotationProjector.cs @@ -390,9 +390,9 @@ public static string ProjectAnnotationsOntoHtml( if (annotationSet == null) throw new ArgumentNullException(nameof(annotationSet)); settings ??= new ExternalAnnotationProjectionSettings(); - var htmlDoc = XElement.Parse(html); + var htmlDoc = ParseHtmlString(html, out var wasWrapped); var result = ProjectAnnotations(htmlDoc, annotationSet, settings); - return result.ToString(); + return SerializeHtmlString(result, wasWrapped); } /// @@ -414,7 +414,7 @@ public static string AddAnnotationToHtml( if (annotation == null) throw new ArgumentNullException(nameof(annotation)); settings ??= new ExternalAnnotationProjectionSettings(); - var htmlDoc = XElement.Parse(html); + var htmlDoc = ParseHtmlString(html, out var wasWrapped); // Build text map and find annotation location var textMap = BuildTextMap(htmlDoc); @@ -449,7 +449,7 @@ public static string AddAnnotationToHtml( AddSingleAnnotationCss(htmlDoc, annotation, label, settings); } - return htmlDoc.ToString(); + return SerializeHtmlString(htmlDoc, wasWrapped); } /// @@ -468,7 +468,7 @@ public static string RemoveAnnotationFromHtml( if (string.IsNullOrEmpty(html)) throw new ArgumentNullException(nameof(html)); if (string.IsNullOrEmpty(annotationId)) throw new ArgumentNullException(nameof(annotationId)); - var htmlDoc = XElement.Parse(html); + var htmlDoc = ParseHtmlString(html, out var wasWrapped); // Find all spans with data-annotation-id matching var annotationSpans = htmlDoc.Descendants("span") @@ -504,7 +504,7 @@ public static string RemoveAnnotationFromHtml( } } - return htmlDoc.ToString(); + return SerializeHtmlString(htmlDoc, wasWrapped); } /// @@ -590,6 +590,99 @@ private static void AddSingleAnnotationCss( #endregion + #region HTML Fragment Parsing + + // Synthetic wrapper element used to handle HTML fragments with multiple root elements. + // XElement.Parse() requires a single root, but sanitized HTML (e.g., DOMPurify output) + // often has multiple top-level elements like