diff --git a/CHANGELOG.md b/CHANGELOG.md
index 419c498..5183b10 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,9 @@ All notable changes to this project will be documented in this file.
- CSS-based label filtering enables responsive toggle without any re-rendering
### Fixed
+- **Annotation projection fails on sanitized HTML (Issue #110)** - `ProjectAnnotationsOntoHtml`, `AddAnnotationToHtml`, and `RemoveAnnotationFromHtml` now handle HTML fragments with multiple root elements (e.g., DOMPurify-sanitized output) and HTML named entities (` `, `–`, etc.)
+ - Root cause: `XElement.Parse()` requires valid XML with a single root element; sanitized HTML strips ``/`
` wrappers leaving multiple roots
+ - Fix: Auto-wraps multi-root HTML in a synthetic container for parsing, unwraps on serialization; replaces common HTML entities with numeric XML equivalents
- **Table container missing top margin (Issue #108)** - Tables preceded by paragraphs with no after-spacing now get a default `margin-top: 7.5pt` for visual separation
- Also handles floating table spacing from `w:tblpPr` (`topFromText`/`bottomFromText` attributes)
- Tables preceded by paragraphs with explicit after-spacing correctly skip the default margin
diff --git a/Docxodus.Tests/ExternalAnnotationTests.cs b/Docxodus.Tests/ExternalAnnotationTests.cs
index 0aae41b..83b94cf 100644
--- a/Docxodus.Tests/ExternalAnnotationTests.cs
+++ b/Docxodus.Tests/ExternalAnnotationTests.cs
@@ -748,6 +748,176 @@ public void EA025_ProjectAnnotationsOntoHtml_ThenRemove_PreservesText()
}
#endregion
+
+ #region HTML Fragment Tests (Issue #110)
+
+ [Fact]
+ public void EA030_ProjectAnnotationsOntoHtml_MultipleRoots_DoesNotThrow()
+ {
+ // Arrange - simulate DOMPurify-sanitized HTML with multiple root elements
+ var sanitizedHtml = "";
+ var set = new ExternalAnnotationSet
+ {
+ DocumentId = "test",
+ DocumentHash = "abc",
+ Content = "Hello, world!",
+ LabelledText = new List(),
+ TextLabels = new Dictionary()
+ };
+
+ set.TextLabels["GREETING"] = new AnnotationLabel
+ {
+ Id = "GREETING",
+ Text = "Greeting",
+ Color = "#FFEB3B"
+ };
+
+ var annotation = new OpenContractsAnnotation
+ {
+ Id = "ann-frag",
+ AnnotationLabel = "GREETING",
+ RawText = "Hello",
+ Page = 0,
+ AnnotationJson = new TextSpan { Id = "ann-frag", Start = 0, End = 5, Text = "Hello" },
+ Structural = false
+ };
+ set.LabelledText.Add(annotation);
+
+ // Act - should not throw Xml_MultipleRoots
+ var result = ExternalAnnotationProjector.ProjectAnnotationsOntoHtml(sanitizedHtml, set);
+
+ // Assert
+ Assert.Contains("Hello", result);
+ Assert.Contains("data-annotation-id=\"ann-frag\"", result);
+ // Should not contain the synthetic wrapper element
+ Assert.DoesNotContain("docxodus-fragment-root", result);
+ }
+
+ [Fact]
+ public void EA031_AddAnnotationToHtml_MultipleRoots_DoesNotThrow()
+ {
+ // Arrange
+ var sanitizedHtml = "";
+ var annotation = new OpenContractsAnnotation
+ {
+ Id = "ann-add",
+ AnnotationLabel = "CLAUSE",
+ RawText = "Test",
+ Page = 0,
+ AnnotationJson = new TextSpan { Id = "ann-add", Start = 0, End = 4, Text = "Test" },
+ Structural = false
+ };
+ var label = new AnnotationLabel { Id = "CLAUSE", Text = "Clause", Color = "#FF5722" };
+
+ // Act
+ var result = ExternalAnnotationProjector.AddAnnotationToHtml(
+ sanitizedHtml, annotation, label);
+
+ // Assert
+ Assert.Contains("data-annotation-id=\"ann-add\"", result);
+ Assert.DoesNotContain("docxodus-fragment-root", result);
+ }
+
+ [Fact]
+ public void EA032_RemoveAnnotationFromHtml_MultipleRoots_DoesNotThrow()
+ {
+ // Arrange - HTML fragment with an annotation already projected
+ var htmlWithAnnotation = "" +
+ "" +
+ "Hello, world!
";
+
+ // Act
+ var result = ExternalAnnotationProjector.RemoveAnnotationFromHtml(
+ htmlWithAnnotation, "ann-rm");
+
+ // Assert
+ Assert.DoesNotContain("data-annotation-id=\"ann-rm\"", result);
+ Assert.Contains("Hello", result);
+ Assert.DoesNotContain("docxodus-fragment-root", result);
+ }
+
+ [Fact]
+ public void EA033_ProjectAnnotationsOntoHtml_HtmlEntities_DoesNotThrow()
+ {
+ // Arrange - HTML with named entities that are invalid in XML
+ var htmlWithEntities = "Price is 5 dollars – cheap!
";
+ var set = new ExternalAnnotationSet
+ {
+ DocumentId = "test",
+ DocumentHash = "abc",
+ Content = "Price is 5\u00A0dollars \u2013 cheap!",
+ LabelledText = new List(),
+ TextLabels = new Dictionary()
+ };
+
+ // Act - should not throw Xml_UndeclaredEntity
+ var result = ExternalAnnotationProjector.ProjectAnnotationsOntoHtml(htmlWithEntities, set);
+
+ // Assert
+ Assert.Contains("dollars", result);
+ }
+
+ [Fact]
+ public void EA034_ProjectAnnotationsOntoHtml_NbspEntity_DoesNotThrow()
+ {
+ // Arrange - HTML with entity (most common case)
+ var htmlWithNbsp = "";
+ var set = new ExternalAnnotationSet
+ {
+ DocumentId = "test",
+ DocumentHash = "abc",
+ Content = "Hello\u00A0world!",
+ LabelledText = new List(),
+ TextLabels = new Dictionary()
+ };
+
+ // Act
+ var result = ExternalAnnotationProjector.ProjectAnnotationsOntoHtml(htmlWithNbsp, set);
+
+ // Assert
+ Assert.Contains("world", result);
+ }
+
+ [Fact]
+ public void EA035_ProjectAnnotationsOntoHtml_SingleRoot_StillWorks()
+ {
+ // Arrange - standard single-root HTML should still work
+ var html = "Hello, world!
";
+ var set = new ExternalAnnotationSet
+ {
+ DocumentId = "test",
+ DocumentHash = "abc",
+ Content = "Hello, world!",
+ LabelledText = new List(),
+ TextLabels = new Dictionary()
+ };
+
+ set.TextLabels["GREETING"] = new AnnotationLabel
+ {
+ Id = "GREETING",
+ Text = "Greeting",
+ Color = "#FFEB3B"
+ };
+
+ var annotation = new OpenContractsAnnotation
+ {
+ Id = "ann-single-root",
+ AnnotationLabel = "GREETING",
+ RawText = "Hello",
+ Page = 0,
+ AnnotationJson = new TextSpan { Id = "ann-single-root", Start = 0, End = 5, Text = "Hello" },
+ Structural = false
+ };
+ set.LabelledText.Add(annotation);
+
+ // Act
+ var result = ExternalAnnotationProjector.ProjectAnnotationsOntoHtml(html, set);
+
+ // Assert
+ Assert.Contains("data-annotation-id=\"ann-single-root\"", result);
+ }
+
+ #endregion
}
}
diff --git a/Docxodus/ExternalAnnotationProjector.cs b/Docxodus/ExternalAnnotationProjector.cs
index fcae55b..816e980 100644
--- a/Docxodus/ExternalAnnotationProjector.cs
+++ b/Docxodus/ExternalAnnotationProjector.cs
@@ -390,9 +390,9 @@ public static string ProjectAnnotationsOntoHtml(
if (annotationSet == null) throw new ArgumentNullException(nameof(annotationSet));
settings ??= new ExternalAnnotationProjectionSettings();
- var htmlDoc = XElement.Parse(html);
+ var htmlDoc = ParseHtmlString(html, out var wasWrapped);
var result = ProjectAnnotations(htmlDoc, annotationSet, settings);
- return result.ToString();
+ return SerializeHtmlString(result, wasWrapped);
}
///
@@ -414,7 +414,7 @@ public static string AddAnnotationToHtml(
if (annotation == null) throw new ArgumentNullException(nameof(annotation));
settings ??= new ExternalAnnotationProjectionSettings();
- var htmlDoc = XElement.Parse(html);
+ var htmlDoc = ParseHtmlString(html, out var wasWrapped);
// Build text map and find annotation location
var textMap = BuildTextMap(htmlDoc);
@@ -449,7 +449,7 @@ public static string AddAnnotationToHtml(
AddSingleAnnotationCss(htmlDoc, annotation, label, settings);
}
- return htmlDoc.ToString();
+ return SerializeHtmlString(htmlDoc, wasWrapped);
}
///
@@ -468,7 +468,7 @@ public static string RemoveAnnotationFromHtml(
if (string.IsNullOrEmpty(html)) throw new ArgumentNullException(nameof(html));
if (string.IsNullOrEmpty(annotationId)) throw new ArgumentNullException(nameof(annotationId));
- var htmlDoc = XElement.Parse(html);
+ var htmlDoc = ParseHtmlString(html, out var wasWrapped);
// Find all spans with data-annotation-id matching
var annotationSpans = htmlDoc.Descendants("span")
@@ -504,7 +504,7 @@ public static string RemoveAnnotationFromHtml(
}
}
- return htmlDoc.ToString();
+ return SerializeHtmlString(htmlDoc, wasWrapped);
}
///
@@ -590,6 +590,99 @@ private static void AddSingleAnnotationCss(
#endregion
+ #region HTML Fragment Parsing
+
+ // Synthetic wrapper element used to handle HTML fragments with multiple root elements.
+ // XElement.Parse() requires a single root, but sanitized HTML (e.g., DOMPurify output)
+ // often has multiple top-level elements like