; close the
+ // last open one regardless of which counter tracked it.
+ if (pageCount + slPageCount > 0) {
doFooter();
xhtml.endElement("div");
}
@@ -85,7 +87,17 @@ public void startElement(String uri, String localName, String qName, Attributes
} else if ("sf:metadata".equals(qName)) {
inPart = DocumentPart.METADATA;
} else if ("sf:page-start".equals(qName) || "sl:page-group".equals(qName)) {
- if (pageCount > 0) {
+ // sf:p paragraphs can span page boundaries in iWork's XML schema.
+ // If a <p> is still open when the page changes, force-close it
+ // before the page </div> and reopen it in the new page so both
+ // pages have balanced tag pairs.
+ boolean reopenP = inPart == DocumentPart.PARSABLE_TEXT;
+ if (reopenP) {
+ xhtml.endElement("p");
+ }
+ // Use the combined counter so we close the prior <div> regardless
+ // of whether it was opened by sf:page-start or sl:page-group.
+ if (pageCount + slPageCount > 0) {
doFooter();
xhtml.endElement("div");
}
@@ -96,6 +108,9 @@ public void startElement(String uri, String localName, String qName, Attributes
pageCount++;
}
doHeader();
+ if (reopenP) {
+ xhtml.startElement("p");
+ }
} else if ("sf:p".equals(qName)) {
if (pageCount + slPageCount > 0) {
inPart = DocumentPart.PARSABLE_TEXT;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
index 676575ce405..5aeb992188e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
@@ -77,53 +77,60 @@ public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata
ParseContext context) throws IOException, SAXException, TikaException {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
- Last5 l5 = new Last5();
- int read;
+ xhtml.startDocument();
+ // try/finally so endDocument fires even if a header read, characters
+ // emit, or recursive helper throws IOException/SAXException/TikaException
+ // mid-parse. Without this the captured XHTML would be left unterminated.
+ try {
+ Last5 l5 = new Last5();
+ int read;
- // Try to get the creation date, which is YYYYMMDDhhmm
- byte[] header = new byte[30];
- IOUtils.readFully(tis, header);
- byte[] date = new byte[12];
- IOUtils.readFully(tis, date);
+ // Try to get the creation date, which is YYYYMMDDhhmm
+ byte[] header = new byte[30];
+ IOUtils.readFully(tis, header);
+ byte[] date = new byte[12];
+ IOUtils.readFully(tis, date);
- String dateStr = new String(date, US_ASCII);
- if (dateStr.startsWith("19") || dateStr.startsWith("20")) {
- String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4, 6) + "-" +
- dateStr.substring(6, 8) + "T" + dateStr.substring(8, 10) + ":" +
- dateStr.substring(10, 12) + ":00";
- metadata.set(TikaCoreProperties.CREATED, formattedDate);
- // TODO Metadata.DATE is used as modified, should it be here?
- metadata.set(TikaCoreProperties.CREATED, formattedDate);
- }
- metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
+ String dateStr = new String(date, US_ASCII);
+ if (dateStr.startsWith("19") || dateStr.startsWith("20")) {
+ String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4, 6) + "-" +
+ dateStr.substring(6, 8) + "T" + dateStr.substring(8, 10) + ":" +
+ dateStr.substring(10, 12) + ":00";
+ metadata.set(TikaCoreProperties.CREATED, formattedDate);
+ metadata.set(TikaCoreProperties.MODIFIED, formattedDate);
+ }
+ metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
- // The description, if set, is the next up-to-500 bytes
- byte[] desc = new byte[500];
- IOUtils.readFully(tis, desc);
- String description = extractText(desc, true);
- if (description.length() > 0) {
- metadata.set(TikaCoreProperties.DESCRIPTION, description);
- }
+ // The description, if set, is the next up-to-500 bytes
+ byte[] desc = new byte[500];
+ IOUtils.readFully(tis, desc);
+ String description = extractText(desc, true);
+ if (description.length() > 0) {
+ metadata.set(TikaCoreProperties.DESCRIPTION, description);
+ }
- // Now look for text
- while ((read = tis.read()) > -1) {
- if (read == 0xe0 || read == 0xe3 || read == 0xf0) {
- int nread = tis.read();
- if (nread == 0x3f || nread == 0xbf) {
- // Looks promising, check back for a suitable value
- if (read == 0xe3 && nread == 0x3f) {
- if (l5.is33()) {
- // Bingo, note text
- handleNoteText(tis, xhtml);
+ // Now look for text
+ while ((read = tis.read()) > -1) {
+ if (read == 0xe0 || read == 0xe3 || read == 0xf0) {
+ int nread = tis.read();
+ if (nread == 0x3f || nread == 0xbf) {
+ // Looks promising, check back for a suitable value
+ if (read == 0xe3 && nread == 0x3f) {
+ if (l5.is33()) {
+ // Bingo, note text
+ handleNoteText(tis, xhtml);
+ }
+ } else if (l5.is00()) {
+ // Likely view name
+ handleViewName(read, nread, tis, xhtml, l5);
}
- } else if (l5.is00()) {
- // Likely view name
- handleViewName(read, nread, tis, xhtml, l5);
}
+ } else {
+ l5.record(read);
}
- } else {
- l5.record(read);
}
+ } finally {
+ xhtml.endDocument();
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
index e5ef13da29f..27c0b92efb2 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
@@ -165,6 +165,21 @@ private String parserAuthor(String line) {
}
private static class TikaNodeFilter implements NodeFilter {
+
+ // JHighlight wraps its highlighted code in a full HTML document. Of
+ // those wrappers:
+ // - html/body are dropped as elements but their descendants are
+ // still emitted (that's where the highlighted code lives).
+ // - head/meta/title/link/style are dropped *with* their subtrees,
+ // otherwise their text (CSS, the filename, etc.) leaks into the
+ // outer XHTML as bare character data.
+ // All of these collide with Tika's outer XHTMLContentHandler when
+ // emitted, producing malformed XHTML downstream.
+ private static final Set WRAPPER_TAGS_DROP_ELEMENT =
+ Set.of("html", "body");
+ private static final Set WRAPPER_TAGS_DROP_SUBTREE =
+ Set.of("head", "meta", "title", "link", "style");
+
boolean ignore = true;
ContentHandler handler;
@@ -181,6 +196,12 @@ public NodeFilter.FilterResult head(Node node, int i) {
if (ignore) {
return FilterResult.CONTINUE;
}
+ if (WRAPPER_TAGS_DROP_SUBTREE.contains(node.nodeName())) {
+ return FilterResult.SKIP_ENTIRELY;
+ }
+ if (WRAPPER_TAGS_DROP_ELEMENT.contains(node.nodeName())) {
+ return FilterResult.CONTINUE;
+ }
if (node instanceof TextNode) {
String txt = ((TextNode) node).getWholeText();
if (txt != null) {
@@ -234,6 +255,10 @@ public NodeFilter.FilterResult tail(Node node, int i) {
if (ignore) {
return FilterResult.CONTINUE;
}
+ if (WRAPPER_TAGS_DROP_ELEMENT.contains(node.nodeName())
+ || WRAPPER_TAGS_DROP_SUBTREE.contains(node.nodeName())) {
+ return FilterResult.CONTINUE;
+ }
if (node instanceof TextNode || node instanceof DataNode) {
return NodeFilter.FilterResult.CONTINUE;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
index 36948b71f89..03023cc0ed4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
@@ -56,7 +56,6 @@ public void testHTMLRenderWithReturnLine() throws Exception {
createMetadata("text/x-java-source")).xml;
assertTrue(htmlContent.indexOf("public") >
0);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index c686414ed10..07c74553386 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -62,6 +62,7 @@
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.SummaryExtractor;
import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLBalancingHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.StringUtils;
@@ -665,11 +666,23 @@ void handleGeneralTextContainingPart(String contentType, String xhtmlClassLabel,
if (relatedPartPart == null) {
continue;
}
+ // Wrap the contentHandler so we can close anything the
+ // inner parser left open if it throws mid-element. Without
+ // this, the closing tag emitted after the loop would land on
+ // top of whatever element(s) the failed sub-parse left open.
+ XHTMLBalancingHandler balancer =
+ new XHTMLBalancingHandler(contentHandler);
try (InputStream stream = relatedPartPart.getInputStream()) {
XMLReaderUtils.parseSAX(stream,
- new EmbeddedContentHandler(contentHandler), context);
+ new EmbeddedContentHandler(balancer), context);
} catch (IOException | TikaException e) {
+ balancer.drainOpenElements();
+ parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
+ } catch (SAXException e) {
+ balancer.drainOpenElements();
+ WriteLimitReachedException.throwIfWriteLimitReached(e);
parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
index a18f52a4d27..1e11e80b78a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -64,6 +64,14 @@ public class OOXMLTikaBodyPartHandler
//...
private int tableCellDepth = 0;
private int pWithinCell = 0;
+ // Stack of structural elements (paragraphs, tables, rows, cells) this
+ // handler has emitted to the xhtml stream and not yet closed. Used by
+ // closeAnyPending() to drain the stack in reverse order so the captured
+ // XHTML stays balanced when a caller's parseSAX call throws part-way.
+ // Formatting tags emitted by FormattingTagManager are not
+ // tracked here -- closeAnyPending closes them via formattingTags.closeAll()
+ // before draining this stack.
+ private final java.util.Deque openStructuralTags = new java.util.ArrayDeque<>();
//will need to replace this with a stack
//if we're marking more that the first level element
@@ -163,6 +171,7 @@ public void startParagraph(ParagraphProperties paragraphProperties) throws SAXEx
} else {
xhtml.startElement(paragraphTag, "class", styleClass);
}
+ openStructuralTags.push(paragraphTag);
}
writeParagraphNumber(paragraphProperties.getNumId(), paragraphProperties.getIlvl(),
@@ -176,6 +185,7 @@ public void endParagraph() throws SAXException {
formattingTags.closeAll();
if (pDepth == 1 && tableDepth == 0) {
xhtml.endElement(paragraphTag);
+ popExpected(paragraphTag);
} else if (tableCellDepth > 0 && pWithinCell > 0) {
xhtml.characters(NEWLINE, 0, 1);
} else if (tableCellDepth == 0) {
@@ -214,10 +224,50 @@ public java.util.Set getEmittedCommentIds() {
return emittedCommentIds;
}
+ /**
+ * Closes any XHTML elements this handler opened but didn't get a chance to
+ * close, in the proper nesting order. Intended ONLY for the catch arm of a
+ * caller that swallowed a {@link SAXException} from the inner SAX parser;
+ * the normal happy-path flow keeps the trackers in sync via endParagraph
+ * / endTableCell / endTableRow / endTable / FormattingTagManager.closeAll.
+ * Without this, swallowed exceptions leave dangling {@code }, {@code },
+ * {@code | }, {@code }, or formatting tags on the wire that
+ * collide with the outer {@code |