diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java index 5852630d0bd..194d863d2d1 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java @@ -40,6 +40,7 @@ import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.SAXOutputConfig; +import org.apache.tika.sax.XHTMLBalancingHandler; /** * Helper class for parsers of package archives or other compound document @@ -163,11 +164,22 @@ public void parseEmbedded( handler.endElement(XHTML, "h1", "h1"); } + // Wrap the delegate's handler so we can close anything it left open if + // it throws mid-element. Without this, the emitted in finally + // could land on top of an open

//etc. from the failed + // sub-parse and produce malformed XHTML. + XHTMLBalancingHandler balancer = + outputHtml ? new XHTMLBalancingHandler(handler) : null; + ContentHandler delegateHandler = outputHtml ? balancer : handler; + // Use the delegate parser to parse this entry + boolean parsedCleanly = false; try { tis.setCloseShield(); - DELEGATING_PARSER.parse(tis, new EmbeddedContentHandler(new BodyContentHandler(handler)), + DELEGATING_PARSER.parse(tis, + new EmbeddedContentHandler(new BodyContentHandler(delegateHandler)), metadata, context); + parsedCleanly = true; } catch (EncryptedDocumentException ede) { recordException(ede, context); } catch (CorruptedFileException e) { @@ -178,10 +190,17 @@ public void parseEmbedded( recordException(e, context); } finally { tis.removeCloseShield(); - } - - if (outputHtml) { - handler.endElement(XHTML, "div", "div"); + if (outputHtml) { + // Only an aborted parse can leave elements open; on a clean parse + // the balancer stack is empty. Draining only on abort keeps the + // package-entry div well-formed when the inner parse throws, while + // letting StrictXHTMLValidator still catch genuine imbalances on the + // happy path (TIKA-4728). + if (!parsedCleanly) { + balancer.drainOpenElements(); + } + handler.endElement(XHTML, "div", "div"); + } } } diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java index 337eba15ab7..53d382b2e02 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java @@ -42,6 +42,7 @@ public class BasicContentHandlerFactory implements StreamingContentHandlerFactor private HANDLER_TYPE type = HANDLER_TYPE.MARKDOWN; private int writeLimit = -1; private boolean throwOnWriteLimitReached = true; + private boolean validateXHTML = false; private transient ParseContext parseContext; /** @@ -83,6 +84,23 @@ public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit, } } + /** + * When true, every handler produced by this factory is wrapped in a + * {@link StrictXHTMLValidator} so that any malformed XHTML event sequence + * emitted by a parser throws a {@link org.xml.sax.SAXException} at the + * offending event. Defaults to false. Intended for catching parser bugs + * in development / corpus scans; off in production for performance and + * because not every caller wants validation failures to surface as + * parser exceptions. + */ + public void setValidateXHTML(boolean validateXHTML) { + this.validateXHTML = validateXHTML; + } + + public boolean isValidateXHTML() { + return validateXHTML; + } + /** * Creates a new BasicContentHandlerFactory configured from OutputLimits in the ParseContext. *

@@ -138,7 +156,10 @@ public static HANDLER_TYPE parseHandlerType(String handlerTypeName, HANDLER_TYPE @Override public ContentHandler createHandler() { + return maybeValidate(createHandlerInner()); + } + private ContentHandler createHandlerInner() { if (type == HANDLER_TYPE.BODY) { return new BodyContentHandler( new WriteOutContentHandler(new ToTextContentHandler(), writeLimit, @@ -154,6 +175,10 @@ public ContentHandler createHandler() { parseContext); } + private ContentHandler maybeValidate(ContentHandler h) { + return validateXHTML ? new StrictXHTMLValidator(h) : h; + } + private ContentHandler getFormatHandler() { switch (type) { case TEXT: @@ -171,7 +196,10 @@ private ContentHandler getFormatHandler() { @Override public ContentHandler createHandler(OutputStream os, Charset charset) { + return maybeValidate(createHandlerInner(os, charset)); + } + private ContentHandler createHandlerInner(OutputStream os, Charset charset) { if (type == HANDLER_TYPE.IGNORE) { return new DefaultHandler(); } @@ -292,6 +320,7 @@ public boolean equals(Object o) { BasicContentHandlerFactory that = (BasicContentHandlerFactory) o; return writeLimit == that.writeLimit && throwOnWriteLimitReached == that.throwOnWriteLimitReached && + validateXHTML == that.validateXHTML && type == that.type; } @@ -300,6 +329,7 @@ public int hashCode() { int result = type != null ? type.hashCode() : 0; result = 31 * result + writeLimit; result = 31 * result + (throwOnWriteLimitReached ? 1 : 0); + result = 31 * result + (validateXHTML ? 1 : 0); return result; } } diff --git a/tika-core/src/main/java/org/apache/tika/sax/StrictXHTMLValidator.java b/tika-core/src/main/java/org/apache/tika/sax/StrictXHTMLValidator.java new file mode 100644 index 00000000000..12cf3b629ff --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/sax/StrictXHTMLValidator.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.sax; + +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.HashSet; +import java.util.Set; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * A SAX content handler decorator that enforces XHTML well-formedness on the + * incoming event stream. Any parser that emits an event sequence that would + * produce malformed XHTML triggers a {@link SAXException} synchronously — the + * stack trace points at the parser code that made the offending call, instead + * of surfacing later as a parse error on the serialized output. + *

+ * Invariants enforced: + *

+ * Use as a decorator wrapping the real handler. It passes every event through + * to the downstream handler after validation, so any normal text/XHTML capture + * still works. + */ +public class StrictXHTMLValidator extends ContentHandlerDecorator { + + private final Deque openElements = new ArrayDeque<>(); + private boolean documentStarted; + private boolean documentEnded; + + public StrictXHTMLValidator(ContentHandler handler) { + super(handler); + } + + @Override + public void startDocument() throws SAXException { + if (documentStarted) { + throw new SAXException("StrictXHTMLValidator: startDocument called twice"); + } + if (documentEnded) { + throw new SAXException( + "StrictXHTMLValidator: startDocument after endDocument"); + } + documentStarted = true; + super.startDocument(); + } + + @Override + public void endDocument() throws SAXException { + if (documentEnded) { + throw new SAXException("StrictXHTMLValidator: endDocument called twice"); + } + if (!openElements.isEmpty()) { + throw new SAXException( + "StrictXHTMLValidator: endDocument with " + openElements.size() + + " unclosed element(s); topmost was <" + + openElements.peek().qOrLocal() + ">"); + } + documentEnded = true; + super.endDocument(); + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes attrs) + throws SAXException { + ensureNotEnded("startElement <" + display(qName, localName) + ">"); + checkAttributesUnique(qName, localName, attrs); + openElements.push(new QName(uri, localName, qName)); + super.startElement(uri, localName, qName, attrs); + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + ensureNotEnded("endElement "); + if (openElements.isEmpty()) { + throw new SAXException( + "StrictXHTMLValidator: endElement with no matching startElement"); + } + QName top = openElements.pop(); + if (!top.matches(uri, localName, qName)) { + throw new SAXException( + "StrictXHTMLValidator: endElement does not match topmost open element <" + + top.qOrLocal() + ">"); + } + super.endElement(uri, localName, qName); + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + ensureNotEnded("characters"); + super.characters(ch, start, length); + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + ensureNotEnded("ignorableWhitespace"); + super.ignorableWhitespace(ch, start, length); + } + + @Override + public void processingInstruction(String target, String data) throws SAXException { + ensureNotEnded("processingInstruction"); + super.processingInstruction(target, data); + } + + @Override + public void startPrefixMapping(String prefix, String uri) throws SAXException { + ensureNotEnded("startPrefixMapping"); + super.startPrefixMapping(prefix, uri); + } + + @Override + public void endPrefixMapping(String prefix) throws SAXException { + ensureNotEnded("endPrefixMapping"); + super.endPrefixMapping(prefix); + } + + @Override + public void skippedEntity(String name) throws SAXException { + ensureNotEnded("skippedEntity"); + super.skippedEntity(name); + } + + private void ensureNotEnded(String event) throws SAXException { + if (documentEnded) { + throw new SAXException( + "StrictXHTMLValidator: " + event + " arrived after endDocument"); + } + } + + private void checkAttributesUnique(String elementQName, String elementLocalName, + Attributes attrs) throws SAXException { + int n = attrs.getLength(); + if (n < 2) { + return; + } + // (uri, localName) pairs must be unique per the XML namespaces spec. + // We also check raw qnames because Tika's serializers emit by qname and + // duplicate qnames produce malformed XHTML even when localnames differ. + Set seenUriLocal = new HashSet<>(n); + Set seenQNames = new HashSet<>(n); + for (int i = 0; i < n; i++) { + String uri = nullSafe(attrs.getURI(i)); + String local = nullSafe(attrs.getLocalName(i)); + String qn = nullSafe(attrs.getQName(i)); + // U+0001 cannot appear in a valid XML uri/localName, so it joins the + // two unambiguously without risk of a key collision. + String key = uri + "\u0001" + local; + if (!seenUriLocal.add(key)) { + throw new SAXException( + "StrictXHTMLValidator: duplicate attribute on <" + + display(elementQName, elementLocalName) + ">: " + + (uri.isEmpty() ? local : ("{" + uri + "}" + local))); + } + if (!qn.isEmpty() && !seenQNames.add(qn)) { + throw new SAXException( + "StrictXHTMLValidator: duplicate attribute qname on <" + + display(elementQName, elementLocalName) + ">: " + qn); + } + } + } + + private static String nullSafe(String s) { + return s == null ? "" : s; + } + + private static String display(String qName, String localName) { + if (qName != null && !qName.isEmpty()) { + return qName; + } + return localName == null ? "" : localName; + } + + private static final class QName { + final String uri; + final String localName; + final String qName; + + QName(String uri, String localName, String qName) { + this.uri = nullSafe(uri); + this.localName = nullSafe(localName); + this.qName = nullSafe(qName); + } + + boolean matches(String u, String l, String q) { + // SAX parsers can vary in which fields they populate. Accept a + // match on either (uri, localName) or qName, whichever is present. + String otherU = nullSafe(u); + String otherL = nullSafe(l); + String otherQ = nullSafe(q); + boolean uriLocalMatch = uri.equals(otherU) && localName.equals(otherL) + && !localName.isEmpty(); + boolean qNameMatch = !qName.isEmpty() && qName.equals(otherQ); + return uriLocalMatch || qNameMatch; + } + + String qOrLocal() { + return qName.isEmpty() ? localName : qName; + } + } +} diff --git a/tika-core/src/main/java/org/apache/tika/sax/XHTMLBalancingHandler.java b/tika-core/src/main/java/org/apache/tika/sax/XHTMLBalancingHandler.java new file mode 100644 index 00000000000..14a0a9e9bcd --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/sax/XHTMLBalancingHandler.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.sax; + +import java.util.ArrayDeque; +import java.util.Deque; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * SAX decorator that tracks open elements so a parser can recover well-formed + * XHTML when an exception interrupts the SAX stream mid-element. + *

+ * The decorator is a thin passthrough on the happy path: it pushes and pops an + * internal stack on {@code startElement}/{@code endElement} and otherwise forwards + * every event to the wrapped handler unchanged. It deliberately does NOT mask + * bad event sequences (mismatched or excess endElement, duplicate attributes, + * etc.) -- those remain visible to {@link StrictXHTMLValidator} so parser bugs + * still surface as test failures. + *

+ * The unhappy path -- a per-part SAX parser throwing mid-element after emitting + * one or more start tags -- is handled via {@link #drainOpenElements()}, which + * emits a matching {@code endElement} (with the original uri/localName/qName) + * for every element still on the stack, in reverse open order. The wrapped + * handler is left in a well-formed state with no dangling elements from the + * failed sub-parse. + *

+ * Typical use wraps the handler that receives events from an inner SAX parser, + * inside the catch arm that swallows the inner parser's exception: + *

{@code
+ * XHTMLBalancingHandler balancer = new XHTMLBalancingHandler(contentHandler);
+ * try {
+ *     XMLReaderUtils.parseSAX(stream, new EmbeddedContentHandler(balancer), context);
+ * } catch (SAXException e) {
+ *     balancer.drainOpenElements();
+ *     // ... log and continue ...
+ * }
+ * }
+ * This handler does not touch {@code startDocument}/{@code endDocument}; the + * caller still owns the document lifecycle. + */ +public class XHTMLBalancingHandler extends ContentHandlerDecorator { + + private final Deque openElements = new ArrayDeque<>(); + + public XHTMLBalancingHandler(ContentHandler handler) { + super(handler); + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes attrs) + throws SAXException { + openElements.push(new QName(uri, localName, qName)); + super.startElement(uri, localName, qName, attrs); + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + // Pop best-effort: an unbalanced endElement (e.g., emitted after the + // matching startElement was swallowed) still forwards downstream so a + // wrapping StrictXHTMLValidator sees the violation. + if (!openElements.isEmpty()) { + openElements.pop(); + } + super.endElement(uri, localName, qName); + } + + /** + * Emits a matching {@code endElement} for every element still on the open + * stack, in reverse open order. After this call the stack is empty. + *

+ * Intended for the catch arm of a caller that swallowed a + * {@link SAXException} from an inner SAX parser: the inner parser may have + * left one or more elements open mid-stream, and downstream serialization + * needs matching closers before any further events. + *

+ * Does NOT emit {@code endDocument} -- document lifecycle stays with the + * caller. + */ + public void drainOpenElements() throws SAXException { + while (!openElements.isEmpty()) { + QName q = openElements.pop(); + super.endElement(q.uri, q.localName, q.qName); + } + } + + /** + * Number of elements currently open through this handler. Exposed for + * tests and for callers that want to know whether + * {@link #drainOpenElements()} would emit anything. + */ + public int openElementCount() { + return openElements.size(); + } + + private static final class QName { + final String uri; + final String localName; + final String qName; + + QName(String uri, String localName, String qName) { + this.uri = uri == null ? "" : uri; + this.localName = localName == null ? "" : localName; + this.qName = qName == null ? "" : qName; + } + } +} diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index a8e37a85b2e..91f3c236c58 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -58,6 +58,7 @@ import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.RecursiveParserWrapperHandler; +import org.apache.tika.sax.StrictXHTMLValidator; import org.apache.tika.sax.ToXMLContentHandler; /** @@ -66,6 +67,19 @@ public abstract class TikaTest { protected static Parser AUTO_DETECT_PARSER = new AutoDetectParser(); + + /** + * Build a {@link BasicContentHandlerFactory} with the strict XHTML + * validator enabled so that any malformed XHTML emitted during a + * recursive parse throws at the offending SAX event. All TikaTest + * helpers route through this so individual tests don't have to opt in. + */ + private static BasicContentHandlerFactory validatingFactory( + BasicContentHandlerFactory.HANDLER_TYPE type) { + BasicContentHandlerFactory f = new BasicContentHandlerFactory(type, -1); + f.setValidateXHTML(true); + return f; + } public static void assertContainsCount(String needle, String haystack, int targetCount) { int i = haystack.indexOf(needle); int count = 0; @@ -330,9 +344,13 @@ protected XMLResult getXML(TikaInputStream input, Parser parser, Metadata metada } try (input) { - ContentHandler handler = new ToXMLContentHandler(); + ToXMLContentHandler xml = new ToXMLContentHandler(); + // Wrap with the strict validator so parsers that emit malformed + // XHTML (duplicate attrs, unclosed tags, cross-nested elements) + // throw at the offending SAX event with a parser stack frame. + ContentHandler handler = new StrictXHTMLValidator(xml); parser.parse(input, handler, metadata, context); - return new XMLResult(handler.toString(), metadata); + return new XMLResult(xml.toString(), metadata); } } @@ -462,7 +480,7 @@ protected List getRecursiveMetadata(TikaInputStream tis, Parser p, Met throws Exception { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + validatingFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML)); try (tis) { wrapper.parse(tis, handler, metadata, context); } catch (Exception e) { @@ -479,7 +497,7 @@ protected List getRecursiveMetadata(TikaInputStream tis, Parser p, Met throws Exception { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(handlerType, -1)); + validatingFactory(handlerType)); try (tis) { wrapper.parse(tis, handler, metadata, context); } catch (Exception e) { @@ -495,7 +513,7 @@ protected List getRecursiveMetadata(String filePath, ParseContext cont RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + validatingFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML)); Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath)); try (TikaInputStream tis = getResourceAsStream("/test-documents/" + filePath)) { @@ -523,7 +541,7 @@ protected List getRecursiveMetadata(String filePath, Parser parserToWr ParseContext context) throws Exception { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap); RecursiveParserWrapperHandler handler = - new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(handlerType, -1)); + new RecursiveParserWrapperHandler(validatingFactory(handlerType)); Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath)); try (TikaInputStream tis = getResourceAsStream("/test-documents/" + filePath)) { @@ -536,7 +554,7 @@ protected List getRecursiveMetadata(String filePath, Parser parserToWr ParseContext parseContext) throws Exception { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + validatingFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML)); try (TikaInputStream tis = getResourceAsStream("/test-documents/" + filePath)) { wrapper.parse(tis, handler, new Metadata(), parseContext); @@ -578,7 +596,8 @@ protected String getText(String filePath, Parser parser, Metadata metadata, */ public String getText(TikaInputStream tis, Parser parser, ParseContext context, Metadata metadata) throws Exception { - ContentHandler handler = new BodyContentHandler(1000000); + BodyContentHandler body = new BodyContentHandler(1000000); + ContentHandler handler = new StrictXHTMLValidator(body); try (tis) { parser.parse(tis, handler, metadata, context); } catch (SAXException e) { @@ -586,7 +605,7 @@ public String getText(TikaInputStream tis, Parser parser, ParseContext context, throw e; } } - return handler.toString(); + return body.toString(); } public String getText(TikaInputStream tis, Parser parser, Metadata metadata) throws Exception { diff --git a/tika-core/src/test/java/org/apache/tika/sax/XHTMLBalancingHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/XHTMLBalancingHandlerTest.java new file mode 100644 index 00000000000..3688616c421 --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/sax/XHTMLBalancingHandlerTest.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.sax; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import org.junit.jupiter.api.Test; +import org.xml.sax.helpers.AttributesImpl; + +public class XHTMLBalancingHandlerTest { + + private static AttributesImpl noAtts() { + return new AttributesImpl(); + } + + @Test + public void happyPathIsPassthrough() throws Exception { + ToXMLContentHandler out = new ToXMLContentHandler(); + XHTMLBalancingHandler balancer = new XHTMLBalancingHandler(out); + + balancer.startDocument(); + balancer.startElement("", "p", "p", noAtts()); + balancer.characters("hello".toCharArray(), 0, 5); + balancer.endElement("", "p", "p"); + balancer.endDocument(); + + assertEquals(0, balancer.openElementCount()); + // No drain needed -- stack should already be empty. + balancer.drainOpenElements(); + // ToXMLContentHandler.toString() returns the serialized form. + String xml = out.toString(); + assertEquals(true, xml.contains("

hello

"), xml); + } + + @Test + public void drainClosesElementsInReverseOpenOrder() throws Exception { + ToXMLContentHandler out = new ToXMLContentHandler(); + XHTMLBalancingHandler balancer = new XHTMLBalancingHandler(out); + + balancer.startDocument(); + balancer.startElement("", "div", "div", noAtts()); + balancer.startElement("", "p", "p", noAtts()); + balancer.startElement("", "span", "span", noAtts()); + balancer.characters("oops".toCharArray(), 0, 4); + + // Simulate exception mid-element: caller drains. + assertEquals(3, balancer.openElementCount()); + balancer.drainOpenElements(); + assertEquals(0, balancer.openElementCount()); + + balancer.endDocument(); + + String xml = out.toString(); + // Expect

in that order. + int spanIdx = xml.indexOf(""); + int pIdx = xml.indexOf("

"); + int divIdx = xml.indexOf(""); + assertEquals(true, spanIdx >= 0 && pIdx > spanIdx && divIdx > pIdx, + "expected

order, got: " + xml); + } + + @Test + public void drainEmitsMatchingUriAndQName() throws Exception { + // Verifies the Copilot review point: endElement must carry the same + // (uri, localName, qName) tuple as the matching startElement. + ToXMLContentHandler out = new ToXMLContentHandler(); + XHTMLBalancingHandler balancer = new XHTMLBalancingHandler(out); + + balancer.startDocument(); + balancer.startPrefixMapping("h", "http://example.com/ns"); + balancer.startElement("http://example.com/ns", "wrap", "h:wrap", noAtts()); + // Emit content so the serializer can't collapse to a self-closing tag, + // forcing the close form to be explicit -- proves drainOpen used the + // matching qName ("h:wrap") rather than just the local name. + balancer.characters("x".toCharArray(), 0, 1); + balancer.drainOpenElements(); + balancer.endPrefixMapping("h"); + balancer.endDocument(); + + String xml = out.toString(); + assertEquals(true, xml.contains(""), + "expected qualified close , got: " + xml); + } + + @Test + public void drainIsIdempotent() throws Exception { + ToXMLContentHandler out = new ToXMLContentHandler(); + XHTMLBalancingHandler balancer = new XHTMLBalancingHandler(out); + + balancer.startDocument(); + balancer.startElement("", "p", "p", noAtts()); + balancer.drainOpenElements(); + balancer.drainOpenElements(); // second call: no-op + balancer.endDocument(); + assertEquals(0, balancer.openElementCount()); + } + + @Test + public void downstreamValidatorStillCatchesMismatchedEndElement() throws Exception { + // Balancer must NOT silently fix bad happy-path sequences -- the + // StrictXHTMLValidator wrapping the real handler must still see (and + // reject) excess endElement events. + ToXMLContentHandler out = new ToXMLContentHandler(); + StrictXHTMLValidator validator = new StrictXHTMLValidator(out); + XHTMLBalancingHandler balancer = new XHTMLBalancingHandler(validator); + + balancer.startDocument(); + balancer.startElement("", "p", "p", noAtts()); + balancer.endElement("", "p", "p"); + // Extra endElement: stack is empty so balancer pops nothing, but the + // event still flows downstream to the validator, which must throw. + assertThrows(org.xml.sax.SAXException.class, + () -> balancer.endElement("", "p", "p")); + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java index 1329b8db068..2298983d1fd 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java @@ -59,7 +59,9 @@ class PagesContentHandler extends DefaultHandler { @Override public void endDocument() throws SAXException { metadata.set(Office.PAGE_COUNT, String.valueOf(pageCount)); - if (pageCount > 0) { + // Either sf:page-start or sl:page-group opens a
; close the + // last open one regardless of which counter tracked it. + if (pageCount + slPageCount > 0) { doFooter(); xhtml.endElement("div"); } @@ -85,7 +87,17 @@ public void startElement(String uri, String localName, String qName, Attributes } else if ("sf:metadata".equals(qName)) { inPart = DocumentPart.METADATA; } else if ("sf:page-start".equals(qName) || "sl:page-group".equals(qName)) { - if (pageCount > 0) { + // sf:p paragraphs can span page boundaries in iWork's XML schema. + // If a

is still open when the page changes, force-close it + // before the page

and reopen it in the new page so both + // pages have balanced tag pairs. + boolean reopenP = inPart == DocumentPart.PARSABLE_TEXT; + if (reopenP) { + xhtml.endElement("p"); + } + // Use the combined counter so we close the prior
regardless + // of whether it was opened by sf:page-start or sl:page-group. + if (pageCount + slPageCount > 0) { doFooter(); xhtml.endElement("div"); } @@ -96,6 +108,9 @@ public void startElement(String uri, String localName, String qName, Attributes pageCount++; } doHeader(); + if (reopenP) { + xhtml.startElement("p"); + } } else if ("sf:p".equals(qName)) { if (pageCount + slPageCount > 0) { inPart = DocumentPart.PARSABLE_TEXT; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java index 676575ce405..5aeb992188e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java @@ -77,53 +77,60 @@ public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata ParseContext context) throws IOException, SAXException, TikaException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); - Last5 l5 = new Last5(); - int read; + xhtml.startDocument(); + // try/finally so endDocument fires even if a header read, characters + // emit, or recursive helper throws IOException/SAXException/TikaException + // mid-parse. Without this the captured XHTML would be left unterminated. + try { + Last5 l5 = new Last5(); + int read; - // Try to get the creation date, which is YYYYMMDDhhmm - byte[] header = new byte[30]; - IOUtils.readFully(tis, header); - byte[] date = new byte[12]; - IOUtils.readFully(tis, date); + // Try to get the creation date, which is YYYYMMDDhhmm + byte[] header = new byte[30]; + IOUtils.readFully(tis, header); + byte[] date = new byte[12]; + IOUtils.readFully(tis, date); - String dateStr = new String(date, US_ASCII); - if (dateStr.startsWith("19") || dateStr.startsWith("20")) { - String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4, 6) + "-" + - dateStr.substring(6, 8) + "T" + dateStr.substring(8, 10) + ":" + - dateStr.substring(10, 12) + ":00"; - metadata.set(TikaCoreProperties.CREATED, formattedDate); - // TODO Metadata.DATE is used as modified, should it be here? - metadata.set(TikaCoreProperties.CREATED, formattedDate); - } - metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE); + String dateStr = new String(date, US_ASCII); + if (dateStr.startsWith("19") || dateStr.startsWith("20")) { + String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4, 6) + "-" + + dateStr.substring(6, 8) + "T" + dateStr.substring(8, 10) + ":" + + dateStr.substring(10, 12) + ":00"; + metadata.set(TikaCoreProperties.CREATED, formattedDate); + metadata.set(TikaCoreProperties.MODIFIED, formattedDate); + } + metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE); - // The description, if set, is the next up-to-500 bytes - byte[] desc = new byte[500]; - IOUtils.readFully(tis, desc); - String description = extractText(desc, true); - if (description.length() > 0) { - metadata.set(TikaCoreProperties.DESCRIPTION, description); - } + // The description, if set, is the next up-to-500 bytes + byte[] desc = new byte[500]; + IOUtils.readFully(tis, desc); + String description = extractText(desc, true); + if (description.length() > 0) { + metadata.set(TikaCoreProperties.DESCRIPTION, description); + } - // Now look for text - while ((read = tis.read()) > -1) { - if (read == 0xe0 || read == 0xe3 || read == 0xf0) { - int nread = tis.read(); - if (nread == 0x3f || nread == 0xbf) { - // Looks promising, check back for a suitable value - if (read == 0xe3 && nread == 0x3f) { - if (l5.is33()) { - // Bingo, note text - handleNoteText(tis, xhtml); + // Now look for text + while ((read = tis.read()) > -1) { + if (read == 0xe0 || read == 0xe3 || read == 0xf0) { + int nread = tis.read(); + if (nread == 0x3f || nread == 0xbf) { + // Looks promising, check back for a suitable value + if (read == 0xe3 && nread == 0x3f) { + if (l5.is33()) { + // Bingo, note text + handleNoteText(tis, xhtml); + } + } else if (l5.is00()) { + // Likely view name + handleViewName(read, nread, tis, xhtml, l5); } - } else if (l5.is00()) { - // Likely view name - handleViewName(read, nread, tis, xhtml, l5); } + } else { + l5.record(read); } - } else { - l5.record(read); } + } finally { + xhtml.endDocument(); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java index e5ef13da29f..27c0b92efb2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java @@ -165,6 +165,21 @@ private String parserAuthor(String line) { } private static class TikaNodeFilter implements NodeFilter { + + // JHighlight wraps its highlighted code in a full HTML document. Of + // those wrappers: + // - html/body are dropped as elements but their descendants are + // still emitted (that's where the highlighted code lives). + // - head/meta/title/link/style are dropped *with* their subtrees, + // otherwise their text (CSS, the filename, etc.) leaks into the + // outer XHTML as bare character data. + // All of these collide with Tika's outer XHTMLContentHandler when + // emitted, producing malformed XHTML downstream. + private static final Set WRAPPER_TAGS_DROP_ELEMENT = + Set.of("html", "body"); + private static final Set WRAPPER_TAGS_DROP_SUBTREE = + Set.of("head", "meta", "title", "link", "style"); + boolean ignore = true; ContentHandler handler; @@ -181,6 +196,12 @@ public NodeFilter.FilterResult head(Node node, int i) { if (ignore) { return FilterResult.CONTINUE; } + if (WRAPPER_TAGS_DROP_SUBTREE.contains(node.nodeName())) { + return FilterResult.SKIP_ENTIRELY; + } + if (WRAPPER_TAGS_DROP_ELEMENT.contains(node.nodeName())) { + return FilterResult.CONTINUE; + } if (node instanceof TextNode) { String txt = ((TextNode) node).getWholeText(); if (txt != null) { @@ -234,6 +255,10 @@ public NodeFilter.FilterResult tail(Node node, int i) { if (ignore) { return FilterResult.CONTINUE; } + if (WRAPPER_TAGS_DROP_ELEMENT.contains(node.nodeName()) + || WRAPPER_TAGS_DROP_SUBTREE.contains(node.nodeName())) { + return FilterResult.CONTINUE; + } if (node instanceof TextNode || node instanceof DataNode) { return NodeFilter.FilterResult.CONTINUE; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java index 36948b71f89..03023cc0ed4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java @@ -56,7 +56,6 @@ public void testHTMLRenderWithReturnLine() throws Exception { createMetadata("text/x-java-source")).xml; assertTrue(htmlContent.indexOf("public") > 0); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index c686414ed10..07c74553386 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -62,6 +62,7 @@ import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.microsoft.SummaryExtractor; import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.XHTMLBalancingHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.ExceptionUtils; import org.apache.tika.utils.StringUtils; @@ -665,11 +666,23 @@ void handleGeneralTextContainingPart(String contentType, String xhtmlClassLabel, if (relatedPartPart == null) { continue; } + // Wrap the contentHandler so we can close anything the + // inner parser left open if it throws mid-element. Without + // this, the
emitted after the loop would land on + // top of an open

/

}, {@code
/etc. from the failed sub-parse. + XHTMLBalancingHandler balancer = + new XHTMLBalancingHandler(contentHandler); try (InputStream stream = relatedPartPart.getInputStream()) { XMLReaderUtils.parseSAX(stream, - new EmbeddedContentHandler(contentHandler), context); + new EmbeddedContentHandler(balancer), context); } catch (IOException | TikaException e) { + balancer.drainOpenElements(); + parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, + ExceptionUtils.getStackTrace(e)); + } catch (SAXException e) { + balancer.drainOpenElements(); + WriteLimitReachedException.throwIfWriteLimitReached(e); parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java index a18f52a4d27..1e11e80b78a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java @@ -64,6 +64,14 @@ public class OOXMLTikaBodyPartHandler //

... private int tableCellDepth = 0; private int pWithinCell = 0; + // Stack of structural elements (paragraphs, tables, rows, cells) this + // handler has emitted to the xhtml stream and not yet closed. Used by + // closeAnyPending() to drain the stack in reverse order so the captured + // XHTML stays balanced when a caller's parseSAX call throws part-way. + // Tags emitted by FormattingTagManager (////) are not + // tracked here -- closeAnyPending closes them via formattingTags.closeAll() + // before draining this stack. + private final java.util.Deque openStructuralTags = new java.util.ArrayDeque<>(); //will need to replace this with a stack //if we're marking more that the first level

element @@ -163,6 +171,7 @@ public void startParagraph(ParagraphProperties paragraphProperties) throws SAXEx } else { xhtml.startElement(paragraphTag, "class", styleClass); } + openStructuralTags.push(paragraphTag); } writeParagraphNumber(paragraphProperties.getNumId(), paragraphProperties.getIlvl(), @@ -176,6 +185,7 @@ public void endParagraph() throws SAXException { formattingTags.closeAll(); if (pDepth == 1 && tableDepth == 0) { xhtml.endElement(paragraphTag); + popExpected(paragraphTag); } else if (tableCellDepth > 0 && pWithinCell > 0) { xhtml.characters(NEWLINE, 0, 1); } else if (tableCellDepth == 0) { @@ -214,10 +224,50 @@ public java.util.Set getEmittedCommentIds() { return emittedCommentIds; } + /** + * Closes any XHTML elements this handler opened but didn't get a chance to + * close, in the proper nesting order. Intended ONLY for the catch arm of a + * caller that swallowed a {@link SAXException} from the inner SAX parser; + * the normal happy-path flow keeps the trackers in sync via endParagraph + * / endTableCell / endTableRow / endTable / FormattingTagManager.closeAll. + * Without this, swallowed exceptions leave dangling {@code

}, {@code

}, + * {@code
}, or formatting tags on the wire that + * collide with the outer {@code }. + */ + public void closeAnyPending() throws SAXException { + formattingTags.closeAll(); + // Drain the structural-element stack in reverse open order. This + // handles nested tables correctly (multiple cells/rows/tables + // interleaved), unlike per-element counters which lose nesting info. + while (!openStructuralTags.isEmpty()) { + String tag = openStructuralTags.pop(); + xhtml.endElement(tag); + } + // Reset internal depth/state so subsequent emits start clean. + tableDepth = 0; + tableCellDepth = 0; + pDepth = 0; + pWithinCell = 0; + } + + /** + * Pops {@code openStructuralTags} expecting the given tag on top. + * If the stack is empty or the top differs, this is a no-op rather than a + * throw -- the stack is best-effort tracking for closeAnyPending(), and + * the existing happy-path tests (which don't trigger closeAnyPending) must + * not be perturbed by stack tracking bugs. + */ + private void popExpected(String tag) { + if (!openStructuralTags.isEmpty() && tag.equals(openStructuralTags.peek())) { + openStructuralTags.pop(); + } + } + @Override public void startTable() throws SAXException { xhtml.startElement("table"); + openStructuralTags.push("table"); tableDepth++; } @@ -226,6 +276,7 @@ public void startTable() throws SAXException { public void endTable() throws SAXException { xhtml.endElement("table"); + popExpected("table"); tableDepth--; } @@ -233,22 +284,26 @@ public void endTable() throws SAXException { @Override public void startTableRow() throws SAXException { xhtml.startElement("tr"); + openStructuralTags.push("tr"); } @Override public void endTableRow() throws SAXException { xhtml.endElement("tr"); + popExpected("tr"); } @Override public void startTableCell() throws SAXException { xhtml.startElement("td"); + openStructuralTags.push("td"); tableCellDepth++; } @Override public void endTableCell() throws SAXException { xhtml.endElement("td"); + popExpected("td"); pWithinCell = 0; tableCellDepth--; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java index a93bfd7d081..0fb05407a51 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java @@ -271,6 +271,13 @@ public void startElement(String uri, String localName, String qName, Attributes runBuffer.append(TAB_CHAR); } else if (P.equals(localName)) { lastStartElementWasP = true; + // Each needs its own pStarted lifecycle. Without this, + // a nested (e.g., inside /) would + // inherit the outer paragraph's pStarted=true, suppress its own + // startParagraph in the branch, then fire its + // endParagraph on -- producing an unbalanced start/end + // count that desyncs the XHTML

/

stream. + pStarted = false; } else if (B.equals(localName)) { //TODO: add bCs if (inR && inRPr) { currRunProperties.setBold(true); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java index b34a1ffc77e..0df8ccaa07d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java @@ -35,6 +35,7 @@ import org.xml.sax.SAXException; import org.apache.tika.exception.TikaException; +import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; @@ -189,9 +190,10 @@ private int handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml) int hidden = 0; xhtml.startElement("div", "class", "slide-content"); + OOXMLTikaBodyPartHandler bodyHandler = new OOXMLTikaBodyPartHandler(xhtml, metadata); try (InputStream stream = slidePart.getInputStream()) { - OOXMLWordAndPowerPointTextHandler wordAndPPTHandler = new OOXMLWordAndPowerPointTextHandler( - new OOXMLTikaBodyPartHandler(xhtml, metadata), linkedRelationships); + OOXMLWordAndPowerPointTextHandler wordAndPPTHandler = + new OOXMLWordAndPowerPointTextHandler(bodyHandler, linkedRelationships); XMLReaderUtils.parseSAX(stream, new EmbeddedContentHandler(wordAndPPTHandler), context); if (wordAndPPTHandler.isHiddenSlide()) { @@ -201,9 +203,19 @@ private int handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml) if (wordAndPPTHandler.hasAnimations()) { metadata.set(Office.HAS_ANIMATIONS, true); } + } catch (SAXException e) { + // Truncated/malformed slide XML can leave the body handler with + // unclosed

,

,
, etc. on the wire. Close them before the + // below so subsequent slides -- and the outer -- + // land in a balanced spot. + WriteLimitReachedException.throwIfWriteLimitReached(e); + metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, + ExceptionUtils.getStackTrace(e)); + bodyHandler.closeAnyPending(); } catch (TikaException | IOException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); + bodyHandler.closeAnyPending(); } xhtml.endElement("div"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java index de3ff518359..9da2352c19c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java @@ -381,9 +381,15 @@ private OOXMLTikaBodyPartHandler handlePart(PackagePart packagePart, WriteLimitReachedException.throwIfWriteLimitReached(e); metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); + // The partial parse may have left

,

,
, or + // formatting tags open on the XHTML stream. Close them now so + // subsequent parts -- and the outer -- land in a + // balanced spot. + bodyHandler.closeAnyPending(); } catch (TikaException | IOException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); + bodyHandler.closeAnyPending(); } Map partMetadata = bodyHandler.getEmbeddedPartMetadataMap(); resolveEmfNames(packagePart, partMetadata); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java index d968cdb8562..1388edc3a1e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java @@ -220,6 +220,10 @@ protected void buildXHTML(XHTMLContentHandler xhtml) WriteLimitReachedException.throwIfWriteLimitReached(e); metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); + // Balance any /
left open by the partial parse so + // the
emitted below land in the + // right place. + sheetExtractor.closeAnyPending(); } try { getThreadedComments(container, sheetPart, xhtml); @@ -967,6 +971,12 @@ protected static class SheetTextAsHTML private XHTMLContentHandler xhtml; private int lastSeenRow = -1; private int lastSeenCol = -1; + // Track open / so the outer catch can emit balanced closes + // when processSheet throws part-way through a row (e.g., a malformed + // sheet XML). Without this, the outer code would emit + // while (or ) was still on the stack, producing malformed XHTML. + private boolean rowOpen; + private boolean cellOpen; protected SheetTextAsHTML(OfficeParserConfig config, XHTMLContentHandler xhtml) { this.includeHeadersFooters = config.isIncludeHeadersAndFooters(); @@ -982,14 +992,19 @@ public void startRow(int rowNum) { if (includeMissingRows && rowNum > (lastSeenRow + 1)) { for (int rn = lastSeenRow + 1; rn < rowNum; rn++) { xhtml.startElement("tr"); + rowOpen = true; xhtml.startElement("td"); + cellOpen = true; xhtml.endElement("td"); + cellOpen = false; xhtml.endElement("tr"); + rowOpen = false; } } // Start the new row xhtml.startElement("tr"); + rowOpen = true; lastSeenCol = -1; } catch (SAXException e) { //swallow @@ -1001,11 +1016,28 @@ public void startRow(int rowNum) { public void endRow(int rowNum) { try { xhtml.endElement("tr"); + rowOpen = false; } catch (SAXException e) { throw new RuntimeSAXException(e); } } + /** + * Closes any pending {@code } or {@code } that was opened + * before a {@link SAXException} interrupted sheet processing. Safe to + * call when nothing is open. + */ + void closeAnyPending() throws SAXException { + if (cellOpen) { + xhtml.endElement("td"); + cellOpen = false; + } + if (rowOpen) { + xhtml.endElement("tr"); + rowOpen = false; + } + } + public void cell(String cellRef, String formattedValue, XSSFCommentsShim.CommentData comment) { try { @@ -1014,12 +1046,15 @@ public void cell(String cellRef, String formattedValue, (cellRef == null) ? lastSeenCol + 1 : (new CellReference(cellRef)).getCol(); for (int cn = lastSeenCol + 1; cn < colNum; cn++) { xhtml.startElement("td"); + cellOpen = true; xhtml.endElement("td"); + cellOpen = false; } lastSeenCol = colNum; // Start this cell xhtml.startElement("td"); + cellOpen = true; // Main cell contents if (formattedValue != null) { @@ -1036,6 +1071,7 @@ public void cell(String cellRef, String formattedValue, } xhtml.endElement("td"); + cellOpen = false; } catch (SAXException e) { throw new RuntimeSAXException(e); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java index 4460946c521..9d2e2239be6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java @@ -59,8 +59,8 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.xml.DcXMLParser; import org.apache.tika.sax.BodyContentHandler; -import org.apache.tika.sax.ContentHandlerDecorator; import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.XHTMLBalancingHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.XMLReaderUtils; @@ -114,11 +114,12 @@ public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); xhtml.startDocument(); IOException caughtException = null; - ContentHandler childHandler = new EmbeddedContentHandler( - new EpubNormalizingHandler(new BodyContentHandler(xhtml))); + EpubNormalizingHandler normalizer = + new EpubNormalizingHandler(new BodyContentHandler(xhtml)); + ContentHandler childHandler = new EmbeddedContentHandler(normalizer); Set encryptedItems = Collections.EMPTY_SET; try { - encryptedItems = bufferedParse(tis, childHandler, xhtml, metadata, context); + encryptedItems = bufferedParse(tis, childHandler, normalizer, xhtml, metadata, context); } catch (IOException e) { caughtException = e; } @@ -141,19 +142,22 @@ private void updateMimeType(InputStream is, Metadata metadata) throws IOExceptio } private Set bufferedParse(TikaInputStream tis, ContentHandler bodyHandler, + EpubNormalizingHandler normalizer, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException { // DefaultZipContainerDetector opens (and salvages, if needed) the ZipFile and // stashes it on the TikaInputStream. Reuse it when present; otherwise open ourselves. if (tis.getOpenContainer() instanceof ZipFile) { - return bufferedParseZipFile((ZipFile) tis.getOpenContainer(), bodyHandler, xhtml, metadata, context); + return bufferedParseZipFile((ZipFile) tis.getOpenContainer(), bodyHandler, + normalizer, xhtml, metadata, context); } try (ZipFile zipFile = ZipFile.builder().setFile(tis.getPath().toFile()).get()) { - return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context); + return bufferedParseZipFile(zipFile, bodyHandler, normalizer, xhtml, metadata, context); } } private Set bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHandler, + EpubNormalizingHandler normalizer, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException { @@ -167,7 +171,7 @@ private Set bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHan // emit partial content (matching 3.x's streamingParse contract), // then throw to signal the result is incomplete. LOG.trace("epub fallback: rootOPF=null, streaming all html entries"); - return fallbackParseAllHtmlEntries(zipFile, bodyHandler, metadata, context, + return fallbackParseAllHtmlEntries(zipFile, bodyHandler, normalizer, metadata, context, "no OPF found in (possibly truncated) container"); } ZipArchiveEntry zae = zipFile.getEntry(rootOPF); @@ -175,7 +179,7 @@ private Set bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHan zae, zae == null ? "n/a" : zipFile.canReadEntryData(zae)); if (zae == null || !zipFile.canReadEntryData(zae)) { LOG.trace("epub fallback: OPF entry missing/unreadable, streaming all html entries"); - return fallbackParseAllHtmlEntries(zipFile, bodyHandler, metadata, context, + return fallbackParseAllHtmlEntries(zipFile, bodyHandler, normalizer, metadata, context, "OPF entry missing or unreadable in (possibly truncated) container"); } try (TikaInputStream tis = TikaInputStream.get(zipFile.getInputStream(zae))) { @@ -191,7 +195,7 @@ private Set bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHan contentOrderScraper.locationMap.size()); if (contentOrderScraper.contentItems.isEmpty()) { LOG.trace("epub fallback: empty spine, streaming all html entries"); - return fallbackParseAllHtmlEntries(zipFile, bodyHandler, metadata, context, + return fallbackParseAllHtmlEntries(zipFile, bodyHandler, normalizer, metadata, context, "OPF declared no spine items in (possibly truncated) container"); } String relativePath = ""; @@ -236,6 +240,12 @@ private Set bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHan throw e; } saxExceptions.add(e); + // The aborted spine item may have left , + // ,

, etc. open on the wire. Close them + // before the next item (or the outer ) + // emits, otherwise the validator sees cross- + // nested events. + normalizer.drainOpenElements(); } catch (IOException ioe) { LOG.trace("epub spine read IOException on {}: {}", path, ioe.toString()); throw ioe; @@ -297,6 +307,7 @@ private Set bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHan */ private Set fallbackParseAllHtmlEntries(ZipFile zipFile, ContentHandler bodyHandler, + EpubNormalizingHandler normalizer, Metadata metadata, ParseContext context, String reason) @@ -335,6 +346,8 @@ private Set fallbackParseAllHtmlEntries(ZipFile zipFile, } failed++; LOG.trace("epub fallback: SAX failure on {}: {}", entry.getName(), e.toString()); + // Close any tags the aborted parse left open. + normalizer.drainOpenElements(); } catch (IOException e) { failed++; LOG.trace("epub fallback: IO failure on {}: {}", entry.getName(), e.toString()); @@ -588,7 +601,8 @@ public Set getEncryptedItems() { //for now, this simply converts all names to local names to avoid //namespace conflicts in the content handler. This also removes namespaces //from attributes - private static class EpubNormalizingHandler extends ContentHandlerDecorator { + private static class EpubNormalizingHandler extends XHTMLBalancingHandler { + public EpubNormalizingHandler(ContentHandler contentHandler) { super(contentHandler); } @@ -607,7 +621,15 @@ public void startElement(String uri, String localName, String name, Attributes a if (needToRewrite) { AttributesImpl simplifiedAtts = new AttributesImpl(); for (int i = 0; i < atts.getLength(); i++) { - simplifiedAtts.addAttribute("", atts.getLocalName(i), atts.getLocalName(i), + String localAttName = atts.getLocalName(i); + // Stripping the namespace prefix can collapse two distinct + // qnames onto one local name (e.g. xml:lang + lang). The + // serialized XHTML must have unique attribute names, so + // keep the first occurrence and drop later duplicates. + if (simplifiedAtts.getIndex("", localAttName) >= 0) { + continue; + } + simplifiedAtts.addAttribute("", localAttName, localAttName, atts.getType(i), atts.getValue(i)); } super.startElement(uri, localName, localName, simplifiedAtts); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java index b001f09f856..6bbc7eaa86c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java @@ -150,6 +150,12 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler { private boolean curBold; private boolean curItalic; private int pDepth = 0; + // Nesting depth of (text:a / draw:a) elements. While > 0, style tag + // emission is suppressed so b/i/u never close across an open (which + // would produce cross-nested XHTML like ...). The tradeoff + // is that bold/italic/underline runs that begin or end inside a link lose + // their formatting in the captured XHTML. + private int anchorDepth = 0; OpenDocumentBodyHandler(ContentHandler handler, ParseContext parseContext) { super(handler, MAPPINGS); this.handler = handler; @@ -275,6 +281,11 @@ private void endParagraph() throws SAXException { private void updateStyleTags() throws SAXException { + // Suppress style-tag flips while inside a -- emitting // + // here would cross-nest with the open anchor and produce malformed XHTML. + if (anchorDepth > 0) { + return; + } if (currTextStyle == null) { closeStyleTags(); return; @@ -325,7 +336,39 @@ private void endSpan() throws SAXException { updateStyleTags(); } + /** + * Returns true for ODF elements that map to block-level XHTML and so + * shouldn't sit inside open inline-style tags. When such an element opens + * while {@code //} are on the SAX stack, the inline tags would + * trap the new block element underneath them; subsequent style flips + * inside the block would emit close events that don't match the topmost + * open element. The startElement handler closes pending style tags + * before opening any of these. + *

+ * text:p / text:h / text:list / annotation / note / notes / a are handled + * by their own branches in startElement and never reach the default + * branch where this check is used. + */ + private static boolean isBlockLevelOpen(String uri, String localName) { + if (DRAW_NS.equals(uri) && "text-box".equals(localName)) { + return true; + } + if (TABLE_NS.equals(uri) && + ("table".equals(localName) || "table-row".equals(localName) + || "table-cell".equals(localName))) { + return true; + } + return TEXT_NS.equals(uri) && "list-item".equals(localName); + } + private void closeStyleTags() throws SAXException { + // Same reasoning as in updateStyleTags: never emit style-tag closes + // while a is on the stack -- the // would land in the + // wrong place and the strict validator (or any SAX parser) would reject + // the resulting XHTML. + if (anchorDepth > 0) { + return; + } // Close any still open style tags if (curUnderlined) { handler.endElement(XHTML, "u", "u"); @@ -437,6 +480,22 @@ public void startElement(String namespaceURI, String localName, String qName, At } else if ("notes".equals(localName)) { closeStyleTags(); handler.startElement(XHTML, "p", "p", NOTES_ATTRIBUTES); + } else if ("a".equals(localName)) { + // Suppress any pending style flip before opening the anchor, + // and bump the depth so style emission stays quiet while we're + // inside it. See updateStyleTags / closeStyleTags. + anchorDepth++; + super.startElement(namespaceURI, localName, qName, attrs); + } else if (isBlockLevelOpen(namespaceURI, localName)) { + // Block-level structural elements (draw:text-box ->

, + // table:table -> , etc.) opened while // are + // on top would trap those inline tags. Subsequent style flips + // inside would emit while the block is on top, producing + // cross-nested XHTML. Close pending styles before opening the + // block; if there's still text to emit at the same style after + // the block closes, updateStyleTags() will reopen them. + closeStyleTags(); + super.startElement(namespaceURI, localName, qName, attrs); } else { super.startElement(namespaceURI, localName, qName, attrs); } @@ -489,6 +548,24 @@ public void endElement(String namespaceURI, String localName, String qName) closeStyleTags(); handler.endElement(namespaceURI, "p", "p"); } else if ("a".equals(localName)) { + // closeStyleTags is a no-op while anchorDepth > 0, but we keep + // the call so any future change to its semantics still runs + // here. Decrement only after has been emitted so the + // suppression covers the close itself. + closeStyleTags(); + super.endElement(namespaceURI, localName, qName); + if (anchorDepth > 0) { + anchorDepth--; + } + } else if (SVG_NS.equals(namespaceURI) + && ("title".equals(localName) || "desc".equals(localName))) { + // svg:title / svg:desc map through MAPPINGS to . If a + // // opened inside (via text:span style) is still + // on top when fires, we'd cross-nest. Close pending + // style tags before the parent close. Other text:span lazy- + // close paths (via characters() -> updateStyleTags) are + // unaffected because this branch only triggers on these + // specific elements. closeStyleTags(); super.endElement(namespaceURI, localName, qName); } else { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index ec68f8c2bda..a361521021b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -223,6 +223,19 @@ private static void addNonNullAttribute(String name, String value, AttributesImp attributes.addAttribute("", name, name, "CDATA", value); } + private static void setOrReplaceAttribute(String name, String value, + AttributesImpl attributes) { + if (name == null || value == null) { + return; + } + int idx = attributes.getIndex("", name); + if (idx >= 0) { + attributes.setValue(idx, value); + } else { + attributes.addAttribute("", name, name, "CDATA", value); + } + } + private static PDActionURI getActionURI(PDAnnotation annot) { //copied and pasted from PDFBox's PrintURLs @@ -408,8 +421,8 @@ private void processDoc(String name, String annotationType, PDFileSpecification } if (spec instanceof PDSimpleFileSpecification) { //((PDSimpleFileSpecification)spec).getFile(); - attributes.addAttribute("", "class", "class", "CDATA", "linked"); - attributes.addAttribute("", "id", "id", "CDATA", spec.getFile()); + setOrReplaceAttribute("class", "linked", attributes); + setOrReplaceAttribute("id", spec.getFile(), attributes); xhtml.startElement("div", attributes); xhtml.endElement("div"); } else if (spec instanceof PDComplexFileSpecification) { @@ -497,8 +510,8 @@ private void extractPDEmbeddedFile(String displayName, String annotationType, return; } - attributes.addAttribute("", "class", "class", "CDATA", "embedded"); - attributes.addAttribute("", "id", "id", "CDATA", fileName); + setOrReplaceAttribute("class", "embedded", attributes); + setOrReplaceAttribute("id", fileName, attributes); xhtml.startElement("div", attributes); xhtml.endElement("div"); @@ -1093,8 +1106,8 @@ private void processJavaScriptAction(String trigger, String jsActionName, PDActi embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, context, true); } }; - addNonNullAttribute("class", "javascript", attrs); - addNonNullAttribute("type", jsAction.getType(), attrs); + setOrReplaceAttribute("class", "javascript", attrs); + setOrReplaceAttribute("type", jsAction.getType(), attrs); addNonNullAttribute("subtype", jsAction.getSubType(), attrs); xhtml.startElement("div", attrs); xhtml.endElement("div"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 6ce771c6f9b..e73471e353f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1537,6 +1537,20 @@ public void testMaxPagesInvalidValue() { config.setMaxPages(1); } + // TIKA-4728: handleDestinationOrAction pre-populated class/type on the action div, + // then processJavaScriptAction appended a second class/type for PDActionJavaScript + // actions, producing a div with duplicate attributes that SAX parsers reject. + // TikaTest.getXML wraps with StrictXHTMLValidator, so a regression makes + // this test throw at the offending SAX event. + @Test + public void testExtractActionsXHTMLWellFormed() throws Exception { + PDFParserConfig config = new PDFParserConfig(); + config.setExtractActions(true); + ParseContext context = new ParseContext(); + context.set(PDFParserConfig.class, config); + getXML("testPDF_jsActionOnPage.pdf", context); + } + /** @Test public void testWriteLimit() throws Exception { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_jsActionOnPage.pdf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_jsActionOnPage.pdf new file mode 100644 index 00000000000..5b283307f8d Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_jsActionOnPage.pdf differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java index 4244d62280f..f2c38f896f7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java @@ -97,17 +97,24 @@ metadata, getEncodingDetector(context))) { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); xhtml.startDocument(); - - xhtml.startElement("p"); - char[] buffer = new char[4096]; - int n = reader.read(buffer); - while (n != -1) { - xhtml.characters(buffer, 0, n); - n = reader.read(buffer); + // try/finally so any upstream exception (e.g., WriteLimitReached) + // still emits

and instead of leaving the + // captured XHTML unterminated. + try { + xhtml.startElement("p"); + try { + char[] buffer = new char[4096]; + int n = reader.read(buffer); + while (n != -1) { + xhtml.characters(buffer, 0, n); + n = reader.read(buffer); + } + } finally { + xhtml.endElement("p"); + } + } finally { + xhtml.endDocument(); } - xhtml.endElement("p"); - - xhtml.endDocument(); } finally { tis.removeCloseShield(); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/tmx/TMXParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/tmx/TMXParser.java index 2a1224171bd..e918c8977b3 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/tmx/TMXParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/tmx/TMXParser.java @@ -66,11 +66,13 @@ public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata metadata.set(Metadata.CONTENT_TYPE, TMX_CONTENT_TYPE.toString()); final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); + xhtml.startDocument(); tis.setCloseShield(); try { XMLReaderUtils.parseSAX(tis, new TMXContentHandler(xhtml, metadata), context); } finally { tis.removeCloseShield(); + xhtml.endDocument(); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java index 476388370d1..c800561f439 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java @@ -66,12 +66,13 @@ public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata metadata.set(Metadata.CONTENT_TYPE, XLF_CONTENT_TYPE.toString()); final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); - + xhtml.startDocument(); tis.setCloseShield(); try { XMLReaderUtils.parseSAX(tis, new XLIFF12ContentHandler(xhtml, metadata), context); } finally { tis.removeCloseShield(); + xhtml.endDocument(); } }