From b43e4f28a8a0377c97eacd21c1c1b900889811be Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Wed, 1 Apr 2026 19:42:08 +0200 Subject: [PATCH 1/2] Fix bugs and resource leaks in Apache HttpClient protocol implementation The proxy race condition fix (a08a456b) introduced two regressions: The local builder for proxied requests lost: Default headers User-Agent A new CloseableHttpClient was created on every request, even when no proxy was used Fixes applied Store userAgent and defaultHeaders as fields Pass these fields to the proxy builder Build the client once in configure() and reuse it for non-proxy requests Additional fixes Fix toByteArray() returning null when the entity input stream is null Close the static CONNECTION_MANAGER in cleanup() Wrap the entity InputStream in a try-with-resources block --- .../protocol/httpclient/HttpProtocol.java | 90 ++++++++++--------- 1 file changed, 50 insertions(+), 40 deletions(-) diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java b/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java index a0db6ac12..e7a342bd0 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java @@ -66,7 +66,6 @@ import org.apache.stormcrawler.proxy.SCProxy; import org.apache.stormcrawler.util.ConfUtils; import org.apache.stormcrawler.util.CookieConverter; -import org.jetbrains.annotations.Nullable; import org.slf4j.LoggerFactory; /** Uses Apache httpclient to handle http and https. */ @@ -82,6 +81,11 @@ public class HttpProtocol extends AbstractHttpProtocol private HttpClientBuilder builder; + private CloseableHttpClient client; + + private String userAgent; + private Collection defaultHeaders; + private RequestConfig requestConfig; private RequestConfig.Builder requestConfigBuilder; @@ -102,9 +106,9 @@ public void configure(final Config conf) { globalMaxContent = ConfUtils.getInt(conf, "http.content.limit", -1); - String userAgent = getAgentString(conf); + userAgent = getAgentString(conf); - Collection defaultHeaders = new LinkedList<>(); + defaultHeaders = new LinkedList<>(); String accept = ConfUtils.getString(conf, "http.accept"); if (StringUtils.isNotBlank(accept)) { @@ -153,6 +157,8 @@ public void configure(final Config conf) { .setCookieSpec(CookieSpecs.STANDARD); requestConfig = requestConfigBuilder.build(); + + client = builder.build(); } @Override @@ -163,8 +169,8 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except // set default request config to global config RequestConfig reqConfig = requestConfig; - // default to the shared builder for non-proxy requests - HttpClientBuilder clientBuilder = builder; + // default to the shared client for non-proxy requests + CloseableHttpClient httpClient = client; // conditionally add a dynamic proxy if (proxyManager != null) { @@ -173,10 +179,12 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except if (proxOptional.isPresent()) { SCProxy prox = proxOptional.get(); - // create local copies of builders to avoid polluting shared - // state across concurrent requests + // create a new builder with the same defaults (user-agent, + // headers) to avoid losing them in proxied requests HttpClientBuilder localBuilder = HttpClients.custom() + .setUserAgent(userAgent) + .setDefaultHeaders(defaultHeaders) .setConnectionManager(CONNECTION_MANAGER) .setConnectionManagerShared(true) .disableRedirectHandling() @@ -215,7 +223,7 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except System.currentTimeMillis() - buildStart); LOG.debug("fetching with " + prox.toString()); - clientBuilder = localBuilder; + httpClient = localBuilder.build(); } } @@ -268,11 +276,7 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except request.setConfig(reqConfig); - // no need to release the connection explicitly as this is handled - // automatically. The client itself must be closed though. - try (CloseableHttpClient httpclient = clientBuilder.build()) { - return httpclient.execute(request, responseHandler); - } + return httpClient.execute(request, responseHandler); } private void addCookiesToRequest(HttpRequestBase request, Metadata md) { @@ -353,7 +357,6 @@ public ProtocolResponse handleResponse(final HttpResponse response) throws IOExc }; } - @Nullable private static byte[] toByteArray( final HttpEntity entity, int maxContent, MutableBoolean trimmed) throws IOException { @@ -363,35 +366,42 @@ private static byte[] toByteArray( final InputStream instream = entity.getContent(); if (instream == null) { - return null; - } - Args.check( - (entity.getContentLength() <= Constants.MAX_ARRAY_SIZE) - || (maxContent >= 0 && maxContent <= Constants.MAX_ARRAY_SIZE), - "HTTP entity too large to be buffered in memory"); - int reportedLength = (int) entity.getContentLength(); - // set default size for buffer: 100 KB - int bufferInitSize = 102400; - if (reportedLength != -1) { - bufferInitSize = reportedLength; - } - // avoid init of too large a buffer when we will trim anyway - if (maxContent != -1 && bufferInitSize > maxContent) { - bufferInitSize = maxContent; + return new byte[] {}; } - final ByteArrayBuffer buffer = new ByteArrayBuffer(bufferInitSize); - final byte[] tmp = new byte[4096]; - int lengthRead; - while ((lengthRead = instream.read(tmp)) != -1) { - // check whether we need to trim - if (maxContent != -1 && buffer.length() + lengthRead > maxContent) { - buffer.append(tmp, 0, maxContent - buffer.length()); - trimmed.setValue(true); - break; + try (instream) { + Args.check( + (entity.getContentLength() <= Constants.MAX_ARRAY_SIZE) + || (maxContent >= 0 && maxContent <= Constants.MAX_ARRAY_SIZE), + "HTTP entity too large to be buffered in memory"); + int reportedLength = (int) entity.getContentLength(); + // set default size for buffer: 100 KB + int bufferInitSize = 102400; + if (reportedLength != -1) { + bufferInitSize = reportedLength; } - buffer.append(tmp, 0, lengthRead); + // avoid init of too large a buffer when we will trim anyway + if (maxContent != -1 && bufferInitSize > maxContent) { + bufferInitSize = maxContent; + } + final ByteArrayBuffer buffer = new ByteArrayBuffer(bufferInitSize); + final byte[] tmp = new byte[4096]; + int lengthRead; + while ((lengthRead = instream.read(tmp)) != -1) { + // check whether we need to trim + if (maxContent != -1 && buffer.length() + lengthRead > maxContent) { + buffer.append(tmp, 0, maxContent - buffer.length()); + trimmed.setValue(true); + break; + } + buffer.append(tmp, 0, lengthRead); + } + return buffer.toByteArray(); } - return buffer.toByteArray(); + } + + @Override + public void cleanup() { + CONNECTION_MANAGER.close(); } public static void main(String[] args) throws Exception { From 1e5c7138802e4211ca53745c1b97d9e937366a09 Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Wed, 1 Apr 2026 21:30:09 +0200 Subject: [PATCH 2/2] Revert: Close the static CONNECTION_MANAGER in cleanup() --- .../stormcrawler/protocol/httpclient/HttpProtocol.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java b/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java index e7a342bd0..0c67b1841 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java @@ -401,7 +401,11 @@ private static byte[] toByteArray( @Override public void cleanup() { - CONNECTION_MANAGER.close(); + try { + client.close(); + } catch (IOException e) { + LOG.error("Error closing HTTP client", e); + } } public static void main(String[] args) throws Exception {