Skip to content

Commit e73adbf

Browse files
committed
Merge branch 'main' into fix/solr-10-migration
2 parents feb415e + 55d744f commit e73adbf

72 files changed

Lines changed: 1415 additions & 378 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Apache StormCrawler is an open source collection of resources for building low-l
99

1010
## Quickstart
1111

12-
NOTE: These instructions assume that you have [Apache Maven](https://maven.apache.org/install.html) installed. You will need to install [Apache Storm 2.8.4](http://storm.apache.org/) to run the crawler.
12+
NOTE: These instructions assume that you have [Apache Maven](https://maven.apache.org/install.html) installed. You will need to install [Apache Storm 2.8.5](http://storm.apache.org/) to run the crawler.
1313

1414
StormCrawler requires Java 17 or above. To execute tests, it requires you to have a locally installed and working Docker environment.
1515

THIRD-PARTY.txt

Lines changed: 63 additions & 68 deletions
Large diffs are not rendered by default.

archetype/src/main/resources/archetype-resources/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ Have a look at the code and resources and modify them to your heart's content.
55

66
## Native
77

8-
You need to install Apache Storm. The instructions on [setting up a Storm cluster](https://storm.apache.org/releases/2.8.4/Setting-up-a-Storm-cluster.html) should help.
8+
You need to install Apache Storm. The instructions on [setting up a Storm cluster](https://storm.apache.org/releases/2.8.5/Setting-up-a-Storm-cluster.html) should help.
99
You also need to have an instance of URLFrontier running. See [the URLFrontier README](https://github.com/crawler-commons/url-frontier/tree/master/service); the easiest way is to use Docker, like so:
1010

1111
```

archetype/src/main/resources/archetype-resources/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ under the License.
3232
<properties>
3333
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
3434
<stormcrawler.version>${project.version}</stormcrawler.version>
35-
<storm.version>2.8.4</storm.version>
35+
<storm.version>2.8.5</storm.version>
3636
<urlfrontier.version>2.4</urlfrontier.version>
3737
</properties>
3838

core/pom.xml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ under the License.
3939
<!-- dependency versions -->
4040
<crawler-commons.version>1.6</crawler-commons.version>
4141
<jsoup.version>1.22.1</jsoup.version>
42-
<icu4j.version>78.2</icu4j.version>
42+
<icu4j.version>78.3</icu4j.version>
4343
<xerces.version>2.12.2</xerces.version>
4444
<httpclient.version>4.5.14</httpclient.version>
4545
<snakeyaml.version>2.6</snakeyaml.version>
@@ -50,7 +50,6 @@ under the License.
5050
<okhttp.version>5.3.2</okhttp.version>
5151
<caffeine.version>3.2.3</caffeine.version>
5252
<xsoup.version>0.3.7</xsoup.version>
53-
<awaitility.version>4.3.0</awaitility.version>
5453
<guava.version>33.5.0-jre</guava.version>
5554
<jacoco.haltOnFailure>true</jacoco.haltOnFailure>
5655
<jacoco.classRatio>0.72</jacoco.classRatio>
@@ -145,7 +144,6 @@ under the License.
145144
<dependency>
146145
<groupId>org.awaitility</groupId>
147146
<artifactId>awaitility</artifactId>
148-
<version>${awaitility.version}</version>
149147
<scope>test</scope>
150148
</dependency>
151149

@@ -268,7 +266,7 @@ under the License.
268266
<dependency>
269267
<groupId>com.fasterxml.jackson.core</groupId>
270268
<artifactId>jackson-annotations</artifactId>
271-
<version>${jackson.version}</version>
269+
<version>${jackson-annotations.version}</version>
272270
</dependency>
273271
</dependencies>
274272
</dependencyManagement>

core/src/main/java/org/apache/stormcrawler/bolt/FeedParserBolt.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
import org.apache.stormcrawler.persistence.Status;
5353
import org.apache.stormcrawler.protocol.ProtocolResponse;
5454
import org.apache.stormcrawler.util.ConfUtils;
55+
import org.apache.stormcrawler.util.URLUtil;
5556
import org.slf4j.LoggerFactory;
5657
import org.xml.sax.InputSource;
5758

@@ -175,7 +176,7 @@ private List<Outlink> parseFeed(String url, byte[] content, Metadata parentMetad
175176
feed = input.build(new InputSource(is));
176177
}
177178

178-
URL url1 = new URL(url);
179+
URL url1 = URLUtil.toURL(url);
179180

180181
List<SyndEntry> entries = feed.getEntries();
181182
for (SyndEntry entry : entries) {

core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
import org.apache.stormcrawler.protocol.RobotRules;
6262
import org.apache.stormcrawler.util.ConfUtils;
6363
import org.apache.stormcrawler.util.PerSecondReducer;
64+
import org.apache.stormcrawler.util.URLUtil;
6465
import org.slf4j.LoggerFactory;
6566

6667
/**
@@ -529,7 +530,7 @@ public void run() {
529530
boolean asap = false;
530531

531532
try {
532-
URL url = new URL(fit.url);
533+
URL url = URLUtil.toURL(fit.url);
533534
Protocol protocol = protocolFactory.getProtocol(url);
534535

535536
if (protocol == null) {
@@ -982,7 +983,7 @@ public void execute(Tuple input) {
982983
URL url;
983984

984985
try {
985-
url = new URL(urlString);
986+
url = URLUtil.toURL(urlString);
986987
} catch (MalformedURLException e) {
987988
LOG.error("{} is a malformed URL", urlString);
988989

core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ public void execute(Tuple tuple) {
292292
} else {
293293
final Elements links = jsoupDoc.select("a[href]");
294294
slinks = new HashMap<>(links.size());
295-
final URL baseUrl = new URL(url);
295+
final URL baseUrl = URLUtil.toURL(url);
296296
for (Element link : links) {
297297
// nofollow
298298
String[] relkeywords = link.attr("rel").split(" ");
@@ -374,7 +374,7 @@ public void execute(Tuple tuple) {
374374

375375
// https://github.com/apache/stormcrawler/issues/954
376376
if (allowRedirs() && StringUtils.isNotBlank(redirection)) {
377-
emitOutlink(tuple, new URL(url), redirection, metadata);
377+
emitOutlink(tuple, URLUtil.toURL(url), redirection, metadata);
378378
}
379379

380380
// Mark URL as redirected
@@ -515,7 +515,7 @@ protected List<Outlink> toOutlinks(
515515

516516
URL sourceUrl;
517517
try {
518-
sourceUrl = new URL(url);
518+
sourceUrl = URLUtil.toURL(url);
519519
} catch (MalformedURLException e) {
520520
// we would have known by now as previous components check whether
521521
// the URL is valid

core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
import org.apache.stormcrawler.protocol.RobotRules;
5454
import org.apache.stormcrawler.util.ConfUtils;
5555
import org.apache.stormcrawler.util.PerSecondReducer;
56+
import org.apache.stormcrawler.util.URLUtil;
5657
import org.slf4j.LoggerFactory;
5758

5859
/**
@@ -265,7 +266,7 @@ public void execute(Tuple input) {
265266
URL url;
266267

267268
try {
268-
url = new URL(urlString);
269+
url = URLUtil.toURL(urlString);
269270
} catch (MalformedURLException e) {
270271
LOG.error("{} is a malformed URL", urlString);
271272
// Report to status stream and ack

core/src/main/java/org/apache/stormcrawler/bolt/SiteMapParserBolt.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
import org.apache.stormcrawler.persistence.DefaultScheduler;
6262
import org.apache.stormcrawler.persistence.Status;
6363
import org.apache.stormcrawler.util.ConfUtils;
64+
import org.apache.stormcrawler.util.URLUtil;
6465
import org.slf4j.LoggerFactory;
6566

6667
/**
@@ -183,7 +184,7 @@ private List<Outlink> parseSiteMap(
183184
String url, byte[] content, String contentType, Metadata parentMetadata)
184185
throws UnknownFormatException, IOException {
185186

186-
URL url1 = new URL(url);
187+
URL url1 = URLUtil.toURL(url);
187188
long start = System.currentTimeMillis();
188189
AbstractSiteMap siteMap;
189190
// let the parser guess what the mimetype is

0 commit comments

Comments (0)