-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathDownloadPageNames.java
More file actions
executable file
·44 lines (41 loc) · 1.63 KB
/
DownloadPageNames.java
File metadata and controls
executable file
·44 lines (41 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Command-line tool that prints the name of every page listed by the wiki's
 * {@code Special:AllPages} index, one name per line on standard output.
 *
 * <p>The listing is read starting at {@link #STARTING_PAGE}. Each downloaded
 * line is checked for a page-title list item and for a "Next page" navigation
 * link; when the latter is found its target is queued so that the following
 * listing page is downloaded as well.
 */
public class DownloadPageNames {

    /** Matches a whole line consisting of one list item; group 1 is the link text. */
    private static final Pattern TITLE_PATTERN = Pattern.compile("<li><a href=\".*\" title=\".*\">(.*)</a></li>");

    /** Matches a line ending with the "Next page" navigation link; group 1 is the raw href. */
    private static final Pattern NEXT_PAGE_PATTERN = Pattern.compile(".*<a href=\"(.*)\" title=\"Special:AllPages\">Next page .*</a></div></div>");

    private static final String WIKI_URL = "https://wiki.eclipse.org/";
    private static final String STARTING_PAGE = WIKI_URL + "Special:AllPages";

    /**
     * Downloads the listing pages one after another and prints every page name found.
     *
     * @param args ignored
     * @throws Exception if a URL is malformed or a page cannot be downloaded
     */
    public static void main(String[] args) throws Exception {
        // FIFO work queue of listing pages that still have to be downloaded.
        Deque<String> pages = new ArrayDeque<>();
        pages.add(STARTING_PAGE);

        while (!pages.isEmpty()) {
            URL url = new URL(pages.removeFirst());
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    String pageName = extractPageName(inputLine);
                    if (pageName != null) {
                        System.out.println(pageName);
                        continue;
                    }
                    String nextPageUrl = extractNextPageUrl(inputLine);
                    if (nextPageUrl != null) {
                        pages.addLast(nextPageUrl);
                    }
                }
            }
        }
    }

    /**
     * Extracts the page name from a line that is a page-title list item.
     *
     * <p>Package-private so the parsing can be tested without network access.
     *
     * @param line one line of the downloaded HTML
     * @return the link text of the list item, or {@code null} if the line is
     *         not a page-title list item
     */
    static String extractPageName(String line) {
        Matcher titleMatcher = TITLE_PATTERN.matcher(line);
        return titleMatcher.matches() ? titleMatcher.group(1) : null;
    }

    /**
     * Builds the absolute URL of the next listing page from a line that
     * contains the "Next page" navigation link.
     *
     * <p>The href is taken from HTML source, where the {@code &} separating
     * query parameters is written as the entity {@code &amp;}. The entity is
     * decoded here; otherwise the resulting URL would carry the literal entity
     * text in its query string.
     *
     * <p>Package-private so the parsing can be tested without network access.
     *
     * @param line one line of the downloaded HTML
     * @return {@link #WIKI_URL} followed by the decoded href, or {@code null}
     *         if the line does not contain the "Next page" link
     */
    static String extractNextPageUrl(String line) {
        Matcher nextPageMatcher = NEXT_PAGE_PATTERN.matcher(line);
        if (!nextPageMatcher.matches()) {
            return null;
        }
        // Literal (non-regex) replacement of the HTML entity with a plain ampersand.
        String nextPageUrlSuffix = nextPageMatcher.group(1).replace("&amp;", "&");
        return WIKI_URL + nextPageUrlSuffix;
    }
}