//path: crawlDepth.cpp
#include <vector>
#include <string>
#include <queue>
#include <iostream>
#include <fstream>
#include <regex>
#include <curl/curl.h>
#include <gumbo.h>
#include "crawler.h"
#include "parallelCrawler.h"
void WebCrawler::crawlDepth(const std::string& url, int depth, const std::vector<std::string>& keywords) {
    if (depth < 0) {
        std::cerr << "Invalid depth parameter." << std::endl;
        return;
    }
    // Remember the keywords so the helper methods can see them.
    this->keywords = keywords;

    // Seed the BFS queue with the start URL. Each URL's remaining depth
    // travels in a local queue kept in lockstep with linkQueue (assumed to
    // be empty on entry), so links really are crawled with reduced depth.
    std::queue<int> depthQueue;
    linkQueue.push(url);
    depthQueue.push(depth);
    visitedUrls.insert(url);
    while (!linkQueue.empty()) {
        // Pop the next URL together with its remaining depth.
        std::string currentUrl = linkQueue.front();
        linkQueue.pop();
        int remainingDepth = depthQueue.front();
        depthQueue.pop();

        std::string html = crawl(currentUrl);
        std::string source = "Source: " + currentUrl;
        std::string text = extractText(html, source);
        // Search for the keywords in the current page.
        std::vector<std::string> keywordResults = searchHeadlines(text, source);
        if (!keywordResults.empty()) {
            std::cout << source << std::endl;
            std::cout << "Matches: " << keywordResults.size() << std::endl;
            // Open the JSON file once per page instead of once per result.
            std::ofstream jsonFile("data.json", std::ios_base::app);
            for (const auto& result : keywordResults) {
                std::cout << "____________-------------------_________________" << std::endl;
                std::cout << result << std::endl;
                // Append the result to the JSON file as well.
                jsonFile << result << std::endl;
                globalResults.push_back(result);
            }
            // std::string annotedResults = nlpClient.sendToNLP(globalResults);
            // std::ofstream myfile("results.txt", std::ios_base::app);
            // myfile << annotedResults << std::endl;
        } else {
            std::cerr << "No search results found on: " << currentUrl << std::endl;
        }
        if (remainingDepth > 0) {
            // Extract links and enqueue each linked page with reduced depth.
            GumboOutput* output = gumbo_parse(html.c_str());
            std::vector<std::string> links = extractLinks(output->root);
            gumbo_destroy_output(&kGumboDefaultOptions, output);
            if (!links.empty()) {
                std::cout << "Links at: " << currentUrl << std::endl;
                std::cout << links.size() << std::endl;
                std::ofstream linkFile("links.txt", std::ios_base::app);
                for (const auto& link : links) {
                    if (isValidUrl(link)) {
                        // Skip links that have already been visited.
                        if (visitedUrls.find(link) == visitedUrls.end()) {
                            linkQueue.push(link);
                            depthQueue.push(remainingDepth - 1);
                            visitedUrls.insert(link);
                            // Record the discovered link on disk.
                            linkFile << link << std::endl;
                        } else {
                            std::cerr << "Link already visited: " << link << std::endl;
                        }
                    }
                }
            } else {
                std::cerr << "No links found on: " << currentUrl << std::endl;
            }
            // Check for pagination and enqueue the next pages.
            std::vector<std::string> paginationLinks = extractPaginationLinks(html);
            if (!paginationLinks.empty()) {
                for (const auto& paginationLink : paginationLinks) {
                    if (isValidUrl(paginationLink)) {
                        // Skip pagination links that have already been visited.
                        if (visitedUrls.find(paginationLink) == visitedUrls.end()) {
                            visitedUrls.insert(paginationLink);
                            linkQueue.push(paginationLink);
                            depthQueue.push(remainingDepth - 1);
                        } else {
                            std::cerr << "Link already visited: " << paginationLink << std::endl;
                        }
                    }
                }
            } else {
                std::cerr << "No pagination links found on: " << currentUrl << std::endl;
            }
        }
    }
}
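
// Usage sketch (an illustration, not part of the original project): one way
// to drive crawlDepth() from a small demo main(). The WebCrawler default
// constructor, the seed URL, and the keywords are assumptions; the libcurl
// global init/cleanup calls are the library's real API. The guard keeps the
// demo out of normal builds.
#ifdef CRAWLDEPTH_DEMO
int main() {
    // libcurl wants exactly one global init/cleanup pair per process.
    curl_global_init(CURL_GLOBAL_DEFAULT);

    WebCrawler crawler;  // assumed to be default-constructible
    // Crawl two levels deep from the seed page, matching two sample keywords.
    crawler.crawlDepth("https://example.com/news", 2, {"climate", "energy"});

    curl_global_cleanup();
    return 0;
}
#endif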