//path: crawlDepth.cpp
#include <vector>
#include <string>
#include <queue>
#include <iostream>
#include <fstream>
#include <regex>
#include <curl/curl.h>
#include <gumbo.h>
#include "crawler.h"
#include "parallelCrawler.h"
void WebCrawler::crawlDepth(const std::string& url, int depth, const std::vector<std::string>& keywords) {
    if (depth < 0) {
        std::cerr << "Invalid depth parameter." << std::endl;
        return;
    }
    // Remember the keywords so the helper methods can see them.
    this->keywords = keywords;

    // Seed the BFS queue with the start URL. Each URL's remaining depth
    // travels in a local queue kept in lockstep with linkQueue (assumed to
    // be empty on entry), so links really are crawled with reduced depth.
    std::queue<int> depthQueue;
    linkQueue.push(url);
    depthQueue.push(depth);
    visitedUrls.insert(url);
    while (!linkQueue.empty()) {
        // Pop the next URL together with its remaining depth.
        std::string currentUrl = linkQueue.front();
        linkQueue.pop();
        int remainingDepth = depthQueue.front();
        depthQueue.pop();

        std::string html = crawl(currentUrl);
        std::string source = "Source: " + currentUrl;
        std::string text = extractText(html, source);
        // Search for the keywords in the current page.
        std::vector<std::string> keywordResults = searchHeadlines(text, source);
        if (!keywordResults.empty()) {
            std::cout << source << std::endl;
            std::cout << "Matches: " << keywordResults.size() << std::endl;
            // Open the JSON file once per page instead of once per result.
            std::ofstream jsonFile("data.json", std::ios_base::app);
            for (const auto& result : keywordResults) {
                std::cout << "____________-------------------_________________" << std::endl;
                std::cout << result << std::endl;
                // Append the result to the JSON file as well.
                jsonFile << result << std::endl;
                globalResults.push_back(result);
            }
            // std::string annotedResults = nlpClient.sendToNLP(globalResults);
            // std::ofstream myfile("results.txt", std::ios_base::app);
            // myfile << annotedResults << std::endl;
        } else {
            std::cerr << "No search results found on: " << currentUrl << std::endl;
        }
        if (remainingDepth > 0) {
            // Extract links and enqueue each linked page with reduced depth.
            GumboOutput* output = gumbo_parse(html.c_str());
            std::vector<std::string> links = extractLinks(output->root);
            gumbo_destroy_output(&kGumboDefaultOptions, output);
            if (!links.empty()) {
                std::cout << "Links at: " << currentUrl << std::endl;
                std::cout << links.size() << std::endl;
                std::ofstream linkFile("links.txt", std::ios_base::app);
                for (const auto& link : links) {
                    if (isValidUrl(link)) {
                        // Skip links that have already been visited.
                        if (visitedUrls.find(link) == visitedUrls.end()) {
                            linkQueue.push(link);
                            depthQueue.push(remainingDepth - 1);
                            visitedUrls.insert(link);
                            // Record the discovered link on disk.
                            linkFile << link << std::endl;
                        } else {
                            std::cerr << "Link already visited: " << link << std::endl;
                        }
                    }
                }
            } else {
                std::cerr << "No links found on: " << currentUrl << std::endl;
            }
            // Check for pagination and enqueue the next pages.
            std::vector<std::string> paginationLinks = extractPaginationLinks(html);
            if (!paginationLinks.empty()) {
                for (const auto& paginationLink : paginationLinks) {
                    if (isValidUrl(paginationLink)) {
                        // Skip pagination links that have already been visited.
                        if (visitedUrls.find(paginationLink) == visitedUrls.end()) {
                            visitedUrls.insert(paginationLink);
                            linkQueue.push(paginationLink);
                            depthQueue.push(remainingDepth - 1);
                        } else {
                            std::cerr << "Link already visited: " << paginationLink << std::endl;
                        }
                    }
                }
            } else {
                std::cerr << "No pagination links found on: " << currentUrl << std::endl;
            }
        }
    }
}
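
// Usage sketch (an illustration, not part of the original project): one way
// to drive crawlDepth() from a small demo main(). The WebCrawler default
// constructor, the seed URL, and the keywords are assumptions; the libcurl
// global init/cleanup calls are the library's real API. The guard keeps the
// demo out of normal builds.
#ifdef CRAWLDEPTH_DEMO
int main() {
    // libcurl wants exactly one global init/cleanup pair per process.
    curl_global_init(CURL_GLOBAL_DEFAULT);

    WebCrawler crawler;  // assumed to be default-constructible
    // Crawl two levels deep from the seed page, matching two sample keywords.
    crawler.crawlDepth("https://example.com/news", 2, {"climate", "energy"});

    curl_global_cleanup();
    return 0;
}
#endif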