crawl.py
import time
import requests
from stem import Signal
from stem.control import Controller
from bs4 import BeautifulSoup
import os
import csv
import pandas as pd
filename = 'Dataset.csv'
First_Row = ['Title of Webpage', 'Headings of Webpage', 'Link']
Row = []
Rows = []
# Drug_names = ['Alcohol', 'Ayahuasca', 'Cannabis', 'Marijuana', 'Pot', 'Weed',
#               'Central Nervous System Depressants', 'Benzos', 'Cocaine', 'Coke', 'Crack',
#               'GHB', 'Hallucinogens', 'Heroin', 'Inhalants', 'Ketamine', 'Khat', 'Kratom',
#               'LSD', 'Acid', 'MDMA', 'Ecstasy', 'Molly', 'Mescaline', 'Peyote',
#               'Methamphetamine', 'Crystal', 'Meth', 'Over-the-Counter', 'Speed',
#               'Psilocybin', 'Magic Mushrooms', 'Shrooms', 'Tobacco', 'Nicotine', 'Vaping',
#               'Bath Salts', 'Flakka', 'Angel Dust', 'Opioids']
def if_drugs(link):
    # Get the details of drugs sold by reading any HTML tables on the page.
    # pd.read_html returns a list of DataFrames, one per <table> found.
    dfs = pd.read_html(link)
    print(dfs)

def if_hackers():
    # Placeholder for handling hacker-related pages (not implemented yet).
    pass

def if_traffiking():
    # Placeholder for handling trafficking-related pages (not implemented yet).
    pass
def get_heading(link):
    # Collect the text of every h2-h6 heading on the page.
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    print("The headings of " + link + " are: ")
    l = []
    for tag in ('h2', 'h3', 'h4', 'h5', 'h6'):
        for data in soup.find_all(tag):
            l.append(data.get_text().strip())
            print(data.get_text().strip())
    print("\n\n")
    return l
def get_images(link):
    # Download a few images from the page, run the (external) model on them,
    # then delete the downloaded files.
    html_page = requests.get(link)
    soup = BeautifulSoup(html_page.content, 'html.parser')
    images = soup.find_all('img')
    # While scraping we find two kinds of image sources: absolute URLs
    # (starting with http) and relative paths, which need the page URL prefixed.
    l_with_https = []
    l_without_https = []
    url_base = link
    for image in images:
        src = image.get('src')
        if src is None:
            continue
        if 'http' in src:
            l_with_https.append(src)
        else:
            print(src)
            l_without_https.append(url_base + src)
    # Download at most the first four relative-path images.
    downloaded = l_without_https[:4]
    for src in downloaded:
        webs = requests.get(src)
        with open('images/' + src.split('/')[-1], 'wb') as f:
            f.write(webs.content)
    # run the model code
    # Delete the downloaded images afterwards.
    for src in downloaded:
        os.remove('images/' + src.split('/')[-1])
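# Note (assumption, not part of the original script): joining url_base and a
# relative src by plain string concatenation can produce malformed URLs
# (missing or doubled '/'). urllib.parse.urljoin resolves relative paths more
# robustly, roughly like this:
# from urllib.parse import urljoin
# absolute_src = urljoin(link, image.get('src'))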
def get_title(link):
    # Collect the text of the page's <title> tag(s).
    l1 = []
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    print("The title of " + link + " is: ")
    for data in soup.find_all('title'):
        l1.append(data.get_text().strip())
        print(data.get_text().strip())
    print('\n\n')
    return l1
# Set the number of links to crawl
# num_links_to_crawl = 400
num_links_to_crawl = int(input("Enter number of links to crawl: "))
# Set the user agent to use for the request
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
# Set the headers for the request
headers = {'User-Agent': user_agent}
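# Note (assumption, not part of the original script): requests does not route
# traffic through Tor just because stem controls it on port 9051. To actually
# fetch .onion links, the requests would have to go through Tor's SOCKS proxy
# (127.0.0.1:9050 by default), roughly like this, which also needs the
# requests[socks] extra installed:
# proxies = {'http': 'socks5h://127.0.0.1:9050',
#            'https': 'socks5h://127.0.0.1:9050'}
# response = requests.get(link, headers=headers, proxies=proxies)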
count = 0
# Initialize the controller for the Tor network
with Controller.from_port(port=9051) as controller:
    # Authenticate with the controller password
    controller.authenticate(password='CristianoRonaldoCR7')
    # Set the starting URL
    # url = 'https://thehiddenwiki.com/'
    url = input("Enter the starting URL: ")
    # Initialize the visited set and the link queue
    visited = set()
    queue = [url]
    # Get the list of keywords to search for
    # keywords = input('Enter a list of keywords to search for, separated by commas: ').split(',')
    # Crawl the links
    while queue:
        # Get the next link in the queue
        link = queue.pop(0)
        # Skip the link if it has already been visited
        if link in visited:
            continue
        # Request a new Tor circuit (new exit IP address). Tor rate-limits
        # NEWNYM, so signals sent for every single link may be ignored.
        controller.signal(Signal.NEWNYM)
        try:
            # Send the request to the URL
            response = requests.get(link, headers=headers)
            # Parse the response
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find all links on the page
            links = soup.find_all('a')
            # Add every absolute link found on the page to the queue
            for a in links:
                href = a.get('href')
                # if any(keyword in href for keyword in keywords):
                if href is None:
                    continue
                elif 'http' in href:
                    queue.append(href)
            # Get the title and the headings of the page
            s = get_title(link)
            # get_images(link)
            s1 = get_heading(link)
            # Add the title, headings and link as one row for the CSV file
            if s != []:
                Row.append(s[0])
                Row.append(s1)
                Row.append(link)
                if Row not in Rows:
                    Rows.append(Row)
                Row = []
            # Add the link to the visited set
            visited.add(link)
            # Print the title and URL of the page
            # print(soup.title.string, link)
            count += 1
            print('No of links visited: ', count)
            # Check if the number of visited links has reached the limit
            if len(visited) >= num_links_to_crawl:
                break
        except Exception as e:
            print("Exception occurred for link:", link, e)
            continue

# Write the collected rows to a CSV file
data = pd.DataFrame(Rows, columns=First_Row)
data.to_csv('links.csv', index=False)
# Print the visited links
print('Visited links:')
for link in visited:
    print(link)