MachineLearning_Web_Scraper_Wikki/main.py at master · vishal2232/MachineLearning_Web_Scraper_Wikki · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148

from bs4 import BeautifulSoup

import requests

import re

import operator
#parses json, formats it
import json

from tabulate import tabulate
#system calls, dealw with user arguments
import sys
#list of common stop words various languages like the
from stop_words import get_stop_words

#get the words
def getWordList(url):
    word_list = []
    #raw data
    source_code = requests.get(url)
    #convert to text
    plain_text = source_code.text
    #lxml format
    soup = BeautifulSoup(plain_text,'lxml')

    #find the words in paragraph tag
    for text in soup.findAll('p'):
        if text.text is None:
            continue
        #content
        content = text.text
        #lowercase and split into an array
        words = content.lower().split()

        #for each word
        for word in words:
            #remove non-chars
            cleaned_word = clean_word(word)
            #if there is still something there
            if len(cleaned_word) > 0:
                #add it to our word list
                word_list.append(cleaned_word)

    return word_list


#clean word with regex
def clean_word(word):
    cleaned_word = re.sub('[^A-Za-z]+', '', word)
    return cleaned_word


def createFrquencyTable(word_list):
    #word count
    word_count = {}
    for word in word_list:
        #index is the word
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1

    return word_count

#remove stop words
def remove_stop_words(frequency_list):
    stop_words = get_stop_words('en')

    temp_list = []
    for key,value in frequency_list:
        if key not in stop_words:
            temp_list.append([key, value])

    return temp_list

#access wiki API. json format. query it for data. search tyep. shows list of possibilities
wikipedia_api_link = "https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch="
wikipedia_link = "https://en.wikipedia.org/wiki/"

#if the search word is too small, throw error
if(len(sys.argv) < 2):
    print("Enter valid string")
    exit()

#get the search word
string_query = sys.argv[1]

#to remove stop words or not
if(len(sys.argv) > 2):
    search_mode = True
else:
    search_mode = False

#create our URL
url = wikipedia_api_link + string_query

#try-except block. simple way to deal with exceptions
#great for HTTP requests
try:
    #use requests to retrieve raw data from wiki API URL we
    #just constructed
    response = requests.get(url)

    #format that data as a JSON dictionary
    data = json.loads(response.content.decode("utf-8"))

    #page title, first option
    #show this in web browser
    wikipedia_page_tag = data['query']['search'][0]['title']

    #get actual wiki page based on retrieved title
    url = wikipedia_link + wikipedia_page_tag
    #get list of words from that page
    page_word_list = getWordList(url)
    #create table of word counts, dictionary
    page_word_count = createFrquencyTable(page_word_list)
    #sort the table by the frequency count
    sorted_word_frequency_list = sorted(page_word_count.items(), key=operator.itemgetter(1), reverse=True)
    #remove stop words if the user specified
    if(search_mode):
        sorted_word_frequency_list = remove_stop_words(sorted_word_frequency_list)

    #sum the total words to calculate frequencies
    total_words_sum = 0
    for key,value in sorted_word_frequency_list:
        total_words_sum = total_words_sum + value

    #just get the top 20 words
    if len(sorted_word_frequency_list) > 20:
        sorted_word_frequency_list = sorted_word_frequency_list[:20]

    #create our final list which contains words, frequency (word count), percentage
    final_list = []
    for key,value in sorted_word_frequency_list:
        percentage_value = float(value * 100) / total_words_sum
        final_list.append([key, value, round(percentage_value, 4)])

    #headers before the table
    print_headers = ['Word', 'Frequency', 'Frequency Percentage']

    #print the table with tabulate
    print(tabulate(final_list, headers=print_headers, tablefmt='orgtbl'))

#throw an exception in case it breaks
except requests.exceptions.Timeout:
    print("The server didn't respond. Please, try again later.")