-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
116 lines (96 loc) · 3 KB
/
scraper.py
File metadata and controls
116 lines (96 loc) · 3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import csv
import json
import pymongo
listingPage = urlopen(Request("https://www.imdb.com/chart/top", headers={'User-Agent': 'Mozilla/5.0'})).read()
primarySoup = BeautifulSoup(listingPage, 'html.parser')
counter = 0
arr = []
tdTags = primarySoup.find(["tbody",{"class":"lister-list"}])
for title in tdTags.findAll(["tr"]):
try:
counter+=1
arr.append(title.find(["div",{"class":"wlb_ribbon"}])["data-titleid"])
except Exception:
print("Exception occured")
'''
with open('data.csv', 'w',newline='') as csvfile:
writer = csv.writer(csvfile, delimiter=',')
for i in arr:
writer.writerow([i])
'''
client = pymongo.MongoClient("mongodb+srv://tejus:<PASSWORD>@cluster0.ymzum.gcp.mongodb.net/<DATABASE_NAME>?retryWrites=true&w=majority")
db = client.test
mycol = db["testCollection"]
for tID in arr:
omdbPage = urlopen(Request("http://www.omdbapi.com/?apikey=8fa02782&i="+tID, headers={'User-Agent': 'Mozilla/5.0'})).read()
omdbJson = json.loads(omdbPage)
title = omdbJson["Title"]
year = int(omdbJson["Year"])
rated = omdbJson["Rated"]
released = omdbJson["Released"]
runtime = omdbJson["Runtime"]
runtime = int(runtime[0:-4])
genres = []
for g in omdbJson["Genre"].split(", "):
genres.append(g)
genre = genres
directors = []
for d in omdbJson["Director"].split(", "):
directors.append(d)
director = directors
writer = omdbJson["Writer"]
actors = []
for a in omdbJson["Actors"].split(", "):
actors.append(a)
actor = actors
plot = omdbJson["Plot"]
languages = []
for l in omdbJson["Language"].split(", "):
languages.append(l)
language = languages
countries = []
for c in omdbJson["Country"].split(", "):
countries.append(c)
country = countries
plot = omdbJson["Plot"]
awards = omdbJson["Awards"]
poster = omdbJson["Poster"]
ratings = omdbJson["Ratings"]
metascore = omdbJson["Metascore"]
imdbRating = omdbJson["imdbRating"]
imdbVotes = omdbJson["imdbVotes"]
imdbID = omdbJson["imdbID"]
typeOfMovie = omdbJson["Type"]
dvd = omdbJson["DVD"]
productions = []
for p in omdbJson["Production"].split(", "):
productions.append(p)
production = productions
dictionary = {
"Title": title,
"Year": year,
"Rated":rated,
"Released":released,
"Runtime": runtime,
"Genre":genres,
"Directors":directors,
"Writer":writer,
"Actors": actor,
"Plot": plot,
"Language": language,
"Country": country,
"Awards": awards,
"Poster":poster,
"Ratings": ratings,
"Metascore": metascore,
"imdbRating": imdbRating,
"imdbVotes": imdbVotes,
"imdbID": imdbID,
"Type": typeOfMovie,
"DVD": dvd,
"Productions": production
}
print(dictionary)
x = mycol.insert_one(dictionary)