-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathScraper.py
More file actions
105 lines (97 loc) · 5.68 KB
/
Scraper.py
File metadata and controls
105 lines (97 loc) · 5.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import requests
import sqlite3
# Placeholders to replace with real values before running the script.
apikey = "{API KEY}"  # key sent as the "key" parameter of every API request
videoid = "{VIDEO ID}"  # ID of the video whose comment threads are fetched
# Output locations: a text file path (presumably meant for save_to_txt --
# it is not referenced by the entry call) and the SQLite database path.
SAVETEXTFILE = "{SAVE TEXTFILE LOCATION}"
SAVECOMMENTDB = "{SAVE DATABASE LOCATION}"
def save_to_sqlite_db(savecommentdb, videoid, apikey):
    """Store every comment thread of a YouTube video, with its replies, in SQLite.

    Top-level comments go to the ``CommentThreads`` table and replies to the
    ``CommentReplies`` table (linked through ``ParentId`` = thread id). Both
    tables are created when missing; rows are always appended, so running the
    function twice against the same database stores the comments twice.

    Args:
        savecommentdb: Path of the SQLite database file (created if absent).
        videoid: ID of the video whose comments are downloaded.
        apikey: YouTube Data API v3 key.

    Raises:
        requests.HTTPError: If the API answers with an error status (bad key,
            exhausted quota, comments disabled, ...). Pages stored before the
            failure stay committed.
    """
    api_root = "https://www.googleapis.com/youtube/v3/"

    def iter_pages(endpoint, query):
        """Yield the item list of each result page, following nextPageToken."""
        params = dict(query, part="id,snippet", maxResults="100", key=apikey)
        while True:
            response = requests.get(api_root + endpoint, params=params, timeout=30)
            # Fail loudly instead of hitting a KeyError on an error payload.
            response.raise_for_status()
            page = response.json()
            yield page.get("items", [])
            token = page.get("nextPageToken")
            if not token:
                return
            params["pageToken"] = token

    # ``with connection`` only commits/rolls back; the connection itself has
    # to be closed explicitly, hence the try/finally around a single handle.
    conn = sqlite3.connect(savecommentdb)
    try:
        with conn:
            conn.execute(
                '''CREATE TABLE IF NOT EXISTS "CommentThreads" ( `Id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE, `Username` TEXT NOT NULL,
                `Comment` TEXT, `PostedTime` TEXT NOT NULL, `Likes` INTEGER NOT NULL, `ReplyCount` INTEGER NOT NULL DEFAULT 0,
                `CommentId` TEXT NOT NULL )'''
            )
            conn.execute(
                '''CREATE TABLE IF NOT EXISTS "CommentReplies" ( `Id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE, `ParentId` TEXT NOT NULL,
                `Username` TEXT NOT NULL, `Reply` TEXT, `ReplyTime` TEXT NOT NULL, `Likes` INTEGER NOT NULL )'''
            )

        for threads in iter_pages("commentThreads", {"videoId": videoid}):
            thread_rows = []
            reply_rows = []
            for item in threads:
                thread_id = item["id"]
                reply_total = item["snippet"]["totalReplyCount"]
                top = item["snippet"]["topLevelComment"]["snippet"]
                # PostedTime holds the API's "updatedAt" value.
                thread_rows.append((
                    top["authorDisplayName"],
                    top["textDisplay"],
                    top["updatedAt"],
                    top["likeCount"],
                    reply_total,
                    thread_id,
                ))
                if reply_total > 0:
                    # A single request returns at most 100 replies, so walk
                    # every page before reversing the complete list.
                    replies = [
                        reply
                        for page in iter_pages("comments", {"parentId": thread_id})
                        for reply in page
                    ]
                    # Reversed because, for replies, the latest should not
                    # come first.
                    for reply in reversed(replies):
                        snippet = reply["snippet"]
                        reply_rows.append((
                            thread_id,
                            snippet["authorDisplayName"],
                            snippet["textDisplay"],
                            snippet["updatedAt"],
                            snippet["likeCount"],
                        ))

            # One short transaction per result page, opened only after all
            # network calls for that page are done.
            with conn:
                conn.executemany(
                    '''INSERT INTO CommentThreads (Username,Comment,PostedTime,Likes,ReplyCount,CommentId) VALUES (?,?,?,?,?,?)''',
                    thread_rows,
                )
                conn.executemany(
                    '''INSERT INTO CommentReplies (ParentId,Username,Reply,ReplyTime,Likes) VALUES (?,?,?,?,?)''',
                    reply_rows,
                )
    finally:
        conn.close()
def save_to_txt(savetextfile, videoid, apikey):
    """Append every comment thread of a YouTube video, with replies, to a text file.

    Each thread is written as ``{``, the author, ``:``, the comment text, the
    replies (each wrapped in its own ``{ ... }``) and a closing ``}``. After
    the last thread a summary with the number of threads, replies and their
    sum is appended. Text is encoded as UTF-8 and the file is opened in
    append mode, so existing content is kept.

    Args:
        savetextfile: Path of the output file (created if absent).
        videoid: ID of the video whose comments are downloaded.
        apikey: YouTube Data API v3 key.

    Raises:
        requests.HTTPError: If the API answers with an error status. Pages
            written before the failure remain in the file.
    """
    api_root = "https://www.googleapis.com/youtube/v3/"

    def iter_pages(endpoint, query):
        """Yield the item list of each result page, following nextPageToken."""
        params = dict(query, part="id,snippet", maxResults="100", key=apikey)
        while True:
            response = requests.get(api_root + endpoint, params=params, timeout=30)
            # Fail loudly instead of hitting a KeyError on an error payload.
            response.raise_for_status()
            page = response.json()
            yield page.get("items", [])
            token = page.get("nextPageToken")
            if not token:
                return
            params["pageToken"] = token

    thread_total = 0
    reply_total = 0
    # The with-block guarantees the handle is closed even when a request or
    # a key lookup raises half-way through.
    with open(savetextfile, "ab") as out:
        for threads in iter_pages("commentThreads", {"videoId": videoid}):
            chunks = []
            for item in threads:
                thread_total += 1
                top = item["snippet"]["topLevelComment"]["snippet"]
                chunks.append(
                    "{\n" + top["authorDisplayName"] + ":\n" + top["textDisplay"] + "\n"
                )
                if item["snippet"]["totalReplyCount"] > 0:
                    # A single request returns at most 100 replies, so walk
                    # every page before reversing the complete list.
                    replies = [
                        reply
                        for page in iter_pages("comments", {"parentId": item["id"]})
                        for reply in page
                    ]
                    reply_total += len(replies)
                    # Reversed because, for replies, the latest should not
                    # come first.
                    for reply in reversed(replies):
                        snippet = reply["snippet"]
                        chunks.append(
                            "{\n" + snippet["authorDisplayName"] + ":\n"
                            + snippet["textDisplay"] + "\n}\n"
                        )
                chunks.append("}\n\n")
            # Persist page by page so finished pages survive a later failure.
            out.write("".join(chunks).encode())
            out.flush()

        grand_total = reply_total + thread_total
        summary = (
            "\n\n" + "COMMENT THREADS : " + str(thread_total)
            + "\nREPLIES : " + str(reply_total)
            + "\nTOTAL : " + str(grand_total)
        )
        out.write(summary.encode())
# Script entry: scrape the configured video's comments into the SQLite database.
save_to_sqlite_db(savecommentdb=SAVECOMMENTDB, videoid=videoid, apikey=apikey)