forked from rishi-suresh-keshav/Big-Data
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathContent.py
More file actions
137 lines (97 loc) · 4.69 KB
/
Content.py
File metadata and controls
137 lines (97 loc) · 4.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from bs4 import BeautifulSoup
import urllib
import requests
import json
import pymongo
from pymongo import MongoClient
# Maps each sector label to the phrases that get_sector() searches for
# (case-insensitively, as plain substrings) in a post's text.
# Labels and phrases are runtime data and are kept exactly as spelled.
sectorDictionary = {
    "Healthcare": [
        "healthcare Technology",
        "Healthcare & Cleantech",
        "Healthcare",
        "Clean Tech",
        "Technology and Healthcare",
        "Digital Health",
        "Clean Technology",
        "bio-tech",
        "Life Sciences",
        "Pharmaceuticals",
        "Medical Technology",
        "Biotechnology",
        "BioMedicine",
        "technology & Life Sciences",
        "Medical",
        "Biomedical Devices and New Drugs",
        "Medical Devices",
        "Specialty Pharmaceuticals",
        "Wellness",
    ],
    "Finance": [
        "Business & Financial services",
        "Business Services",
        "Financial & Information Services",
        "Real Estate",
    ],
    "Energy": [
        "Industrials & Energy",
        "energy",
        "Energy Technologies",
        "Energy and Information Technology",
        "Renewable Energy",
        "Alternative energy",
        "Energytech",
        "Energy-Related Products and Services",
    ],
    "Media": [
        "New Media",
        "Interactive digital media",
        "Internet and Digital Media",
        "Media and Entertainment",
        "Consumer Internet",
        "Info and Comm Technology",
        "Digital Home and Digital Media",
        "advertising",
    ],
    "Logistics": [
        "maritime",
    ],
    "Hardware Components": [
        "Semiconductors and Components",
        "Nanotechnology and Microsystems",
        "Manufacturing and Memory",
        "silicon technology",
        "Hardware",
        "Semiconductors",
    ],
    "Cloud platform": [
        "Cloud Computing",
        "Cloud/SaaS",
        "Cloud Services and Infrastructure",
    ],
    "Networks": [
        "Communications",
        "Networking",
        "Information and Communications Technology",
        "Consumer and Business Networking Applications to networkingy",
    ],
    "ecommerce": [
        "internet",
        "marketplace",
        "Next-Gen Commerce",
        "Internet-based Advertising",
        "Web-Enabled Services",
    ],
    "social media": [
        "Social",
    ],
    "Software Technology": [
        "Information Services",
        "IT Infrastructure",
        "Enterprise Software",
        "Infrastructure Software and Services",
        "Software",
        "Gaming",
        "games",
        "Software & Business Services",
        "Software and Services",
        "Applications",
    ],
    "Mobile Technolgies": [
        "app",
        "mobile communication",
        "mobile software",
        "mobile technologies",
        "Mobile & Communications",
        "Mobile and Communications",
        "mobile computing",
        "mobile services",
        "mobile and wireless",
        "wireless technology",
        "media and telecoms",
        "Early stage mobile",
        "Mobile Services & Infrastructure",
    ],
}

# Records appended by create_json(), one dict per stored post.
jsonList = []
def get_content(post_url):
    """Fetch a post page and extract its heading, body text and date.

    Args:
        post_url: URL of the post. Surrounding whitespace (for example the
            trailing newline left by ``readlines``) is ignored.

    Returns:
        A three-item list ``[heading, content, date]`` of text strings:
        ``heading`` is the text of the first ``<h1 class="entry-title">``,
        ``content`` is the text of every ``<p>`` element joined by single
        spaces, and ``date`` is the text of the first element with class
        ``the-time``. A missing heading or date element yields ``""``.

    Raises:
        requests.RequestException: if the page cannot be fetched, including
            when the server does not answer within the timeout.
    """
    # A timeout keeps one unresponsive host from hanging the whole run.
    response = requests.get(post_url.strip(), timeout=30)
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")

    # get_text() also works when the heading contains nested markup, where
    # ``.string`` would be None.
    heading_tag = soup.find("h1", "entry-title")
    heading = ""
    if heading_tag is not None:
        heading = heading_tag.get_text(" ", strip=True)

    # Join paragraphs with a space so the last word of one paragraph is not
    # fused with the first word of the next.
    content = " ".join(
        p.get_text(" ", strip=True) for p in soup.find_all("p")
    )

    date_tag = soup.find(attrs={"class": "the-time"})
    date = ""
    if date_tag is not None:
        date = date_tag.get_text(" ", strip=True)

    return [heading, content, date]
def get_amount(data):
    """Return the last dollar amount mentioned in *data*.

    The text is split on whitespace and every token beginning with ``$`` is
    a candidate; when several are present the last one wins. Trailing
    punctuation (``"$5M,"`` -> ``"$5M"``) is removed, and a bare ``$`` with
    nothing after it is not treated as an amount.

    Args:
        data: Text to scan, typically a post heading. ``None`` or an empty
            string is accepted.

    Returns:
        The amount token including its leading ``$``, or ``""`` when no
        amount is found.
    """
    amount = ""
    if not data:
        return amount
    for token in data.split():
        if not token.startswith("$"):
            continue
        # Drop sentence punctuation that was glued to the token.
        candidate = token.rstrip(".,;:!?)\"'")
        if len(candidate) > 1:
            amount = candidate
    return amount
def get_sector(content, sectors=None):
    """Classify *content* into a sector by case-insensitive phrase search.

    Args:
        content: Text to classify. ``None`` or an empty string is accepted.
        sectors: Optional mapping of sector label to a list of phrases.
            Defaults to the module-level ``sectorDictionary``.

    Returns:
        The label of the first sector (in the mapping's iteration order)
        that has a phrase occurring as a substring of *content*, or
        ``"other"`` when nothing matches.
    """
    if sectors is None:
        sectors = sectorDictionary
    if not content:
        return "other"

    # Lower-case the text once instead of once per phrase.
    haystack = content.lower()
    for sector, phrases in sectors.items():
        if any(phrase.lower() in haystack for phrase in phrases):
            return sector
    return "other"
def _extract_year(date):
    """Return the year found in the *date* text, or ``""`` if none is found."""
    # Prefer a standalone four-digit token, e.g. the "2015" in
    # "March 3, 2015"; this also copes with dates holding several commas.
    for token in date.replace(",", " ").split():
        if len(token) == 4 and token.isdigit():
            return token
    # Otherwise fall back to the historical rule (characters 1-4 of the text
    # after the first comma), but without failing when there is no comma.
    parts = date.split(",")
    if len(parts) > 1:
        return parts[1][1:5]
    return ""


def create_json(sector, amount, date, link):
    """Record one funding entry in ``jsonList`` and in MongoDB.

    The record holds the keys ``sector``, ``amount``, ``date``, ``year`` and
    ``link``; ``year`` is derived from *date*. It is appended to the
    module-level ``jsonList`` and inserted into collection
    ``vc_collection2`` of database ``vc_database`` on ``localhost:27017``.

    Args:
        sector: Sector label for the post.
        amount: Amount text, e.g. ``"$5M"``.
        date: Date text as scraped from the post.
        link: URL of the post.
    """
    record = {
        "sector": sector,
        "amount": amount,
        "date": date,
        "year": _extract_year(date),
        "link": link,
    }
    jsonList.append(record)

    connection = MongoClient('localhost', 27017)
    try:
        # Insert a copy: the driver adds an ``_id`` key to the document it
        # stores, and the entry kept in jsonList should stay a plain record.
        connection.vc_database.vc_collection2.insert_one(dict(record))
    finally:
        # Release the sockets opened for this call.
        connection.close()
def write_file(jsonList):
    """Write *jsonList* to ``content.json`` as JSON.

    The file is created in the current working directory and overwritten if
    it already exists. The output is a JSON document followed by a newline.

    Args:
        jsonList: JSON-serialisable data, normally the list of record dicts.
    """
    # ``with`` guarantees the handle is flushed and closed.
    with open("content.json", "w") as out:
        json.dump(jsonList, out)
        out.write("\n")
def main():
    """Process every link in ``links.txt`` and store posts that name an amount.

    Each non-blank line of ``links.txt`` is treated as a post URL. The page
    is fetched with ``get_content``, the amount is taken from the heading
    and the sector from the body text. Posts whose heading contains no
    amount are skipped; the rest are passed to ``create_json``. The
    accumulated ``jsonList`` is printed once all links are processed.
    """
    with open("links.txt") as f:
        # Strip the trailing newline so it is neither sent in the request
        # nor stored with the record.
        links = [line.strip() for line in f]

    for link in links:
        if not link:
            # Blank lines are not URLs.
            continue
        print(link)
        heading, content, date = get_content(link)
        amount = get_amount(heading)
        sector = get_sector(content)
        if amount != "":
            create_json(sector, amount, date, link)

    print(jsonList)


# NOTE(review): this call is unguarded, so the scrape also runs when the
# module is imported; consider ``if __name__ == "__main__":``.
main()