-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnlp_modules.py
More file actions
158 lines (122 loc) · 4.27 KB
/
nlp_modules.py
File metadata and controls
158 lines (122 loc) · 4.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__synopsis__ : Tools for NLP related operations using various libraries (Spacy, NLTK, etc)
__description__ :
__project__ : my_modules
__author__ : 'Samujjwal Ghosh'
__version__ :
__date__ : June 2018
__copyright__ : "Copyright (c) 2018"
__license__ : "Python"; (Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html)
__classes__ :
__variables__ :
__methods__ :
TODO : 1.
"""
import sys, platform

# Make the project's code and datasets importable on both the Windows dev
# box and the Linux research server.  Raw strings replace the original
# non-raw literals, whose '\G' and '\D' were invalid escape sequences that
# only worked by accident (Python currently keeps unknown escapes verbatim,
# with a DeprecationWarning).
if platform.system() == 'Windows':
    sys.path.append(r'D:\GDrive\Dropbox\IITH\0 Research')
    sys.path.append(r'D:\GDrive\Dropbox\IITH\0 Research\Datasets')
else:
    sys.path.append('/home/cs16resch01001/codes')
    sys.path.append('/home/cs16resch01001/datasets')
import my_modules as mm
# Named-entity type labels (spaCy's OntoNotes scheme plus 'PER'/'MISC')
# recognised by the NER helpers in this module.
entity_types = [
    'PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT',
    'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME',
    'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL', 'PER', 'MISC',
]
def process_dict_spacy(train):
    """Add spaCy lemma and POS information to every tweet in *train*.

    Parameters
    ----------
    train : dict
        Maps tweet id -> dict containing a 'parsed_tweet' text field.
        Each value dict is mutated in place: a 'lemma' key (space-joined
        lemma string) and a 'pos' key (list of POS tag strings) are added.

    Returns
    -------
    dict
        The same *train* mapping, mutated in place.
    """
    print("Method: process_dict_spacy(train)")
    import spacy
    # The 'en' shortcut link is unavailable in newer spaCy releases;
    # spacy.load raises OSError for a missing model, so fall back to the
    # explicit small English model name.
    try:
        nlp = spacy.load('en')
    except OSError:
        nlp = spacy.load('en_core_web_sm')
    # Renamed loop variable from `id`, which shadowed the builtin.
    for tweet_id, val in train.items():
        doc = nlp(val['parsed_tweet'])
        pos = [token.pos_ for token in doc]
        lemmas = [token.lemma_ for token in doc]
        val['lemma'] = " ".join(lemmas)
        val['pos'] = pos
    return train
def process_spacy(s, entity=False):
    """Analyse *s* with spaCy and return tokens, POS tags and lemmas.

    Parameters
    ----------
    s : str
        Text to analyse.
    entity : bool
        When True, also compute named-entity information: a per-token
        boolean list 'labels' (True iff the token belongs to some entity),
        the entity strings 'ents_list', and the raw spaCy 'ents' span tuple.

    Returns
    -------
    collections.OrderedDict
        Keys: 'tokens', 'pos', 'lemmas' (plus 'labels', 'ents_list',
        'ents' when *entity* is True).
    """
    import spacy
    from collections import OrderedDict
    # Fall back to the explicit model name when the legacy 'en' shortcut
    # link is not installed (spacy.load raises OSError for missing models).
    try:
        nlp = spacy.load('en')
    except OSError:
        nlp = spacy.load('en_core_web_sm')
    doc = nlp(s)
    result = OrderedDict()
    result["tokens"] = [token.text for token in doc]
    result["pos"] = [token.pos_ for token in doc]
    result["lemmas"] = [token.lemma_ for token in doc]
    if entity:
        ents_list = [str(ent) for ent in doc.ents]
        # Label tokens by membership in the set of entity tokens.  The
        # original tested `t in "".join(ents_list)`, a substring match
        # against the concatenated entity text, which produced false
        # positives for tokens straddling entity boundaries or occurring
        # as substrings of unrelated entities.
        ent_tokens = {tok.text for ent in doc.ents for tok in ent}
        result["labels"] = [t in ent_tokens for t in result["tokens"]]
        result["ents_list"] = ents_list
        result["ents"] = doc.ents
    return result
def most_similar_spacy(word, k=10):
    """Return the *k* vocabulary words most similar to *word*.

    Similarity is cosine similarity between word vectors; only lower-cased
    vocabulary entries that have a vector are considered, and the query
    word itself is excluded.

    Parameters
    ----------
    word : str
        Query word looked up in the spaCy English vocabulary.
    k : int
        Number of neighbours to return (default 10).

    Returns
    -------
    list[str]
        Orthographic forms of the *k* nearest words, most similar first.
    """
    # NOTE(review): `spacy.en` is a legacy spaCy v1 API, removed in modern
    # releases; kept because this module targets that legacy installation.
    from spacy.en import English
    from numpy import dot
    from numpy.linalg import norm

    parser = English()
    target = parser.vocab[word]

    def cosine(v1, v2):
        return dot(v1, v2) / (norm(v1) * norm(v2))

    # Candidate words: lower-cased vocabulary entries with vectors,
    # excluding the query word.  The original compared `w.lower_` (a str)
    # to the query Lexeme, which is never equal, so the query word was
    # never excluded as intended; compare orthographic forms instead.
    candidates = [w for w in parser.vocab
                  if w.has_vector and w.orth_.islower()
                  and w.orth_ != target.orth_]
    # Sort by decreasing cosine similarity to the query vector.  The
    # original also reused the name `word_vocab` as the loop variable,
    # clobbering the query lexeme; distinct names avoid that.
    candidates.sort(key=lambda w: cosine(w.vector, target.vector),
                    reverse=True)
    return [w.orth_ for w in candidates[:k]]
def pos_nltk(text):
    """Tokenise and POS-tag *text* with NLTK.

    Parameters
    ----------
    text : str
        Raw text to tokenise.  (Parameter renamed from `str`, which
        shadowed the builtin; all call sites in this file are positional.)

    Returns
    -------
    list[tuple[str, str]]
        (token, Penn Treebank POS tag) pairs.
    """
    import nltk
    tokens = nltk.word_tokenize(text)
    return nltk.pos_tag(tokens)
def spelling_correction(tweet):
    """Return *tweet* with TextBlob's probabilistic spelling correction applied."""
    from textblob import TextBlob
    return TextBlob(tweet).correct()
def main():
    """Demo driver: run the spaCy helpers on a sample disaster tweet."""
    sample = '#NepalEarthquake India plz 1230,1485 #NDRF team, -2 dogs and 3.2 tonnes equipment to Nepal-Army for rescue operations: Indian Embassy'
    print(process_spacy(sample))
    print(most_similar_spacy('short'))


if __name__ == "__main__":
    main()