-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnlp_modules.py
More file actions
158 lines (122 loc) · 4.27 KB
/
nlp_modules.py
File metadata and controls
158 lines (122 loc) · 4.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__synopsis__ : Tools for NLP related operations using various libraries (Spacy, NLTK, etc)
__description__ :
__project__ : my_modules
__author__ : 'Samujjwal Ghosh'
__version__ :
__date__ : June 2018
__copyright__ : "Copyright (c) 2018"
__license__ : "Python"; (Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html)
__classes__ :
__variables__ :
__methods__ :
TODO : 1.
"""
import sys, platform

# Make the project's code and datasets importable on both the Windows dev
# box and the Linux research server.  Raw strings replace the original
# non-raw literals, whose '\G' and '\D' were invalid escape sequences that
# only worked by accident (Python currently keeps unknown escapes verbatim,
# with a DeprecationWarning).
if platform.system() == 'Windows':
    sys.path.append(r'D:\GDrive\Dropbox\IITH\0 Research')
    sys.path.append(r'D:\GDrive\Dropbox\IITH\0 Research\Datasets')
else:
    sys.path.append('/home/cs16resch01001/codes')
    sys.path.append('/home/cs16resch01001/datasets')
import my_modules as mm
# Named-entity type labels (spaCy's OntoNotes scheme plus 'PER'/'MISC')
# recognised by the NER helpers in this module.
entity_types = [
    'PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT',
    'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME',
    'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL', 'PER', 'MISC',
]
def process_dict_spacy(train):
    """Add spaCy lemma and POS information to every tweet in *train*.

    Parameters
    ----------
    train : dict
        Maps tweet id -> dict containing a 'parsed_tweet' text field.
        Each value dict is mutated in place: a 'lemma' key (space-joined
        lemma string) and a 'pos' key (list of POS tag strings) are added.

    Returns
    -------
    dict
        The same *train* mapping, mutated in place.
    """
    print("Method: process_dict_spacy(train)")
    import spacy
    # The 'en' shortcut link is unavailable in newer spaCy releases;
    # spacy.load raises OSError for a missing model, so fall back to the
    # explicit small English model name.
    try:
        nlp = spacy.load('en')
    except OSError:
        nlp = spacy.load('en_core_web_sm')
    # Renamed loop variable from `id`, which shadowed the builtin.
    for tweet_id, val in train.items():
        doc = nlp(val['parsed_tweet'])
        pos = [token.pos_ for token in doc]
        lemmas = [token.lemma_ for token in doc]
        val['lemma'] = " ".join(lemmas)
        val['pos'] = pos
    return train
def process_spacy(s, entity=False):
    """Analyse *s* with spaCy and return tokens, POS tags and lemmas.

    Parameters
    ----------
    s : str
        Text to analyse.
    entity : bool
        When True, also compute named-entity information: a per-token
        boolean list 'labels' (True iff the token belongs to some entity),
        the entity strings 'ents_list', and the raw spaCy 'ents' span tuple.

    Returns
    -------
    collections.OrderedDict
        Keys: 'tokens', 'pos', 'lemmas' (plus 'labels', 'ents_list',
        'ents' when *entity* is True).
    """
    import spacy
    from collections import OrderedDict
    # Fall back to the explicit model name when the legacy 'en' shortcut
    # link is not installed (spacy.load raises OSError for missing models).
    try:
        nlp = spacy.load('en')
    except OSError:
        nlp = spacy.load('en_core_web_sm')
    doc = nlp(s)
    result = OrderedDict()
    result["tokens"] = [token.text for token in doc]
    result["pos"] = [token.pos_ for token in doc]
    result["lemmas"] = [token.lemma_ for token in doc]
    if entity:
        ents_list = [str(ent) for ent in doc.ents]
        # Label tokens by membership in the set of entity tokens.  The
        # original tested `t in "".join(ents_list)`, a substring match
        # against the concatenated entity text, which produced false
        # positives for tokens straddling entity boundaries or occurring
        # as substrings of unrelated entities.
        ent_tokens = {tok.text for ent in doc.ents for tok in ent}
        result["labels"] = [t in ent_tokens for t in result["tokens"]]
        result["ents_list"] = ents_list
        result["ents"] = doc.ents
    return result
def most_similar_spacy(word, k=10):
    """Return the *k* vocabulary words most similar to *word*.

    Similarity is cosine similarity between word vectors; only lower-cased
    vocabulary entries that have a vector are considered, and the query
    word itself is excluded.

    Parameters
    ----------
    word : str
        Query word looked up in the spaCy English vocabulary.
    k : int
        Number of neighbours to return (default 10).

    Returns
    -------
    list[str]
        Orthographic forms of the *k* nearest words, most similar first.
    """
    # NOTE(review): `spacy.en` is a legacy spaCy v1 API, removed in modern
    # releases; kept because this module targets that legacy installation.
    from spacy.en import English
    from numpy import dot
    from numpy.linalg import norm

    parser = English()
    target = parser.vocab[word]

    def cosine(v1, v2):
        return dot(v1, v2) / (norm(v1) * norm(v2))

    # Candidate words: lower-cased vocabulary entries with vectors,
    # excluding the query word.  The original compared `w.lower_` (a str)
    # to the query Lexeme, which is never equal, so the query word was
    # never excluded as intended; compare orthographic forms instead.
    candidates = [w for w in parser.vocab
                  if w.has_vector and w.orth_.islower()
                  and w.orth_ != target.orth_]
    # Sort by decreasing cosine similarity to the query vector.  The
    # original also reused the name `word_vocab` as the loop variable,
    # clobbering the query lexeme; distinct names avoid that.
    candidates.sort(key=lambda w: cosine(w.vector, target.vector),
                    reverse=True)
    return [w.orth_ for w in candidates[:k]]
def pos_nltk(text):
    """Tokenise and POS-tag *text* with NLTK.

    Parameters
    ----------
    text : str
        Raw text to tokenise.  (Parameter renamed from `str`, which
        shadowed the builtin; all call sites in this file are positional.)

    Returns
    -------
    list[tuple[str, str]]
        (token, Penn Treebank POS tag) pairs.
    """
    import nltk
    tokens = nltk.word_tokenize(text)
    return nltk.pos_tag(tokens)
def spelling_correction(tweet):
    """Return *tweet* with TextBlob's probabilistic spelling correction applied."""
    from textblob import TextBlob
    return TextBlob(tweet).correct()
def main():
    """Demo driver: run the spaCy helpers on a sample disaster tweet."""
    sample = '#NepalEarthquake India plz 1230,1485 #NDRF team, -2 dogs and 3.2 tonnes equipment to Nepal-Army for rescue operations: Indian Embassy'
    print(process_spacy(sample))
    print(most_similar_spacy('short'))


if __name__ == "__main__":
    main()