-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkey_phrase_finder.py
More file actions
76 lines (63 loc) · 2.85 KB
/
key_phrase_finder.py
File metadata and controls
76 lines (63 loc) · 2.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import boto3
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
class KeyPhraseFinder:
    """Extract deduplicated key phrases from text using AWS Comprehend.

    Phrases are normalized (lowercased, stopwords removed, Porter-stemmed)
    into a "stemmed" key so that near-duplicate phrasings collapse to a
    single entry, and the queried theme itself is excluded from results.
    """

    def __init__(self, access_key, secret_key, session_token):
        """Set up NLTK resources and the AWS Comprehend client.

        Args:
            access_key: AWS access key id.
            secret_key: AWS secret access key.
            session_token: AWS session token (temporary credentials).
        """
        self.porter_stemmer = PorterStemmer()
        # Required NLTK data; download is a cheap no-op when already cached.
        nltk.download('punkt_tab')
        nltk.download('stopwords')
        # Set (not list) for O(1) membership tests in the per-token loops below.
        self.english_stopwords = set(stopwords.words('english'))
        self.access_key = access_key
        self.secret_key = secret_key
        self.session_token = session_token
        self.client = boto3.client(
            'comprehend',
            aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key,
            aws_session_token=self.session_token,
            region_name='us-east-1'
        )

    def separate_term_from_key_phrase_object(self, key_phrase_object):
        """Return the phrase text with everything but letters, digits and spaces removed.

        Args:
            key_phrase_object: one Comprehend key-phrase dict (uses its 'Text' field).
        """
        return re.sub(r'[^a-zA-Z0-9 ]', '', key_phrase_object['Text'])

    def treat_phrase(self, phrase):
        """Normalize a phrase into the stemmed key used for de-duplication.

        NOTE(review): tokens are joined WITHOUT spaces before stemming, so
        e.g. "data science" and "datascience" collapse to the same key.
        This looks like deliberate spacing-insensitive dedup — confirm
        before changing to a space-separated join.

        Returns:
            dict with 'originalPhrase' (untouched input) and 'stemmedPhrase'
            (lowercased, stopword-free, stemmed dedup key).
        """
        tokens = word_tokenize(phrase.lower())
        content_tokens = [t for t in tokens if t not in self.english_stopwords]
        return {
            'originalPhrase': phrase,
            'stemmedPhrase': self.porter_stemmer.stem(''.join(content_tokens))
        }

    def removeDuplicatedStemmedPhrases(self, theme, phrases):
        """Deduplicate treated phrases and trim edge stopwords.

        Drops phrases whose stemmed key was already seen or matches the
        theme's, strips at most one stopword from each end of the original
        phrase, and discards phrases that end up empty.

        Args:
            theme: treated theme dict ({'originalPhrase', 'stemmedPhrase'}).
            phrases: list of treated phrase dicts (not mutated).

        Returns:
            List of cleaned original-phrase strings, first occurrence wins.
        """
        seen_stemmed = set()
        unique_phrases = []
        for phrase in phrases:
            if phrase['stemmedPhrase'] in seen_stemmed:
                continue
            # Never report the theme itself as one of its own key phrases.
            if phrase['stemmedPhrase'] == theme['stemmedPhrase']:
                continue
            seen_stemmed.add(phrase['stemmedPhrase'])
            cleaned = phrase['originalPhrase']
            tokens = word_tokenize(cleaned)
            if not tokens:
                continue
            # Compare case-insensitively: the NLTK stopword list is all
            # lowercase but original phrases keep their casing (fixes
            # e.g. a leading "The" never being stripped).
            if tokens[0].lower() in self.english_stopwords:
                tokens = tokens[1:]
                cleaned = ' '.join(tokens)
            # Re-check emptiness before indexing the last token (fixes an
            # IndexError when the phrase was a single stopword).
            if tokens and tokens[-1].lower() in self.english_stopwords:
                tokens = tokens[:-1]
                cleaned = ' '.join(tokens)
            if cleaned == '':
                continue
            unique_phrases.append(cleaned)
        return unique_phrases

    def get_key_phrases(self, theme, content):
        """Detect key phrases in `content`, deduplicated and excluding `theme`.

        NOTE(review): Comprehend's DetectKeyPhrases caps Text at 100 KB of
        UTF-8; longer content must be chunked by the caller — confirm inputs.

        Args:
            theme: the theme/topic string the content is about.
            content: English text to analyze (sent to AWS Comprehend).

        Returns:
            List of unique, cleaned key-phrase strings.
        """
        response = self.client.detect_key_phrases(
            Text=content,
            LanguageCode='en'
        )
        treated_theme = self.treat_phrase(theme)
        key_phrases_list = [
            self.separate_term_from_key_phrase_object(kp)
            for kp in response['KeyPhrases']
        ]
        treated_phrases_list = [self.treat_phrase(p) for p in key_phrases_list]
        return self.removeDuplicatedStemmedPhrases(treated_theme, treated_phrases_list)