-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessor.py
More file actions
71 lines (58 loc) · 2.38 KB
/
preprocessor.py
File metadata and controls
71 lines (58 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Natural Language Processing (NLP)
import nltk
# Stemmer that extracts the stem/core of a word (e.g. planning --> stem is plan)
from nltk.stem.lancaster import LancasterStemmer
import random
import numpy as np
class Preprocessor:
    """Turns intent definitions into bag-of-words training data.

    Words are lower-cased and reduced to their stem with a Lancaster
    stemmer, so that different inflections of a word share one vocabulary
    entry.
    """

    def __init__(self):
        self.stemmer = LancasterStemmer()
        # Tokens that are dropped when the vocabulary is built
        self.ignore_words = ['?']

    def extract_info_from_intents(self, intents):
        """Split the intents into vocabulary, classes and documents.

        Args:
            intents: Mapping with an ``'intents'`` key holding a sequence of
                intent mappings; each intent provides a ``'tag'`` and a
                sequence of ``'patterns'`` (sentences).

        Returns:
            A tuple ``(words, classes, docs)``:

            * ``words``: sorted list of unique, lower-cased and stemmed
              tokens (tokens listed in ``self.ignore_words`` are excluded).
            * ``classes``: sorted list of unique tags that have at least
              one pattern.
            * ``docs``: list of ``(tokens, tag)`` tuples, one per pattern,
              holding the raw (unstemmed) tokens.
        """
        tokens_seen = []
        tags_seen = set()
        docs = []

        for intent in intents['intents']:
            tag = intent['tag']
            for pattern in intent['patterns']:
                tokens = nltk.word_tokenize(pattern)
                tokens_seen.extend(tokens)
                docs.append((tokens, tag))
                # Registered per pattern: a tag without patterns has no
                # training data and therefore does not become a class.
                tags_seen.add(tag)

        # The ignore filter is applied to the raw token, before it is
        # lower-cased and stemmed.
        stems = {
            self.stemmer.stem(token.lower())
            for token in tokens_seen
            if token not in self.ignore_words
        }

        words = sorted(stems)
        classes = sorted(tags_seen)
        return words, classes, docs

    def convert_intents_to_tensors(self, words, classes, documents):
        """Encode the documents as numeric training data.

        Every document becomes a bag-of-words vector over ``words`` and a
        one-hot vector over ``classes``. The encoded pairs are shuffled
        together, so inputs and targets stay aligned.

        Args:
            words: Vocabulary of stemmed words; defines the bag-of-words
                positions.
            classes: List of tags; defines the one-hot positions.
            documents: Iterable of ``(tokens, tag)`` pairs as produced by
                ``extract_info_from_intents``.

        Returns:
            A tuple ``(train_x, train_y)`` of two equally long lists:
            ``train_x`` holds the bag-of-words vectors (lists of 0/1 with
            ``len(words)`` entries), ``train_y`` the one-hot class vectors
            (lists of 0/1 with ``len(classes)`` entries). Both are empty
            when ``documents`` is empty.

        Raises:
            ValueError: If a document's tag is not contained in ``classes``.
        """
        samples = []
        class_count = len(classes)

        for tokens, tag in documents:
            # Stem the pattern the same way the vocabulary was stemmed;
            # a set gives constant-time membership tests below.
            pattern_stems = {self.stemmer.stem(token.lower()) for token in tokens}

            # Bag of words: 1 where the vocabulary word occurs in the pattern
            bow = [1 if word in pattern_stems else 0 for word in words]

            # One-hot target: 0 for each tag and 1 for the current tag
            output_row = [0] * class_count
            output_row[classes.index(tag)] = 1

            samples.append((bow, output_row))

        # Shuffle inputs and targets as pairs so they stay aligned
        random.shuffle(samples)

        # Split into features and labels. This is done in plain Python on
        # purpose: the two vectors of a sample have different lengths, and
        # packing such ragged rows into a single NumPy array fails on
        # current NumPy versions.
        train_x = [bow for bow, _ in samples]
        train_y = [output_row for _, output_row in samples]
        return train_x, train_y