# preprocessor.py -- from the tfreundo/TensorFlow_SimpsonsChatbot repository (master branch) on GitHub.
# Natural Language Processing (NLP)
import nltk
# Stemmer that extracts the stem/core of a word (e.g. planning --> stem is plan)
from nltk.stem.lancaster import LancasterStemmer
import random
import numpy as np

class Preprocessor:
    """Prepares intent definitions for training a bag-of-words classifier.

    Patterns are tokenized with ``nltk.word_tokenize``; tokens are
    lower-cased and stemmed with a ``LancasterStemmer`` before being used
    as vocabulary entries or matched against the vocabulary.
    """

    def __init__(self):
        self.stemmer = LancasterStemmer()
        self.ignore_words = ['?']  # Tokens excluded from the vocabulary

    def extract_info_from_intents(self, intents):
        """Extract the vocabulary, the classes and the documents from the intents.

        Args:
            intents: Mapping with an ``'intents'`` entry that iterates over
                mappings, each providing a ``'tag'`` and an iterable of
                ``'patterns'`` (strings).

        Returns:
            A tuple ``(words, classes, docs)``:

            * ``words``   -- sorted list of unique stemmed, lower-cased
              tokens, without the tokens listed in ``self.ignore_words``.
            * ``classes`` -- sorted list of unique tags that own at least
              one pattern.
            * ``docs``    -- list of ``(tokens, tag)`` tuples, one per
              pattern, holding the raw (unstemmed) tokens.
        """
        tokens_seen = []
        tags = set()
        docs = []

        for intent in intents['intents']:
            tag = intent['tag']
            for pattern in intent['patterns']:
                tokens = nltk.word_tokenize(pattern)
                tokens_seen.extend(tokens)
                docs.append((tokens, tag))
                # Registered here (not per intent) so that a tag without
                # any pattern never becomes a class.
                tags.add(tag)

        # The ignore list is matched against the raw token, before
        # lower-casing and stemming.
        stems = {self.stemmer.stem(token.lower())
                 for token in tokens_seen if token not in self.ignore_words}

        return sorted(stems), sorted(tags), docs

    def convert_intents_to_tensors(self, words, classes, documents):
        """Convert the extracted intent information into numeric training data.

        Every document becomes a bag-of-words vector over ``words`` (1 if the
        vocabulary entry occurs among the document's stemmed tokens, else 0)
        and a one-hot vector over ``classes``. The samples are shuffled with
        ``random.shuffle`` before being returned.

        Args:
            words: Vocabulary of stemmed, lower-cased words.
            classes: List of tags; its order defines the one-hot positions.
            documents: Iterable of ``(tokens, tag)`` pairs.

        Returns:
            A tuple ``(train_x, train_y)`` of two equally long lists:
            ``train_x[i]`` is the bag-of-words list (length ``len(words)``)
            and ``train_y[i]`` the one-hot list (length ``len(classes)``) of
            the same sample. Both lists are empty when ``documents`` is empty.

        Raises:
            ValueError: If a document's tag is not contained in ``classes``.
        """
        samples = []

        for doc in documents:
            # A set makes the per-vocabulary-word lookup below O(1).
            pattern_stems = {self.stemmer.stem(token.lower())
                             for token in doc[0]}
            bow = [1 if w in pattern_stems else 0 for w in words]

            # 0 for each tag and 1 for the current tag
            output_row = [0] * len(classes)
            output_row[classes.index(doc[1])] = 1

            samples.append([bow, output_row])

        random.shuffle(samples)

        # Split the shuffled samples into inputs and targets. This is done on
        # plain lists on purpose: packing the [bow, output_row] pairs into one
        # NumPy array fails on current NumPy releases whenever
        # len(words) != len(classes), because the rows are ragged.
        train_x = [sample[0] for sample in samples]
        train_y = [sample[1] for sample in samples]

        return train_x, train_y