-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_data.py
More file actions
124 lines (103 loc) · 3.69 KB
/
parse_data.py
File metadata and controls
124 lines (103 loc) · 3.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# -*- coding: utf-8 -*-
import numpy as np
import json
import re
"""
1 = train
2 = test
3 = dev
"""
# Output paths for the generated dataset splits.
TEST = "./data/test.json"
DEV = "./data/dev.json"
TRAIN = "./data/train.json"
# Index 0 is a placeholder so the split ids 1/2/3 from datasetSplit.txt
# (1 = train, 2 = test, 3 = dev, per the note above) can be used directly
# as list indices.
DATA_FILE = [None, TRAIN, TEST, DEV]
DATA_JSON = [None, [], [], []]
# Number of sentiment classes (5-way labels produced by gat_label).
LABEL = 5
def clean_str(string):
    """Tokenization-style cleaning for SST text.

    Applied identically to dictionary phrases and dataset sentences so the
    two sides match at lookup time. Returns the lower-cased,
    whitespace-normalized string.
    """
    # Replace every character outside the allowed alphabet with a space.
    string = re.sub(r"[^A-Za-z0-9(),.!?\-\'\`]", " ", string)
    # Split common English clitics off as their own tokens.
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # NOTE(review): brackets get no surrounding spaces here — presumably to
    # match the PTB-style -LRB-/-RRB- tokens in the SST files; confirm.
    string = re.sub(r"\(", "-LRB-", string)
    string = re.sub(r"\)", "-RRB-", string)
    # Bug fix: the replacement used to be " \? ". "\?" is not a recognized
    # Python string escape (SyntaxWarning on modern Python), and "\?" is not
    # a recognized escape in an re replacement template either, so re kept a
    # literal backslash in the output text. The intent — parallel to the ','
    # and '!' rules above — is to pad '?' with spaces.
    string = re.sub(r"\?", " ? ", string)
    # Collapse the runs of whitespace introduced by the substitutions above.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
def gat_label(sent):
    """Bucket a sentiment score into one of 5 classes.

    Buckets: [0, 0.2] -> 0, (0.2, 0.4] -> 1, (0.4, 0.6] -> 2,
    (0.6, 0.8] -> 3, (0.8, 1.0] -> 4; scores above 1.0 -> 2.
    """
    if 0 <= sent <= 0.2:
        return 0
    # NOTE(review): a negative score falls through to bucket 1 here (it fails
    # the first check but satisfies sent <= 0.4) — preserved from the
    # original; SST scores are expected to lie in [0, 1].
    for upper_bound, bucket in ((0.4, 1), (0.6, 2), (0.8, 3), (1.0, 4)):
        if sent <= upper_bound:
            return bucket
    return 2
def load_data_and_labels(data_file):
    """Load one JSON split file and return (texts, one_hot_labels).

    Each record in the file is a (text, class_index) pair; the class index
    is expanded into a LABEL-length one-hot list.
    """
    with open(data_file, "r", encoding='utf8') as handle:
        records = json.load(handle)
    texts = [record[0] for record in records]
    one_hot_labels = [
        [1 if position == record[1] else 0 for position in range(LABEL)]
        for record in records
    ]
    return (texts, one_hot_labels)
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield batches of `data` for `num_epochs` passes.

    Args:
        data: sequence of examples; converted to a numpy array once up front.
        batch_size: maximum number of examples per batch.
        num_epochs: number of full passes over the data.
        shuffle: when True, reshuffle the examples at the start of each epoch.

    Yields:
        numpy arrays of length <= batch_size (the final batch of an epoch
        may be shorter).
    """
    data = np.array(data)
    data_size = len(data)
    # Ceiling division: one extra batch for the leftover examples.
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for _ in range(num_epochs):
        # Shuffle the data at each epoch.
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            # Fix: removed leftover debug print of start/end indices, which
            # wrote a line to stdout for every batch of every epoch.
            yield shuffled_data[start_index:end_index]
def main():
    """Build the train/test/dev JSON splits from the raw SST files.

    Joins datasetSentences.txt (sentences) with dictionary.txt (phrase ids),
    sentiment_labels.txt (scores), and datasetSplit.txt (split assignment),
    then dumps each split into the files listed in DATA_FILE.
    """
    # Cleaned phrase -> phrase id.
    phrase_ids = {}
    with open("./stanfordSentimentTreebank/dictionary.txt", "r", encoding="utf8") as f:
        for raw in f:
            phrase, phrase_id = raw.strip().split("|")
            phrase_ids[clean_str(phrase)] = phrase_id
    # Sentence index -> split id (1/2/3); skip the header row.
    split_of = {}
    with open("./stanfordSentimentTreebank/datasetSplit.txt", "r", encoding="utf8") as f:
        f.readline()
        for raw in f:
            sent_index, split_id = raw.strip().split(",")
            split_of[sent_index] = split_id
    # Phrase id -> sentiment score string; skip the header row.
    score_of = {}
    with open("./stanfordSentimentTreebank/sentiment_labels.txt", "r", encoding="utf8") as f:
        f.readline()
        for raw in f:
            phrase_id, score = raw.strip().split("|")
            score_of[phrase_id] = score
    # Route every sentence into its split with a bucketed label; skip header.
    with open("./stanfordSentimentTreebank/datasetSentences.txt", "r", encoding="utf8") as f:
        f.readline()
        for raw in f:
            sent_index, sentence = raw.strip().split("\t")
            cleaned = clean_str(sentence)
            # NOTE(review): a sentence whose cleaned form is missing from
            # dictionary.txt raises KeyError here — confirm the raw files
            # are mutually consistent.
            bucket = gat_label(float(score_of[phrase_ids[cleaned]]))
            DATA_JSON[int(split_of[sent_index])].append((cleaned, bucket))
    for split in range(1, 4):
        print("json.dump %s" % split)
        with open(DATA_FILE[split], "w", encoding="utf8") as f:
            json.dump(DATA_JSON[split], f)


if __name__ == '__main__':
    main()