-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_data.py
More file actions
124 lines (103 loc) · 3.69 KB
/
parse_data.py
File metadata and controls
124 lines (103 loc) · 3.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# -*- coding: utf-8 -*-
import numpy as np
import json
import re
"""
1 = train
2 = test
3 = dev
"""
# Output paths for the generated dataset splits.
TEST = "./data/test.json"
DEV = "./data/dev.json"
TRAIN = "./data/train.json"
# Index 0 is a placeholder so the split ids 1/2/3 from datasetSplit.txt
# (1 = train, 2 = test, 3 = dev, per the note above) can be used directly
# as list indices.
DATA_FILE = [None, TRAIN, TEST, DEV]
DATA_JSON = [None, [], [], []]
# Number of sentiment classes (5-way labels produced by gat_label).
LABEL = 5
def clean_str(string):
    """Tokenization-style cleaning for SST text.

    Applied identically to dictionary phrases and dataset sentences so the
    two sides match at lookup time. Returns the lower-cased,
    whitespace-normalized string.
    """
    # Replace every character outside the allowed alphabet with a space.
    string = re.sub(r"[^A-Za-z0-9(),.!?\-\'\`]", " ", string)
    # Split common English clitics off as their own tokens.
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # NOTE(review): brackets get no surrounding spaces here — presumably to
    # match the PTB-style -LRB-/-RRB- tokens in the SST files; confirm.
    string = re.sub(r"\(", "-LRB-", string)
    string = re.sub(r"\)", "-RRB-", string)
    # Bug fix: the replacement used to be " \? ". "\?" is not a recognized
    # Python string escape (SyntaxWarning on modern Python), and "\?" is not
    # a recognized escape in an re replacement template either, so re kept a
    # literal backslash in the output text. The intent — parallel to the ','
    # and '!' rules above — is to pad '?' with spaces.
    string = re.sub(r"\?", " ? ", string)
    # Collapse the runs of whitespace introduced by the substitutions above.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
def gat_label(sent):
    """Bucket a sentiment score into one of 5 classes.

    Buckets: [0, 0.2] -> 0, (0.2, 0.4] -> 1, (0.4, 0.6] -> 2,
    (0.6, 0.8] -> 3, (0.8, 1.0] -> 4; scores above 1.0 -> 2.
    """
    if 0 <= sent <= 0.2:
        return 0
    # NOTE(review): a negative score falls through to bucket 1 here (it fails
    # the first check but satisfies sent <= 0.4) — preserved from the
    # original; SST scores are expected to lie in [0, 1].
    for upper_bound, bucket in ((0.4, 1), (0.6, 2), (0.8, 3), (1.0, 4)):
        if sent <= upper_bound:
            return bucket
    return 2
def load_data_and_labels(data_file):
    """Load one JSON split file and return (texts, one_hot_labels).

    Each record in the file is a (text, class_index) pair; the class index
    is expanded into a LABEL-length one-hot list.
    """
    with open(data_file, "r", encoding='utf8') as handle:
        records = json.load(handle)
    texts = [record[0] for record in records]
    one_hot_labels = [
        [1 if position == record[1] else 0 for position in range(LABEL)]
        for record in records
    ]
    return (texts, one_hot_labels)
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield batches of `data` for `num_epochs` passes.

    Args:
        data: sequence of examples; converted to a numpy array once up front.
        batch_size: maximum number of examples per batch.
        num_epochs: number of full passes over the data.
        shuffle: when True, reshuffle the examples at the start of each epoch.

    Yields:
        numpy arrays of length <= batch_size (the final batch of an epoch
        may be shorter).
    """
    data = np.array(data)
    data_size = len(data)
    # Ceiling division: one extra batch for the leftover examples.
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for _ in range(num_epochs):
        # Shuffle the data at each epoch.
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            # Fix: removed leftover debug print of start/end indices, which
            # wrote a line to stdout for every batch of every epoch.
            yield shuffled_data[start_index:end_index]
def main():
    """Build the train/test/dev JSON splits from the raw SST files.

    Joins datasetSentences.txt (sentences) with dictionary.txt (phrase ids),
    sentiment_labels.txt (scores), and datasetSplit.txt (split assignment),
    then dumps each split into the files listed in DATA_FILE.
    """
    # Cleaned phrase -> phrase id.
    phrase_ids = {}
    with open("./stanfordSentimentTreebank/dictionary.txt", "r", encoding="utf8") as f:
        for raw in f:
            phrase, phrase_id = raw.strip().split("|")
            phrase_ids[clean_str(phrase)] = phrase_id
    # Sentence index -> split id (1/2/3); skip the header row.
    split_of = {}
    with open("./stanfordSentimentTreebank/datasetSplit.txt", "r", encoding="utf8") as f:
        f.readline()
        for raw in f:
            sent_index, split_id = raw.strip().split(",")
            split_of[sent_index] = split_id
    # Phrase id -> sentiment score string; skip the header row.
    score_of = {}
    with open("./stanfordSentimentTreebank/sentiment_labels.txt", "r", encoding="utf8") as f:
        f.readline()
        for raw in f:
            phrase_id, score = raw.strip().split("|")
            score_of[phrase_id] = score
    # Route every sentence into its split with a bucketed label; skip header.
    with open("./stanfordSentimentTreebank/datasetSentences.txt", "r", encoding="utf8") as f:
        f.readline()
        for raw in f:
            sent_index, sentence = raw.strip().split("\t")
            cleaned = clean_str(sentence)
            # NOTE(review): a sentence whose cleaned form is missing from
            # dictionary.txt raises KeyError here — confirm the raw files
            # are mutually consistent.
            bucket = gat_label(float(score_of[phrase_ids[cleaned]]))
            DATA_JSON[int(split_of[sent_index])].append((cleaned, bucket))
    for split in range(1, 4):
        print("json.dump %s" % split)
        with open(DATA_FILE[split], "w", encoding="utf8") as f:
            json.dump(DATA_JSON[split], f)


if __name__ == '__main__':
    main()