-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdataUtils.py
More file actions
117 lines (94 loc) · 5.36 KB
/
dataUtils.py
File metadata and controls
117 lines (94 loc) · 5.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import re,os,glob
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
import logReg
def preProcessText(lines):  # preprocessing for a single file
    """Normalize the raw text lines of one review.

    Every line is processed independently, in this order:

    1. the literal ``<br /><br />`` HTML break is replaced by a space,
    2. apostrophes are dropped so contractions stay one token (he's -> hes),
    3. each run of non-word characters and underscores (this includes the
       trailing newline) collapses into a single space,
    4. the result is lower-cased.

    Args:
        lines: iterable of strings, e.g. the result of ``f.readlines()``.

    Returns:
        A new list holding one cleaned string per input line.
    """
    # Raw string on purpose: '\W' inside a plain literal is an invalid escape
    # sequence that recent Python versions warn about. The pattern is unchanged.
    non_word = re.compile(r'[\W_]+')
    return [
        non_word.sub(' ', line.replace("<br /><br />", ' ').replace("'", '')).lower()
        for line in lines
    ]
# Preprocessing for the whole data set: each .txt file is overwritten with its preprocessed version.
def preProcessData(path):
    """Clean every review file of the data set in place.

    Visits the four train/test x pos/neg folders below ``path`` and
    overwrites each ``*.txt`` file found there with the output of
    ``preProcessText``. Progress is reported with ``print``.

    Args:
        path: root directory of the data set; the folder suffixes are
            appended to it by plain string concatenation.
    """
    for subdir in ("/train/pos", "/train/neg", "/test/pos", "/test/neg"):
        target = path + subdir
        print("preprocessing folder: " + target)
        for txt_path in glob.glob(os.path.join(target, '*.txt')):
            # Read the whole file first, then reopen it for writing so the
            # cleaned text replaces the original content.
            with open(txt_path, 'r') as src:
                cleaned = preProcessText(src.readlines())
            with open(txt_path, 'w') as dst:
                dst.writelines(cleaned)
    print("All files have been preprocessed\n")
def createWordDictionary(path):  # creating the dictionary of words based on the training comments
    """Count word occurrences over the training reviews.

    Reads every ``*.txt`` file under ``path + "/train/pos"`` and
    ``path + "/train/neg"``, splits each line on whitespace and tallies the
    resulting tokens.

    Args:
        path: root directory of the data set.

    Returns:
        A dict mapping word -> occurrence count whose insertion order runs
        from the most frequent word to the least frequent one.
    """
    counts = {}
    for subdir in ("/train/pos", "/train/neg"):
        for txt_path in glob.glob(os.path.join(path + subdir, '*.txt')):
            with open(txt_path, 'r') as handle:
                for line in handle.readlines():
                    for token in line.split():
                        counts[token] = counts.get(token, 0) + 1
    # sorted() is stable, so equally frequent words keep their first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked)
def splitDictionary(dictionary, n, m):
    """Return a window of ``dictionary`` as a new dict.

    Entries are taken in the dictionary's iteration order and sliced as
    ``items[n:n + m]``: for non-negative ``n`` this skips the first ``n``
    entries and keeps the next ``m`` (fewer when the dictionary runs out).

    Args:
        dictionary: mapping to take the window from.
        n: start offset of the window.
        m: maximum number of entries in the window.

    Returns:
        A new dict with the selected entries, in their original order.
    """
    entries = list(dictionary.items())
    stop = m + n
    return dict(entries[n:stop])
def textToArray(path, dictionary, n, m):  # merges all the comments into one numpy array
    """Encode every review file as a binary bag-of-words row.

    The vocabulary is the window of ``dictionary`` returned by
    ``splitDictionary(dictionary, n, m)``. For each ``*.txt`` file in the
    train/test x pos/neg folders below ``path`` (visited in the order
    train/pos, train/neg, test/pos, test/neg) one row is produced: column
    ``i`` is 1 when the i-th vocabulary word occurs in the first line of the
    file, else 0.

    Args:
        path: root directory of the data set.
        dictionary: word -> count mapping; only its key order is used here.
        n: start offset of the vocabulary window.
        m: maximum size of the vocabulary window.

    Returns:
        A float numpy array with one row per review file and
        ``len(vocabulary) + 1`` columns; the last column is the label
        (1 when the folder name contains "pos", otherwise 0).
    """
    vocabulary = list(splitDictionary(dictionary, n, m))
    width = len(vocabulary)
    rows = []
    for folder in ("/train/pos", "/train/neg", "/test/pos", "/test/neg"):
        label = int("pos" in folder)  # last column is 1 if the row is a positive review
        for filename in glob.glob(os.path.join(path + folder, '*.txt')):
            with open(filename, 'r') as f:
                # readline() yields '' for an empty file, where indexing
                # readlines()[0] would raise IndexError.
                text = f.readline()
            # A set makes each vocabulary lookup O(1) instead of scanning
            # the whole token list once per vocabulary word.
            present = set(text.split())
            row = np.zeros(width + 1)
            for column, word in enumerate(vocabulary):
                if word in present:
                    row[column] = 1
            row[width] = label
            rows.append(row)
    return np.array(rows)
def createFiles(dictionary):
    """Build and save the feature matrix for every (n, m) combination.

    For each start offset ``n`` and vocabulary size ``m`` in the grids below,
    the 'aclImdb' data set is encoded with ``textToArray`` and stored with
    ``np.save`` under ``data/<n>_<m>`` (``np.save`` appends the ``.npy``
    extension).

    Args:
        dictionary: word -> count mapping passed through to ``textToArray``.
    """
    # np.save does not create missing directories; make sure the target
    # exists before any expensive matrix is computed.
    os.makedirs('data', exist_ok=True)
    for n in [0, 100, 1000, 10000, 20000]:
        for m in [100, 500, 1000, 2000, 4000]:
            print("Creating file for " + "m: " + str(m) + ", n: " + str(n))
            array = textToArray('aclImdb', dictionary, n, m)
            np.save('data/' + str(n) + "_" + str(m), array)
def printAccuracyForEach_m_n():
    """Print the cross-validation accuracy for every saved (n, m) matrix.

    For each combination the matrix ``data/<n>_<m>.npy`` is loaded, its first
    25000 rows are shuffled and used for training, and the remaining rows are
    split in half into cross-validation and test parts. A model is fitted
    with ``logReg.gradientAscent`` and scored on the cross-validation half.
    """
    # testing different hyperparameters (m, n) using the cross-validation data
    n_values = [0, 100, 1000, 10000, 20000]
    m_values = [100, 500, 1000, 2000, 4000]
    for n in n_values:
        for m in m_values:
            data = np.load(f"data/{n}_{m}.npy")
            train_part = data[:25000]
            held_out = data[25000:]
            np.random.shuffle(train_part)

            # The last column holds the label; everything before it is a feature.
            X_train, y_train = train_part[:, :-1], train_part[:, -1:]
            X_val, y_val = held_out[:, :-1], held_out[:, -1:]

            # random_state is fixed at 101 so the cv/test split stays the same
            # whenever the data is split again for the same (m, n) combination.
            X_cv, _X_test, y_cv, _y_test = train_test_split(
                X_val, y_val, train_size=0.5, test_size=0.5, random_state=101
            )

            # NOTE(review): the meaning of the numeric arguments is defined in
            # logReg, which is not visible here -- confirm there.
            weights = logReg.gradientAscent(X_train, y_train, 0.0001, 1, 0.5, 0)
            predicted = logReg.predict(weights, X_cv)
            score = metrics.accuracy_score(y_cv, predicted)
            print(f"n:{n} m:{m} Accuracy:", score)
def printAccuracyForEach_h_iter(X_train, y_train, X_cv, y_cv):  # testing different hyperparameters(step/h and iterations)
    """Print the cross-validation accuracy over a grid of h and iteration values.

    For every pair taken from the two grids below, a model is fitted on the
    training data with ``logReg.gradientAscent`` and its predictions for
    ``X_cv`` are scored against ``y_cv`` with ``metrics.accuracy_score``.

    Args:
        X_train: training features, forwarded to ``logReg.gradientAscent``.
        y_train: training labels, forwarded to ``logReg.gradientAscent``.
        X_cv: cross-validation features, forwarded to ``logReg.predict``.
        y_cv: cross-validation labels the predictions are compared with.
    """
    step_sizes = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75]
    iteration_counts = [1, 2, 3]
    for h in step_sizes:
        for ite in iteration_counts:
            # NOTE(review): presumably the 4th argument is the iteration count
            # and the 5th the step size -- confirm against logReg.
            weights = logReg.gradientAscent(X_train, y_train, 0.0001, ite, h, 0)
            accuracy = metrics.accuracy_score(y_cv, logReg.predict(weights, X_cv))
            print(f"h:{h} iter:{ite} Accuracy:", accuracy)