-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathdataPreprocessing.py
More file actions
77 lines (63 loc) · 2.95 KB
/
dataPreprocessing.py
File metadata and controls
77 lines (63 loc) · 2.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import sys
from sklearn.datasets import load_svmlight_file
from sklearn.datasets import load_svmlight_files
def libsvmPreprocess(pathInput, pathOutput, name):
    """Convert a LIBSVM-format dataset into a dense plain-text file.

    The raw dataset (e.g. ``skin_nonskin`` or ``HIGGS.bz2`` downloaded from
    the LIBSVM website) is read with ``load_svmlight_file``, its feature
    matrix is densified and the result is written to
    ``pathOutput + name + '.txt'``.

    Output layout:
        * first line: ``"<number of data points> <number of features>"``
        * one line per data point, feature values separated by ``", "``

    Only the feature matrix is written; the labels returned by the loader
    are discarded.

    Args:
        pathInput: Prefix of the input file. It is concatenated directly with
            ``name`` (no separator is inserted), so pass e.g. ``"datasets/"``.
        pathOutput: Prefix of the output file, concatenated the same way.
        name: File name of the raw dataset; also the stem of the output file.
    """
    features = load_svmlight_file(pathInput + name)[0]

    # Take the dimensions from the matrix itself so that an empty dataset
    # produces a "0 <features>" header instead of an IndexError.
    pointCount, featureCount = features.shape

    # str(list) renders "[v1, v2, ...]"; only the brackets are removed.
    dropBrackets = str.maketrans('', '', '[]')

    with open(pathOutput + name + '.txt', 'w') as outFile:
        outFile.write(str(pointCount) + " " + str(featureCount) + "\n")
        # Walk the dense array row by row so that only one row at a time is
        # expanded into Python floats.
        for row in features.toarray():
            outFile.write(str(row.tolist()).translate(dropBrackets) + "\n")
def insert(originalfile, string, outputfile='./datasets/gassensor_clean.txt'):
    """Write ``string`` followed by the contents of ``originalfile``.

    Args:
        originalfile: Path of the text file whose contents are copied.
        string: Text written at the very beginning of the output file
            (for example a ``"<rows> <features>\\n"`` header line).
        outputfile: Path of the file to create or overwrite. Defaults to
            the location this function originally always wrote to.

    Returns:
        None.
    """
    # Read the source completely before opening the destination for writing:
    # if both paths name the same file, opening with 'w' first would truncate
    # the data before it could be read.
    with open(originalfile, 'r') as source:
        body = source.read()

    with open(outputfile, 'w') as target:
        target.write(string)
        target.write(body)
def datasetPreprocess(pathInput, pathOutput, name, numBatches=10):
    """Merge the batch files of a UCI dataset into one dense text file.

    Used when the raw dataset comes from the UCI repository, e.g. gassensor.
    The files ``batch1.dat`` .. ``batch<numBatches>.dat`` inside the directory
    ``pathInput + name`` are loaded together with ``load_svmlight_files``,
    densified, and written in order to ``pathOutput + name + '.txt'``.
    Calling with ``("datasets/", "datasets/", "gassensor")`` reads
    ``datasets/gassensor/batch1.dat`` .. ``batch10.dat`` and writes
    ``datasets/gassensor.txt``.

    Output layout:
        * first line: ``"<number of data points> <number of features>"``
        * one line per data point; every ``","`` of the list rendering is
          replaced by a space, so values are separated by two spaces

    Only the feature matrices are written; the label vectors are discarded.

    NOTE: the raw data might have a semicolon in the first field (the class
    and misc. info). This leads to an error when converting string to float,
    so such files have to be cleaned before they can be loaded.

    Args:
        pathInput: Prefix of the dataset directory. It is concatenated
            directly with ``name`` (no separator is inserted), so pass e.g.
            ``"datasets/"``.
        pathOutput: Prefix of the output file, concatenated the same way.
        name: Name of the dataset directory; also the stem of the output file.
        numBatches: Number of consecutive ``batch<i>.dat`` files to merge,
            starting at ``batch1.dat``.

    Raises:
        ValueError: If ``numBatches`` is smaller than 1.
    """
    if numBatches < 1:
        raise ValueError("numBatches must be at least 1")

    batchPaths = [
        pathInput + name + "/batch" + str(index) + ".dat"
        for index in range(1, numBatches + 1)
    ]

    # load_svmlight_files returns [X1, y1, X2, y2, ...]; keep the X entries.
    matrices = load_svmlight_files(batchPaths)[0::2]

    # Dimensions come from the matrix shapes so that empty batches do not
    # trigger an IndexError when the header is built.
    pointCount = sum(matrix.shape[0] for matrix in matrices)
    featureCount = matrices[0].shape[1]

    # str(list) renders "[v1, v2, ...]": drop the brackets, turn commas into
    # spaces (the space that followed each comma is kept).
    toPlainRow = str.maketrans({'[': None, ']': None, ',': ' '})

    with open(pathOutput + name + '.txt', 'w') as outFile:
        outFile.write(str(pointCount) + " " + str(featureCount) + "\n")
        # Stream one batch and one row at a time instead of concatenating
        # every batch into a single list of lists first.
        for matrix in matrices:
            for row in matrix.toarray():
                outFile.write(str(row.tolist()).translate(toPlainRow) + "\n")
def _printUsage():
    """Print the command-line usage text of the script."""
    print ("Usage: python dataPreprocessing.py <flag> <path/of/inputDataset> <path/of/outputDataset> <name>, flag 0 for LIBSVM; 1 for UCI repository ")
    print ("for example: python dataPreprocessing.py 0 datasets/ datasets/ skin_nonskin")


def main():
    """Command-line entry point: parse ``sys.argv`` and run a converter.

    Expected arguments: ``<flag> <input path> <output path> <name>``.
    Flag ``0`` runs ``libsvmPreprocess``; any other integer runs
    ``datasetPreprocess``. When fewer than four arguments are supplied, or
    the flag is not an integer, the usage text is printed and nothing else
    is done.
    """
    args = sys.argv
    if len(args) < 5:
        _printUsage()
        return

    try:
        mode = int(args[1])
    except ValueError:
        # A non-numeric flag is a usage error, not a reason for a traceback.
        _printUsage()
        return

    # argv entries are already strings, so no conversion is needed.
    pathI, pathO, name = args[2:5]

    if mode == 0:
        libsvmPreprocess(pathI, pathO, name)
    else:
        datasetPreprocess(pathI, pathO, name)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()