-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtransform.py
More file actions
69 lines (51 loc) · 1.44 KB
/
transform.py
File metadata and controls
69 lines (51 loc) · 1.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import codecs
import collections
import sys
import os
from os import listdir
# This script reads every text file in the input directory, where each line has the form
# freq\tpattern
# and transforms them into a format suitable for use with the CA_factominer script.
def read_pattern_counts(input_dir):
    """Read every regular file in *input_dir* and collect pattern frequencies.

    Each non-blank line must have the form ``freq<TAB>pattern``. Files are
    decoded as latin-1 and visited in sorted name order so that the numeric
    IDs assigned to patterns are reproducible between runs (``os.listdir``
    returns names in arbitrary order).

    Returns a ``(model, freqs)`` pair:
      * ``model`` maps each pattern string to its integer ID; IDs start at 1
        and are assigned in order of first appearance.
      * ``freqs`` maps each text name (file name without its extension) to a
        dict of ``"Pattern_<id>" -> frequency``.

    Raises ``ValueError`` when a non-blank line has no tab-separated pattern
    column or when the frequency column is not an integer.
    """
    model = {}
    freqs = collections.defaultdict(dict)
    for input_file in sorted(os.listdir(input_dir)):
        path = os.path.join(input_dir, input_file)
        # Sub-directories cannot be opened as text; skip them.
        if not os.path.isfile(path):
            continue
        # Strip the extension whatever its length, not a fixed 4 characters.
        text = os.path.splitext(input_file)[0]
        print(text)
        with codecs.open(path, encoding="latin-1") as f:
            for line_number, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    # Blank lines carry no data.
                    continue
                cols = line.split("\t")
                if len(cols) < 2:
                    raise ValueError(
                        "%s, line %d: expected 'freq<TAB>pattern', got %r"
                        % (path, line_number, line)
                    )
                pattern = cols[1]
                if pattern not in model:
                    # First time this pattern is seen: give it the next ID.
                    model[pattern] = len(model) + 1
                freqs[text]["Pattern_" + str(model[pattern])] = int(cols[0])
    return model, freqs


def write_model(model, path="Model.txt"):
    """Write ``pattern<TAB>id`` lines to *path* (UTF-8), in ascending ID order."""
    with codecs.open(path, "w", "utf-8") as out:
        for pattern in sorted(model, key=model.get):
            out.write(pattern + "\t" + str(model[pattern]) + "\n")


def write_text_models(freqs, out_dir="Models"):
    """Write one ``<text>_Model.txt`` file per text into *out_dir* (UTF-8).

    Each file starts with a ``Pattern_ID<TAB><text>`` header followed by one
    ``Pattern_<id><TAB>frequency`` line per pattern recorded for that text.
    The directory is created if it does not exist yet.
    """
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)
    for text in freqs:
        target = os.path.join(out_dir, text + "_Model.txt")
        with codecs.open(target, "w", "utf-8") as out:
            out.write("Pattern_ID" + "\t" + text + "\n")
            for pattern_id in freqs[text]:
                out.write(pattern_id + "\t" + str(freqs[text][pattern_id]) + "\n")


def main(argv=None):
    """Entry point. Usage: python transform.py input_dir

    Reads the frequency files in ``input_dir`` and writes ``Model.txt`` plus
    the per-text files under ``Models/`` into the current working directory.
    Returns the process exit status: 0 on success, 1 when the input directory
    argument is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("input dir")
        # Stop here: without an input directory there is nothing to process.
        return 1
    model, freqs = read_pattern_counts(argv[1])
    write_model(model)
    write_text_models(freqs)
    return 0


if __name__ == "__main__":
    sys.exit(main())
#o = codecs.open("out", "w", "utf-8") # uncomment for option (2)