-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path1_6_1_Week_4_Code_Graded_Problem.py
More file actions
197 lines (167 loc) · 7.3 KB
/
1_6_1_Week_4_Code_Graded_Problem.py
File metadata and controls
197 lines (167 loc) · 7.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/usr/local/bin/python3
import random
import sys
import timeit
from sys import argv
from sys import argv
from enum import Enum
import numpy
import math
class Profile:
class Nucleotides(Enum):
A = 'A'
C = 'C'
G = 'G'
T = 'T'
nucleotideIndex = {'A': 0, 'C': 1, 'G': 2, 'T': 3 }
def __init__(self, file=None, k=0, dna=None, psuedocount=1):
if dna is None:
dna = []
if file != None:
self.initialize_from_file(file)
else:
self.initialize_with(k,dna)
self.initialize_profile(psuedocount)
def initialize_profile(self, psuedocount):
self.pseudocount = psuedocount
self.score = 0
if len(self.dna) != 0:
self.motif_matrix = Matrix = [['' for x in range(len(self.dna[0]))] for y in range(len(self.dna))]
for i in range(0, len(self.dna)):
self.motif_matrix[i] = list(self.dna[i])
self.profile_matrix = Matrix = [[0 for x in range(len(self.dna[0]))] for y in range(4)]
self.count_matrix = Matrix = [[0 for x in range(len(self.dna[0]))] for y in range(4)]
self.score_array = numpy.zeros(len(self.motif_matrix[0]), numpy.int)
self.motif_count()
self.apply_laplace_rule_of_succession(self.pseudocount)
self.compute_profile_matrix()
self.motif_score()
def motif_score(self):
self.score = 0
if len(self.dna) > 0:
for column in range(len(self.motif_matrix[0])):
motif_column = [self.motif_matrix[i][column] for i in range(0,self.num_motifs)]
self.score_array[column] = len(motif_column) - max(
motif_column.count(self.Nucleotides.A.value),
motif_column.count(self.Nucleotides.C.value),
motif_column.count(self.Nucleotides.G.value),
motif_column.count(self.Nucleotides.T.value))
self.score = sum(self.score_array)
return(self.score)
def motif_count(self):
for column in range(len(self.motif_matrix[0])):
motif_column = [self.motif_matrix[i][column] for i in range(0,self.num_motifs)]
self.count_matrix[0][column] = motif_column.count(self.Nucleotides.A.value)
self.count_matrix[1][column] = motif_column.count(self.Nucleotides.C.value)
self.count_matrix[2][column] = motif_column.count(self.Nucleotides.G.value)
self.count_matrix[3][column] = motif_column.count(self.Nucleotides.T.value)
def apply_laplace_rule_of_succession(self,pseudocount):
self.pseudocount = pseudocount
if pseudocount != 0:
for row in range(len(self.count_matrix)):
for col in range(len(self.count_matrix[0])):
self.count_matrix[row][col] += pseudocount
def compute_profile_matrix(self):
for row in range(len(self.count_matrix)):
for col in range(len(self.count_matrix[0])):
count_column = [self.count_matrix[i][col] for i in range(0, 4)]
self.profile_matrix[row][col] = self.count_matrix[row][col] / sum(count_column)
def compute_profile_matrix_entropy(self):
total_entropy = 0
for column in range(len(self.profile_matrix[0])):
profile_column = [self.profile_matrix[i][column] for i in range(0,4)]
entropy = 0
for probability in profile_column:
entropy += probability * math.log(probability,2)
entropy *= -1
total_entropy += entropy
return total_entropy
def initialize_with(self, k, dna):
self.k = k
self.num_motifs = len(dna)
self.dna = dna
def initialize_from_motifs_file(self, file):
self.k = 0
with open(file, 'r') as motif_file:
self.dna = motif_file.readlines()
self.num_motifs = len(self.dna)
for i, text in enumerate(self.dna):
self.dna[i] = text.replace('\n', '')
def initialize_from_file(self, file):
with open(file, 'r') as myfile:
first_line = myfile.readline()
self.k = int(first_line.split(' ')[0])
self.num_motifs = int(first_line.split(' ')[1].replace('\n', ''))
self.dna = myfile.readlines()
for i, text in enumerate(self.dna):
self.dna[i] = text.replace('\n', '')
class RandomizedMotifSearch:
def _random_k_mers(self, k, dna_strings):
random_k_kers = []
for dna in dna_strings:
# generate random index range 0:len(dna)-k
start_index = random.randrange(0, len(dna) - k + 1)
random_k_kers.append(dna[start_index:start_index + k])
return random_k_kers
def most_probable(self, profile, dna_motifs):
most_probables = []
for dna in dna_motifs:
m_probable = 0
m_probable_k_mer = None
# for each k-mer in dna
for column in range(0, len(dna) - profile.k + 1):
probability = 1
k_mer = dna[column:column + profile.k]
for j in range(len(k_mer)):
# compute probability of k-mer using profile
row = profile.nucleotideIndex[k_mer[j]]
prob = profile.profile_matrix[profile.nucleotideIndex[k_mer[j]]][j]
probability *= profile.profile_matrix[profile.nucleotideIndex[k_mer[j]]][j]
if probability > m_probable:
m_probable = probability
m_probable_k_mer = k_mer
most_probables.append(m_probable_k_mer)
return most_probables
def randomized_motif_search(self, k, dna):
motifs = self._random_k_mers(k, dna)
best_motifs = motifs
while True:
motif_profile = Profile(None, k, motifs)
motifs = self.most_probable(motif_profile, dna)
motif_score = Profile(None, k, motifs).motif_score()
best_score = Profile(None, k, best_motifs).motif_score()
if motif_score < best_score:
best_motifs = motifs
else:
return [best_score, best_motifs]
def randomized_motif_searchX(self, k, dna, times):
best_score = len(dna)*k
while times > 0:
motif_score = self.randomized_motif_search(k, dna)
if motif_score[0] < best_score:
best_motifs = motif_score[1]
best_score = motif_score[0]
times -= 1
return best_motifs
if __name__ == '__main__':
if len(argv) == 2:
with open(argv[1], 'r') as input_file:
first_line = input_file.readline()
k = int(first_line.split()[0])
t = int(first_line.split()[1])
dna = []
for dna_string in input_file.readlines():
dna.append(dna_string)
else:
first_line = sys.stdin.readline()
k = int(first_line.split()[0])
t = int(first_line.split()[1])
dna = []
for dna_string in sys.stdin.readlines():
dna.append(dna_string)
for i, text in enumerate(dna):
dna[i] = text.replace('\n', '')
motifs = []
randomized_motif_searcher = RandomizedMotifSearch()
motifs = randomized_motif_searcher.randomized_motif_searchX(k,dna,1000)
print('\n'.join(motifs))