-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCompareExtractedFiles.py
More file actions
124 lines (97 loc) · 2.8 KB
/
CompareExtractedFiles.py
File metadata and controls
124 lines (97 loc) · 2.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python
from __future__ import division
"""
Compare two extraction files
DEPENDENCIES
Python 2.7
numpy
USAGE
python CompareExtracts.py <extract1.fa> <extract2.fa> <output file name>
e.g.
python ExtractFlankingSequence.py a.fa b.fa out.fa
David L. Stern
Janelia Farm Research Campus
6 Dec. 2012
/*
* Copyright 2012 Howard Hughes Medical Institute.
* All rights reserved.
* Use is subject to Janelia Farm Research Campus Software Copyright 1.1
* license terms ( http://license.janelia.org/license/jfrc_copyright_1_1.html ).
*/
"""
from pyfasta import Fasta
import numpy as np
import sys
import getopt
import string
import re
def main():
#parse command line options
try:
opts, arg = getopt.getopt(sys.argv[1:],"h", ["help"])
except getopt.error, msg:
print msg
print "for help use --help"
sys.exit(2)
# process options
for o, a in opts:
if o in ("-h", "--help"):
print __doc__
sys.exit(0)
if len(arg) < 3:
print "Usage: python CompareExtracts.py <extract1.fa> <extract2.fa> <output file name>"
sys.exit(0)
#process arguments
afile= arg[0]
bfile = arg[1]
OutFile = arg[2]
print "first file = %s" %(afile)
print "second file = %s" %(bfile)
print "Output File = %s" %(OutFile)
f = open(afile)
g = open(bfile)
a = f.readlines()
b = g.readlines()
f.close()
g.close()
#get seqn length
seqlen = len(a[1].strip())
#make arrays for sequence motides
c = np.zeros((len(a)/2,seqlen),dtype=np.character)
d = np.zeros((len(b)/2,seqlen),dtype=np.character)
#grab only sequences, ignore identifier lines
j = a[1:len(a):2]
k = b[1:len(b):2]
#dump data in numpy arrays
x = 0
for i in j:
c[x,:] = list(i[0:seqlen])
x+=1
y = 0
for i in k:
d[y,:] = list(i[0:seqlen])
y+=1
#compare numpy arrays as logical
z = (c == d)
matches = sum(z)
AllProp = matches / len(z)
#now determine matches for A, T, C, G
AMatches = np.zeros(seqlen)
TMatches = np.zeros(seqlen)
CMatches = np.zeros(seqlen)
GMatches = np.zeros(seqlen)
#can't figure out how to calculate on all columns simultaneously, so grab one column at a time
for i in range(seqlen):
bpCol = c[:,i]
boolCol = z[:,i]
TMatches[i] = sum(boolCol[np.where(bpCol == 'T')])
AMatches[i] = sum(boolCol[np.where(bpCol == 'A')])
CMatches[i] = sum(boolCol[np.where(bpCol == 'C')])
GMatches[i] = sum(boolCol[np.where(bpCol == 'G')])
TProp = TMatches / len(z)
AProp = AMatches / len(z)
CProp = CMatches / len(z)
GProp = GMatches / len(z)
np.savetxt(OutFile,(AllProp,TProp,AProp,CProp,GProp))
if __name__ == "__main__":
sys.exit(main())