ExtractFlankingSequence/CompareExtractedFiles.py at master · dstern/ExtractFlankingSequence · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python
from __future__ import division
"""
Compare two extraction files

DEPENDENCIES

Python 2.7
numpy

USAGE

python CompareExtracts.py <extract1.fa> <extract2.fa> <output file name>

e.g.

python ExtractFlankingSequence.py a.fa b.fa out.fa


David L. Stern
Janelia Farm Research Campus
6 Dec. 2012

/*
 * Copyright 2012 Howard Hughes Medical Institute.
 * All rights reserved.
 * Use is subject to Janelia Farm Research Campus Software Copyright 1.1
 * license terms ( http://license.janelia.org/license/jfrc_copyright_1_1.html ).
 */

"""
from pyfasta import Fasta
import numpy as np
import sys
import getopt
import string
import re


def main():
        #parse command line options
        try:
                opts, arg = getopt.getopt(sys.argv[1:],"h", ["help"])
        except getopt.error, msg:
                print msg
                print "for help use --help"
                sys.exit(2)
        # process options
        for o, a in opts:
                if o in ("-h", "--help"):
                        print __doc__
                        sys.exit(0)
        if len(arg) < 3:
                print "Usage: python CompareExtracts.py <extract1.fa> <extract2.fa> <output file name>"
                sys.exit(0)
        #process arguments

        afile= arg[0]
        bfile = arg[1]
        OutFile = arg[2]

        print "first file = %s" %(afile)
        print "second file = %s" %(bfile)
        print "Output File = %s" %(OutFile)

	f = open(afile)
	g = open(bfile)
	a = f.readlines()
	b = g.readlines()
	f.close()
	g.close()

	#get seqn length
	seqlen = len(a[1].strip())

	#make arrays for sequence motides
	c = np.zeros((len(a)/2,seqlen),dtype=np.character)
	d = np.zeros((len(b)/2,seqlen),dtype=np.character)

	#grab only sequences, ignore identifier lines
	j =  a[1:len(a):2]
	k =  b[1:len(b):2]

	#dump data in numpy arrays
	x = 0
	for i in j:
		c[x,:] = list(i[0:seqlen])
		x+=1

	y = 0
	for i in k:
		d[y,:] = list(i[0:seqlen])
		y+=1

	#compare numpy arrays as logical
	z = (c == d)
	matches = sum(z)
	AllProp = matches / len(z)

	#now determine matches for A, T, C, G

	AMatches = np.zeros(seqlen)
	TMatches = np.zeros(seqlen)
	CMatches = np.zeros(seqlen)
	GMatches = np.zeros(seqlen)

	#can't figure out how to calculate on all columns simultaneously, so grab one column at a time
	for i in range(seqlen):
		bpCol = c[:,i]
		boolCol = z[:,i]
		TMatches[i] = sum(boolCol[np.where(bpCol == 'T')])
		AMatches[i] = sum(boolCol[np.where(bpCol == 'A')])
		CMatches[i] = sum(boolCol[np.where(bpCol == 'C')])
		GMatches[i] = sum(boolCol[np.where(bpCol == 'G')])

	TProp = TMatches / len(z)
	AProp = AMatches / len(z)
	CProp = CMatches / len(z)
	GProp = GMatches / len(z)

	np.savetxt(OutFile,(AllProp,TProp,AProp,CProp,GProp))

if __name__ == "__main__":
        sys.exit(main())