-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathfun_ch.py
More file actions
executable file
·56 lines (50 loc) · 2.46 KB
/
fun_ch.py
File metadata and controls
executable file
·56 lines (50 loc) · 2.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# read the range from bed file "cov"
# read the range of repeat regions from rmsk file "rmsk"
# print the number of overlap nucleotides and the total number of 0 coverage and the repeat regions
# runsheng 06/22/2014
def cov_no(cov, rmsk, chr, repeattype):
"""
This function takes 4 parameters,
1. cov, the open(file.readlines()) list of bed file, with at least 3 cols: chr,start,end
2. rmsk, the open(file.readlines()) list of UCSC rmsk file, with 17 cols, but only the genoName,genoStartgenoEnd was used in the caculation
example: bin swScore milliDiv milliDel milliIns genoName genoStart genoEnd genoLeft strand repName repClass repFamily repStart repEnd repLeft id repClass_no
3. chr, use "I","II"... or "all"
the chromosome number, "I","II" etc in my bed file, but "chrI", "chrII" in rmsk file
4. repeattype, the type of repClass, can be "DNA","DNA?","LINE","Low_complexity","LTR","RC","rRNA","Simple_repeat",""Satellite","SINE","Unknown"
"""
# the operation of "set |"" instead of "list +"" give little benefit to the speed
cov_ch=[]
for line in cov:
if line.split("\t")[0]=="%s" % chr:
a=range(int(line.split("\t")[1]),int(line.split("\t")[2]))
cov_ch=cov_ch+a
print (len(cov_ch))
rmsk_ch=[]
for line in rmsk:
if repeattype =="all":
if line.split("\t")[5]=="chr%s" % chr:
b=range(int(line.split("\t")[6]),int(line.split("\t")[7]))
rmsk_ch=rmsk_ch+b
else:
if line.split("\t")[5]=="chr%s" % chr:
if line.split("\t")[11] == repeattype:
b=range(int(line.split("\t")[6]),int(line.split("\t")[7]))
rmsk_ch=rmsk_ch+b
print(len(rmsk_ch))
print(len(set(cov_ch)&set(rmsk_ch)))
return(chr, repeattype, len(cov_ch), len(rmsk_ch), len(set(cov_ch)&set(rmsk_ch)))
if __name__=="__main__":
#ali=cov_no(cov, rmsk, "I", "LINE")
def test():
cov=open("coverage0.txt").readlines()
rmsk=open("rmsk.txt").readlines()
f=open("foo.txt", "w")
#example of the function
for chr in ["I","II","III","IV","V","X"]:
for repeattype in ["DNA","DNA?","LINE","Low_complexity","LTR","RC","rRNA","Simple_repeat","Satellite","SINE","Unknown"]:
ov=cov_no(cov, rmsk, chr, repeattype)
print (ov)
f.write(str(ov))
f.write("\n")
f.close()
pass