-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgetOverlappingTranscripts.py
More file actions
executable file
·84 lines (60 loc) · 3.31 KB
/
getOverlappingTranscripts.py
File metadata and controls
executable file
·84 lines (60 loc) · 3.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python3
#
"""
This script takes files in bed12 format (at least TWO!) and outputs overlapping unique transcripts;
it assigns transcript names using the first specified file.
To get overlaping transcripts of several annotations run e.g: \
getOverlapingTrascripts.py -f uniq.db.ref1.ncbi.bed uniq.db.ref2.ncbi.bed > uniq.overlap.db.ref1.ref2.bed
"""
import argparse
from collections import defaultdict
import sys
__author__ = "Ekaterina Osipova, 2019."
def read_all_annotations(bedList):
## Reads all transcripts of all annotation files into 2D list
# initiate a 2D list to store lists of elements from each file
# allTranscripts_allFiles = []
# initiate a dictionary to store the part of a transcript that is allowed to vary
transcripts_dict = defaultdict(list)
for bed in bedList:
with open(bed, 'r') as inf:
transcripts_oneFile = []
for line in inf.readlines():
transcrName, score, color = line.split()[3], line.split()[4], line.split()[8]
transcrInfo = '\t'.join(line.split()[:3] + line.split()[5:8] + line.split()[9:])
# store essential info for each transcript of this file in the list
transcripts_oneFile.append(transcrInfo)
# at the same time keep corresponding variable part of a transcript in the dictionary
transcripts_dict[transcrInfo].append((transcrName, score, color))
# add essential (that suppose to be identical) transcript info to the allTranscrpts 2D list
# allTranscripts_allFiles.append(transcripts_oneFile)
return transcripts_dict
# return allTranscripts_allFiles, transcripts_dict
def print_overlapping_transcripts(transcripts_dict, overlap):
## Outputs elements of the overlap_transcripts getting first available variable part from the transcripts_dict
for transcrInfo in transcripts_dict:
transcrVars = transcripts_dict[transcrInfo]
if len(transcrVars) >= overlap:
# combine a new line
newBedLine = '\t'.join(transcrInfo.split()[:3]) + '\t' + transcrVars[0][0] + '\t' + \
transcrVars[0][1] + '\t' + '\t'.join(transcrInfo.split()[3:6]) + '\t' + \
transcrVars[0][2] + '\t' + '\t'.join(transcrInfo.split()[6:])
print(newBedLine)
def main():
## Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--filelist', nargs='*', type=str, help='list of bed12 files to extract identical\
overlapping transcripts from. Needs at least TWO files')
parser.add_argument('-n', '--number', type=int, help='transcript is present in AT LEAST this number of files')
args = parser.parse_args()
## Read all annotation files into 2D list
# allTranscripts_allFiles, transcripts_dict = read_all_annotations(args.filelist)
transcripts_dict = read_all_annotations(args.filelist)
## Get overlapping transcripts
# overlap_transcripts = set(allTranscripts_allFiles[0]).intersection(*allTranscripts_allFiles)
## Output overlapping transcripts
# print_overlapping_transcripts(overlap_transcripts, transcripts_dict)
overlap = args.number
print_overlapping_transcripts(transcripts_dict, overlap)
if __name__ == '__main__':
main()