-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlab03.py
More file actions
45 lines (38 loc) · 1.22 KB
/
lab03.py
File metadata and controls
45 lines (38 loc) · 1.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
f = open("Homo_sapiens.GRCh38.114.gtf", "r")
c = f.read().split("\n")
del c[0:4]
c.pop()
c = [s.split("\t") for s in c]
lengths = {}
i = 0
for l in c:
try:
if(l[2] == "gene"):
lengths[i] = {
"length": int(l[4]) - int(l[3]) + 1,
"info": l
}
i += 1
except:
continue
lengths = sorted(lengths.items(), key=lambda x: x[1]["length"], reverse=True)[:3]
print("top 3 longest:")
for l, data in lengths:
info = data["info"]
print(f"len: {data['length']}, gene_id: {info[8].split('gene_id \"')[1].split('\"')[0]}, gene_name: {info[8].split('gene_name \"')[1].split('\"')[0]}")
t = {}
for l in c:
try:
if l[2] == "transcript":
gene_id = l[8].split('gene_id "')[1].split('"')[0]
gene_name = l[8].split('gene_name "')
if gene_id not in t:
t[gene_id] = {"count": 1, "gene_name": gene_name}
else:
t[gene_id]["count"] += 1
except:
continue
transcripts = sorted(t.items(), key=lambda x: x[1]["count"], reverse=True)[:3]
print("top 3 with the most transcript count:")
for gene_id, data in transcripts:
print(f"gene_id: {gene_id}, transcripts: {data['count']}")