-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathGenedata.py
More file actions
102 lines (83 loc) · 3.47 KB
/
Genedata.py
File metadata and controls
102 lines (83 loc) · 3.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import csv
import os
#import warnings
import numpy as np
#warnings.filterwarnings("ignore")
class Gene_data:
    """A single sequence record split into a left and a right window.

    Instances are normally produced by :meth:`load_sequence`, which fills in
    ``seqleft``, ``seqright`` and ``length``. ``seq`` is initialised to
    ``None`` and is not populated anywhere in this class.
    """

    # Not referenced inside this class; presumably read by training code
    # elsewhere -- TODO confirm.
    train_test_split_ratio = 0.1

    # Characters kept as-is by normalisation; every other character becomes 'N'.
    _CANONICAL_BASES = frozenset('ACGT')

    def __init__(self, id, label):
        """Create an empty record.

        Args:
            id: Record identifier (``load_sequence`` passes the stripped
                header line, including the leading ``'>'``).
            label: Record label (``load_sequence`` passes the text between
                ``'>'`` and the first comma of the header).
        """
        self.id = id
        self.label = label
        self.seq = None
        self.seqleft = None
        self.seqright = None
        self.length = None
        # NOTE: this re-seeds NumPy's *global* RNG on every construction.
        # load_sequence depends on it: the shuffle it performs right after
        # building the records is reproducible only because of this call.
        np.random.seed(1234)

    @classmethod
    def _normalize_sequence(cls, raw):
        """Upper-case *raw*, turn 'U' into 'T' and any non-ACGT char into 'N'."""
        keep = cls._CANONICAL_BASES
        converted = raw.upper().replace('U', 'T')
        return ''.join(ch if ch in keep else 'N' for ch in converted)

    @staticmethod
    def _split_windows(seq, left, right):
        """Split *seq* proportionally and clip each side to its window size.

        The split point sits at ``left / (left + right)`` of the sequence.
        The left part keeps at most its first ``left`` characters and the
        right part at most its last ``right`` characters, so for long
        sequences the middle is discarded.
        """
        split_at = int(len(seq) * left / (right + left))
        left_part = seq[:split_at]
        right_part = seq[split_at:]
        if len(right_part) >= right:
            right_part = right_part[-right:]
        if len(left_part) >= left:
            left_part = left_part[:left]
        return left_part, right_part

    @classmethod
    def _from_record(cls, header, label, chunks, left, right):
        """Build one instance from a header and its collected sequence lines."""
        seq = cls._normalize_sequence(''.join(chunks))
        gene = cls(header, label)
        gene.seqleft, gene.seqright = cls._split_windows(seq, left, right)
        # Length of the full normalised sequence, before any clipping.
        gene.length = len(seq)
        return gene

    @classmethod
    def load_sequence(cls, dataset, left=1000, right=3000, predict=False):
        """Load records from a FASTA-like text file.

        A line starting with ``'>'`` opens a new record; the following lines,
        up to the next header, are concatenated (after stripping) to form its
        sequence. Blank lines are ignored. For each record the sequence is
        normalised to the alphabet ``ACGTN`` and split into ``seqleft`` /
        ``seqright`` windows.

        Args:
            dataset: Path of the file to read.
            left: Maximum size of the left window; also the left weight of
                the proportional split point.
            right: Maximum size of the right window; also the right weight
                of the proportional split point.
            predict: When false (default) the records are returned in a
                shuffled order; when true, file order is preserved.

        Returns:
            A 1-D NumPy object array of instances (empty when the file holds
            no records).

        Raises:
            ValueError: If non-blank sequence text appears before the first
                header line.
        """
        print('Importing dataset {0}'.format(dataset))

        genes = []
        header = None
        label = None
        chunks = []
        with open(dataset, 'r') as f:
            for line_number, line in enumerate(f, start=1):
                if line.startswith('>'):
                    # A new header closes the record collected so far.
                    if header is not None:
                        genes.append(
                            cls._from_record(header, label, chunks, left, right))
                    header = line.strip()
                    # Taken from the stripped header so that a header without
                    # a comma does not leave a trailing newline in the label.
                    label = header[1:].split(',')[0]
                    chunks = []
                    continue

                text = line.strip()
                if not text:
                    continue
                if header is None:
                    raise ValueError(
                        'Sequence data before the first header at line {0} '
                        'of {1}'.format(line_number, dataset))
                chunks.append(text)

        # Flush the final record; nothing to flush if no header was ever seen.
        if header is not None:
            genes.append(cls._from_record(header, label, chunks, left, right))

        # dtype=object keeps the result a 1-D object array even when empty.
        genes = np.array(genes, dtype=object)
        if not predict:
            genes = genes[np.random.permutation(np.arange(len(genes)))]

        print('Total number of samples:', genes.shape[0])
        return genes