Misc_Utils/gff_parse.py at master · MeeshCompBio/Misc_Utils · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
#!/usr/bin/env python
"""Classes to assist with GFF parsing. It tries to keep track of the partent/
child relationships and allows for fetching of parent or children features.

Contains the following classes:
    GFFError:   An error when a queried ID is not found in the GFF data.
    GFFFeature: Reads a line from a GFF file and parses out the feature info
    GFFHandler: Handles the parsing of the whole GFF file and allows access
                to parents, children, or "siblings" of individual features.
"""


class GFFError(LookupError):
    """Raised when a user asks for a feature ID that is not present in the GFF
    data."""


class GFFFeature(object):
    """This reads the data out of a single line of a GFF. It is a *very* ugly
    class, but GFF is a *very* ugly file format, and I don't know of a better
    way to handle this data."""
    __slots__ = [
        'seqid',
        'source',
        'type',
        'start',
        'end',
        'score',
        'strand',
        'phase',
        'ID',
        'Name',
        'Alias',
        'Parent',
        'Target',
        'Gap',
        'Derives_from',
        'Note',
        'Dbxref',
        'Ontology_term',
        'Is_circular'
        ]

    def __init__(self, line):
        self.seqid = None
        self.source = None
        self.type = None
        self.start = None
        self.end = None
        self.score = None
        self.strand = None
        self.phase = None
        self.ID = None
        self.Name = None
        self.Alias = None
        self.Parent = None
        self.Target = None
        self.Gap = None
        self.Derives_from = None
        self.Note = None
        self.Dbxref = None
        self.Ontology_term = None
        self.Is_circular = None
        self.read_gff_line(line)

    def read_gff_line(self, line):
        fields = line.strip().split('\t')
        #   Fixed fields are the first eight
        self.seqid = fields[0]
        self.source = fields[1]
        self.type = fields[2]
        self.start = int(fields[3])
        self.end = int(fields[4])
        self.score = fields[5]
        self.strand = fields[6]
        self.phase = fields[7]
        #   For the next field, we split it and look for keywords
        attrib = fields[8].split(';')
        for a in attrib:
            if a.startswith('ID'):
                self.ID = a.split('=')[1]
            elif a.startswith('Name'):
                self.Name = a.split('=')[1]
            elif a.startswith('Alias'):
                alias = a.split('=')[1]
                self.Alias = alias.split(',')
            elif a.startswith('Parent'):
                parent = a.split('=')[1]
                self.Parent = parent.split(',')
            elif a.startswith('Target'):
                self.Target = a.split('=')[1]
            elif a.startswith('Gap'):
                self.Gap = a.split('=')[1]
            elif a.startswith('Derives_from'):
                self.Derives_from = a.split('=')[1]
            elif a.startswith('Note'):
                note = a.split('=')[1]
                self.Note = note.split(',')
            elif a.startswith('Dbxref'):
                dbxref = a.split('=')[1]
                self.Dbxref = dbxref.split(',')
            elif a.startswith('Ontology_term'):
                ont = a.split('=')[1]
                self.Ontology_term = ont.split(',')
            elif a.startswith('Is_circular'):
                self.Is_circular = a.split('=')[1]


class GFFHandler(object):
    """Handles the parsing of the entire GFF file, line by line. Defines
    methods that allow for retrieval of specific features, or groups of
    features.

    Contains the following methods:
        get_feature(self, feat_id)
            Retuns a single feature, or raises an exception when the ID is not
            found

        get_parents(self, feat_id, type=None)
            Returns the parents of a feature, or raises an exception when the
            ID is not found. If the type keyword is specified, then only
            features of a certain type are returned. If there are no features
            of the supplied type, then empty list is returned. Defaults to
            returning all parents. This will only traverse up one level.

        get_children(self, feat_id, type=None)
            Returns the children of a feature, or raises an exception when the
            ID is not found. If the type keyword is specified, then only child
            features of the specified type are returned. If there are no child
            features of that type, then empty list is returned. Defaults to
            returning all children. This will only traverse down one level.

        get_siblings(self, feat_id, type=None)
            Returns the "siblings" (features that share parents), or raises an
            exception when the ID is not found. If the type keyword is
            specified, then only features of that type are returned. If there
            are no sibling features of that type, then empty list is returned.
            Defaults to returning all siblings. This does not check parents
            of parents or children of children.

        chrom_features(self, chrom, left, right, type=None)
            Returns all features on the specified chromosome, or raises an
            exception when that chromosome is not found. If the type keyword is
            specified, then only features of that type are returned. If the
            left keyowrd is specified, then features that have bases to the
            _right_ of the specified base are returned. If the right keyword is
            specified, then features with bases to the _left_ of the offset
            are returned. These effctively limit the search to a certain
            region. If there are no features of the specified type in the given
            region, then empty list is returned. Defaults to all types and the
            entire chromosome.
    """
    gff_data = []
    gff_ids = []

    def __init__(self):
        pass

    def gff_parse(self, fname):
        with open(fname, 'r') as f:
            for line in f:
                #   Skip lines starting with #; they are directives or comments
                if line.startswith('#'):
                    continue
                #   Skip blank lines. I guess we should support CRLF lines too
                elif line in ('\n', '\r\n'):
                    continue
                else:
                    g = GFFFeature(line)
                    self.gff_data.append(g)
                    self.gff_ids.append(g.ID)

    def get_feature(self, feat_id):
        if feat_id not in self.gff_ids:
            raise GFFError('ID {fid} not found in GFF!'.format(fid=feat_id))
        else:
            for f in self.gff_data:
                if f.ID == feat_id:
                    return(f)

    def get_parents(self, feat_id, feat_type=None):
        if feat_id not in self.gff_ids:
            raise GFFError('ID {fid} not found in GFF!'.format(fid=feat_id))
        else:
            parents = []
            targeted_features = []
            #   We search through the list and save features that match the
            #   query ID
            for f in self.gff_data:
                if f.Parent and feat_id == f.ID:
                    if feat_type:
                        if f.type == feat_type:
                            parents += f.Parent
                    else:
                        parents += f.Parent
            #   We then search AGAIN to get the features that match the parent
            #   feature ID. This is not very efficient, but given the structure
            #   of the data, it works.
            for f in self.gff_data:
                if f.ID in parents:
                    targeted_features.append(f)
            return(targeted_features)

    def get_children(self, feat_id, feat_type=None):
        if feat_id not in self.gff_ids:
            raise GFFError('ID {fid} not found in GFF!'.format(fid=feat_id))
        else:
            targeted_features = []
            #   Searching for child features is easy, since we just have to see
            #   if our query ID is in the Parent GFF attributes.
            for f in self.gff_data:
                if f.Parent and feat_id in f.Parent:
                    if feat_type:
                        if f.type == feat_type:
                            targeted_features.append(f)
                    else:
                        targeted_features.append(f)
            return(targeted_features)

    def get_siblings(self, feat_id, feat_type=None):
        if feat_id not in self.gff_ids:
            raise GFFError('ID {fid} not found in GFF!'.format(fid=feat_id))
        #   This is pretty easy, too. We just take the query ID, find its
        #   parents, and then find all children of those parents. These are the
        #   "siblings"
        parents = self.get_parents(feat_id, feat_type=feat_type)
        sibs = []
        for p in parents:
            sibs += self.get_children(p.ID, feat_type=feat_type)
        return(sibs)

    def chrom_features(self, chrom, left=None, right=None, feat_type=None):
        targeted_features = []
        filtered_features = []
        for f in self.gff_data:
            if f.seqid == chrom:
                if feat_type:
                    if f.type == feat_type:
                        targeted_features.append(f)
                else:
                    targeted_features.append(f)
        #   Now we go through the list of targeted features and filter based
        #   on position
        if left or right:
            #   if left is not specified, set it to 0
            if not left:
                left = 0
            #   Same with right, except make it huge
            if not right:
                right = 1e99
            for t in targeted_features:
                if t.end >= left or t.start <= right:
                    filtered_features.append(t)
        else:
            filtered_features = targeted_features
        return(filtered_features)