"""
This script reads in both conllu and conll2009 datasets
- Usages:
python conll_format_utils.py --source_filename <filename> --data_format <either conllu or conll09>
- Variables:
Source_filename: str; input filename
data_format: str; choose one from conll09, conllu
"""
from collections import OrderedDict
import argparse
import re
def get_conllu_column_names():
    # These are the column names based on UD (CoNLL-U) data; the last two
    # columns, FLAG and VERB, carry the SRL predicate flag and predicate sense
columns = ["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "FLAG", "VERB"]
# ID: Word index, integer starting at 1 for each new sentence; may be a range for multiword tokens; may be a decimal number for empty nodes (decimal numbers can be lower than 1 but must be greater than 0).
# FORM: Word form or punctuation symbol.
# LEMMA: Lemma or stem of word form.
# UPOS: Universal part-of-speech tag.
# XPOS: Language-specific part-of-speech tag; underscore if not available.
# FEATS: List of morphological features from the universal feature inventory or from a defined language-specific extension; underscore if not available.
# HEAD: Head of the current word, which is either a value of ID or zero (0).
# DEPREL: Universal dependency relation to the HEAD (root iff HEAD = 0) or a defined language-specific subtype of one.
# 4 doivent devoir VERB VERB _ 0 root _ _
# 14 ait avoir VERB VERB _ 10 ccomp _ _
return columns
def get_conll09_column_names():
    # These are the column names from the CoNLL-2009 PropBank data.
    # POS is renamed to UPOS for consistency across the different formats.
columns = ["ID", "FORM", "LEMMA","PLEMMA" ,"UPOS", "PUPOS", "FEATS","PFEATS", "HEAD", "PHEAD","DEPREL", "PDEPREL","FLAG", "VERB"]
return columns
class Meta_info():
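    """Parse sentence-level metadata (comment lines starting with "#")
    into a sentence id and the raw sentence text."""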
def __init__(self, meta):
self.meta = meta
self.sen_id = ""
self.sen_txt = ""
self.process_meta()
    def process_meta(self):
        for meta_line in self.meta:
            if re.match(r'^# [0-9]+ #', meta_line):
                # Lines like "# 12 # <sentence text>"
                self.sen_id = int(meta_line.split("#")[1])
                self.sen_txt = "#".join(meta_line.split("#")[2:]).strip()
            elif "sentence-text:" in meta_line:
                # Lines like "# sentence-text: <sentence text>"
                self.sen_txt = "sentence-text:".join(meta_line.split("sentence-text:")[1:]).strip()
            else:
                # Fallback: treat everything after the first "#" as the text
                self.sen_txt = "#".join(meta_line.split("#")[1:]).strip()
class Token():
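    """One token row; column values are looked up by name so that both the
    CoNLL-U and CoNLL-2009 layouts can be read."""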
def __init__(self, tok_line, col_names):
self.id = tok_line[col_names.index("ID")]
self.form = tok_line[col_names.index("FORM")]
self.lemma = tok_line[col_names.index("LEMMA")]
self.upos = tok_line[col_names.index("UPOS")]
self.xpos = tok_line[col_names.index("XPOS")]
self.feat = tok_line[col_names.index("FEATS")]
self.head = tok_line[col_names.index("HEAD")]
self.deprel = tok_line[col_names.index("DEPREL")]
        self.ispred = self.is_pred(tok_line[col_names.index("FLAG")])
        self.sense = self.get_sense(tok_line[col_names.index("VERB")])
    def is_pred(self, tok):
        # A token is a predicate when its FLAG column is not "_"
        return tok != "_"
    def get_sense(self, tok):
        # Renamed from "sense" so the method is not shadowed by the instance
        # attribute of the same name assigned in __init__
        if not self.is_pred(tok):
            return "_"
        return tok.split(".")[-1]
def __str__(self):
out = "id: {}, form: {}, lemma: {}, upos: {}, xpos: {}, feat: {}, head: {}, deprel: {}, pred: {}".format(
self.id,
self.form,
self.lemma,
self.upos,
self.xpos,
self.feat,
self.head,
self.deprel,
self.ispred
)
return out
class SenChunk():
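    """One sentence chunk: token lines plus metadata, with predicates,
    argument columns, dependency relations, and Token objects extracted."""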
    def __init__(self, sen, col_names=None):
        self.sen, self.meta = self.get_sen_meta(sen)
        self.if_no_predicate = 0
        self.n_predicate = 0
        self.n_columns = len(self.sen[0])  # count columns after meta lines are removed
        if not col_names:
            # Default to the CoNLL-U column layout
            self.col_names = get_conllu_column_names()
        else:
            self.col_names = col_names
        self.predicates = self.get_predicate()
        self.arguments = self.get_arguments()
        self.txt = self.get_text()
        self.n_token = len(self.txt.split(" "))
        self.deprel = self.get_deprel()
        self.tokens = self.get_tokens()
def get_sen_meta(self, sen):
"""
Seperate out the sentence from meta information
:param sen: List of tab seperated text
:return:
"""
sent = []
meta = []
for tok_line in sen:
if tok_line[0].split(" ")[0] == "#":
meta.append(" ".join(tok_line))
else:
sent.append(tok_line)
meta_out = Meta_info(meta)
return sent, meta_out
    def get_predicate(self):
        """
        :return: list of predicates in the sentence; each predicate is a tuple
                 (verb_id, verb, verb.sense, pos)
        """
        if len(self.sen[0]) <= len(self.col_names):
            # No argument columns beyond the named ones, hence no predicates
            self.if_no_predicate += 1
            return []
        predicates = []
        for tok_line in self.sen:
            if tok_line[self.col_names.index("FLAG")] == "Y":
                predicates.append((tok_line[0],
                                   tok_line[self.col_names.index("VERB")].split(".")[0],
                                   tok_line[self.col_names.index("VERB")],
                                   tok_line[self.col_names.index("UPOS")]))
                self.n_predicate += 1
        return predicates
    def get_arguments(self):
        """
        :return: one list of argument labels per predicate, in predicate order
                 (the columns after the named ones, transposed)
        """
        arg_columns = list(map(list, zip(*self.sen)))[len(self.col_names):]
        return arg_columns
    def get_text(self):
        # Compare the concatenated FORM column with the metadata text (spaces
        # removed); if they disagree, rebuild the text from the tokens instead
        # of trusting the metadata line (handles unknown CoNLL-U text formats)
        tokenized_txt = "".join(list(zip(*self.sen))[1])
        meta_txt = "".join(self.meta.sen_txt.split(" "))
        if tokenized_txt != meta_txt:
            return " ".join(list(zip(*self.sen))[1])
        else:
            return self.meta.sen_txt
def get_deprel(self):
deprel = []
for tok_line in self.sen:
deprel.append(tok_line[self.col_names.index("DEPREL")])
return deprel
    def get_tokens(self):
        tokens = []
        for tok_line in self.sen:
            tokens.append(Token(tok_line, self.col_names))
        return tokens
class ReadData():
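    """Read a whole CoNLL file into a list of SenChunk objects and collect
    corpus-level statistics (sentence, predicate, and POS counts)."""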
def __init__(self, filename):
self.filename = filename
self.sen_with_no_predicates = 0
self.col_names = self.get_column_names()
self.all_sen = self.get_sen()
self.n_sen = len(self.all_sen)
self.predicate_count = self.count_predicates()
    def read_file(self):
        with open(self.filename) as f:
            data = f.readlines()
        check_new_lines = conllu_format_check(data)
        if check_new_lines == -1:
            print("Fix the number of blank lines at the end of the file.")
            return None
        else:
            print("Checked the number of blank lines at the end of the document")
            return data
    def get_sen(self):
        all_sen = []
        sen = []
        tok_lines = self.read_file()
        for tok_line in tok_lines:
            if tok_line == "\n" or tok_line == "---\n":
                if sen != []:
                    chunk = SenChunk(sen, self.col_names)
                    all_sen.append(chunk)
                    self.sen_with_no_predicates += chunk.if_no_predicate
                    sen = []
            else:
                # Ignore multiword token ranges (e.g. "1-2", "10-12") in the
                # label transfer code; see https://github.ibm.com/SystemT-Research/SRL-Test/issues/81
                tok_index = tok_line.strip().split("\t")[0]
                if re.match(r'^\d+-\d+', tok_index):
                    continue
                sen.append(tok_line.strip().split("\t"))
        if sen != []:
            chunk = SenChunk(sen, self.col_names)
            all_sen.append(chunk)
            self.sen_with_no_predicates += chunk.if_no_predicate
        return all_sen
def count_predicates(self):
count = 0
for sen in self.all_sen:
count += len(sen.predicates)
return count
    def count_pos(self, pos="UPOS"):
        """Count tokens per POS tag (UPOS or XPOS) over the whole file."""
        stat = OrderedDict()
        for sen in self.all_sen:
            for tok in sen.tokens:
                if pos == "UPOS":
                    predpos = tok.upos
                elif pos == "XPOS":
                    predpos = tok.xpos
                else:
                    raise ValueError("POS can either be UPOS or XPOS")
                if predpos in stat:
                    stat[predpos] += 1
                else:
                    stat[predpos] = 1
        return stat
    def get_predicate_stat(self, pos="UPOS"):
        """Count predicates per POS tag (UPOS or XPOS) of the predicate token."""
        stat = OrderedDict()
        for sen in self.all_sen:
            if sen.n_predicate > 0:
                for pred in sen.predicates:
                    if pos == "UPOS":
                        predpos = sen.tokens[int(pred[0]) - 1].upos
                    elif pos == "XPOS":
                        predpos = sen.tokens[int(pred[0]) - 1].xpos
                    else:
                        raise ValueError("POS can either be UPOS or XPOS")
                    if predpos in stat:
                        stat[predpos] += 1
                    else:
                        stat[predpos] = 1
        return stat
def data_arg_count(self):
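        """Count argument labels per predicate sense across the file,
        e.g. {"give.01": {"A0": 3, "A1": 5}} (illustrative shape only)."""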
argCont = {}
for sen in self.all_sen:
for pred_id, pred in enumerate(sen.predicates):
if pred[2] not in argCont:
argCont[pred[2]] = {}
for arg in sen.arguments[pred_id]:
if arg != "_":
if arg in argCont[pred[2]]:
argCont[pred[2]][arg] += 1
else:
argCont[pred[2]][arg] = 1
return argCont
def data_verbsense_count(self):
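        """Count sense occurrences per verb lemma,
        e.g. {"give": {"give.01": 7, "give.02": 2}} (illustrative shape only)."""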
verbCont = {}
for sen in self.all_sen:
for pred_id, pred in enumerate(sen.predicates):
if pred[1] not in verbCont:
verbCont[pred[1]] = {}
if pred[2] in verbCont[pred[1]]:
verbCont[pred[1]][pred[2]] += 1
else:
verbCont[pred[1]][pred[2]] = 1
return verbCont
    def get_column_names(self):
        # Overridden in Reader; the base-class default means "no named columns",
        # which makes SenChunk fall back to the CoNLL-U layout
        return []
def conllu_format_check(data):
    """Check that the file ends with exactly one blank line, per CoNLL-U convention."""
    end_line = -1
    count_new_lines = 0
    while data[end_line] == "\n":
        count_new_lines += 1
        end_line = end_line - 1
    if count_new_lines != 1:
        print("Expecting exactly one blank line at the end of the file.")
        print("There are {} blank lines".format(count_new_lines))
        return -1
    else:
        return 0
class Reader(ReadData):
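    """ReadData specialised to a concrete data format ("conllu" or "conll09")."""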
def __init__(self, input_file, data_format):
self.data_format = data_format
super(Reader, self).__init__(input_file)
    def get_column_names(self):
        if self.data_format == "conllu":
            return get_conllu_column_names()
        elif self.data_format == "conll09":
            # Rename PUPOS to XPOS so Token can read both formats uniformly
            col_names = get_conll09_column_names()
            ind = col_names.index("PUPOS")
            col_names[ind] = "XPOS"
            return col_names
        else:
            raise ValueError("data_format must be 'conllu' or 'conll09'")
def pred_pos_prop(srl_file, pos):
    """Return a DataFrame with, per POS tag, the proportion of tokens that are predicates."""
    import pandas as pd
    data = Reader(srl_file, "conllu")
    pred_count = data.get_predicate_stat(pos)
    pos_count = data.count_pos(pos)
    df_pred = pd.DataFrame(list(pred_count.items()), columns=["POS", "pred_count"])
    df_pos = pd.DataFrame(list(pos_count.items()), columns=["POS", "pos_count"])
    df = pd.merge(df_pred, df_pos, on=["POS"], how="inner")
    df["prop"] = 100 * df["pred_count"] / df["pos_count"]
    return df
def read_large_data(filename):
    """
    Get one sentence at a time from a big file
    :param filename: path to the file
    :return: yields one sentence (a list of tab-separated token lines) at a time
    """
    with open(filename) as f:
        sen = []
        for line in f:  # iterate lazily rather than loading the whole file
            if line == "\n" or line == "---\n":
                if sen != []:
                    yield sen
                    sen = []
            else:
                sen.append(line.strip().split("\t"))
        if sen != []:
            yield sen
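# A minimal usage sketch (the file path is hypothetical): iterate sentences
# lazily and wrap each one in a SenChunk:
#     for sen in read_large_data("data/train.conllu"):
#         chunk = SenChunk(sen)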
def write_raw_csv(conllu_file):
    """Write the raw sentence texts of a CoNLL-U file to a sentences.csv next to it."""
    import pandas as pd
    import os
    path_raw_sen = os.path.join(os.path.dirname(conllu_file), "sentences.csv")
    print(path_raw_sen)
    sen_id = []
    text = []
    ind = 0
    for sen in read_large_data(conllu_file):
        sen_id.append(ind)
        sen = SenChunk(sen)
        ind += 1
        text.append(sen.txt)
    df = pd.DataFrame(zip(sen_id, text), columns=["id", "text"])
    df.to_csv(path_raw_sen, index=False)
    return path_raw_sen
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Statistics of various predicate forms')
parser.add_argument('--source_filename', type=str, default='/Users/ishan/git/DataAndEvaluation/data/input_data/srl/en/propbank_09/CoNLL2009-ST-English-train.txt',
help='Input labeled conll file')
    parser.add_argument('--data_format', type=str, default='conll09',
                        help='Specify the data format: conllu or conll09')
args = parser.parse_args()
data = Reader(args.source_filename, args.data_format)
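    # Illustrative only (the original script stops after reading the data):
    # report the basic corpus statistics collected by Reader.
    print("sentences: {}, predicates: {}".format(data.n_sen, data.predicate_count))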