-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathpure_python_basic_text.py
More file actions
24 lines (21 loc) · 1.06 KB
/
pure_python_basic_text.py
File metadata and controls
24 lines (21 loc) · 1.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import regex
def sep_punct(string, drop_punct):
    """Isolate or blank out punctuation in *string* and normalise apostrophes.

    DIY tokenization helper (the original author's note asks whether nltk
    should be used instead).

    Args:
        string: Text to process.
        drop_punct: When falsy, every matched punctuation mark is kept and
            wrapped in one space on each side. When truthy, every matched
            mark is replaced by a single space.

    Returns:
        The processed text, with each right single quotation mark (’)
        replaced by a plain apostrophe (').
    """
    # Marks handled: " “ ( ) ” … : ; , * . ? ! /
    punct_pattern = r"(\"|“|\(|\)|”|…|:|;|,|\*|\.|\?|!|/)"
    if drop_punct:
        replacement = " "
    else:
        replacement = r" \g<1> "
    spaced = regex.sub(punct_pattern, replacement, string)
    # The apostrophe swap happens after the punctuation pass in both modes.
    return spaced.replace("’", "'")
def readin(filename, encoding="utf-8"):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            instead of the platform locale encoding so that non-ASCII text
            (curly quotes, ellipses, ...) decodes the same way on every
            platform. Pass another codec name for files in a different
            encoding.

    Returns:
        A list with one string per line of the file, each passed through
        ``str.strip()``; blank lines come back as empty strings.
    """
    with open(filename, 'r', encoding=encoding) as f_in:
        return [line.strip() for line in f_in]
def pad(*lists_of_strings):
    """Right-pad strings with spaces so items sharing an index have equal length.

    For every index, the target width is the longest string found at that
    index across all of the given lists.

    All lists must be the same length; a length mismatch surfaces as an
    IndexError.

    Args:
        *lists_of_strings: Any number of equal-length lists of strings.

    Returns:
        A list holding one new list per input list, in the same order, whose
        strings have trailing spaces appended up to the width for their
        index. The input lists are not modified.
    """
    # Widths are measured over the indices of the first list only.
    widths = []
    if lists_of_strings:
        for col in range(len(lists_of_strings[0])):
            widths.append(max(len(row[col]) for row in lists_of_strings))

    padded = []
    for row in lists_of_strings:
        padded.append(
            [cell + " " * (widths[col] - len(cell)) for col, cell in enumerate(row)]
        )
    return padded