-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathreader.py
More file actions
38 lines (26 loc) · 891 Bytes
/
reader.py
File metadata and controls
38 lines (26 loc) · 891 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import csv
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import numpy
# English stop words from the NLTK corpus, each encoded to UTF-8 so they can be
# compared against the tokens produced in extract(). Built once at import time.
# NOTE(review): encoding to bytes only matches tokens that are byte strings
# (Python 2 `str`); confirm the intended interpreter version.
stop = [word.encode('utf-8') for word in stopwords.words('english')]
# Shared tokenizer built from the pattern \w+ (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')
def extract(file_name):
    """
    Read a two-column CSV file and return its texts and class labels.

    Each non-empty row must hold exactly two fields: the text and its
    target class. The text is split with the module-level ``tokenizer``,
    tokens present in the module-level ``stop`` list are dropped, and the
    remaining tokens are re-joined with single spaces and decoded as
    cp1252 (undecodable bytes are ignored). Blank lines are skipped.

    :param file_name: path of the comma-delimited file to read.
    :returns: tuple ``(texts, classes)`` of two numpy arrays, in row order.
    :raises ValueError: if a non-empty row does not have exactly two fields.
    """
    # Build the lookup set once so every membership test below is O(1)
    # instead of a linear scan over the stop-word list.
    stop_words = set(stop)
    texts = []
    classes = []
    # The context manager closes the file even if a row fails to parse.
    with open(file_name, "r") as f:
        for row in csv.reader(f, delimiter=','):
            if not row:
                # csv.reader yields [] for blank lines; nothing to extract.
                continue
            value, target = row
            kept = [tok for tok in tokenizer.tokenize(value)
                    if tok not in stop_words]
            # NOTE(review): str.decode exists only on Python 2 byte strings;
            # this looks like Python 2 code -- confirm before running it
            # under Python 3.
            texts.append(" ".join(kept).decode('cp1252', 'ignore'))
            classes.append(target)
    return numpy.asarray(texts), numpy.asarray(classes)