-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcsv_dataset.py
More file actions
43 lines (33 loc) · 1.34 KB
/
csv_dataset.py
File metadata and controls
43 lines (33 loc) · 1.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import pandas as pd
from torch.utils.data import Dataset
class TextClassificationDataset(Dataset):
    """In-memory PyTorch dataset of ``(text, label)`` pairs read from a CSV.

    The CSV is loaded eagerly when the dataset is constructed. Rows whose
    text cell is not a string (e.g. empty cells, which pandas typically
    reads as NaN) are skipped, and every ``###`` marker in the text is
    replaced by ``[SEP]``.
    """

    def __init__(self, csv_file, text_field, class_field, remove_stop_words=False):
        """Load the CSV and build the list of samples.

        Args:
            csv_file (str): Path to the CSV file (passed straight to
                ``pandas.read_csv``).
            text_field (str): Name of the field containing the text data.
            class_field (str): Name of the field containing the class labels.
            remove_stop_words (bool): When true, drop English stop words
                (NLTK's list, matched case-insensitively) from the text
                before storing it. Requires ``nltk`` and its stopwords
                corpus to be installed.
        """
        super().__init__()
        self.data = []

        stop_words = None
        if remove_stop_words:
            # Imported lazily so nltk is only needed when filtering is on.
            from nltk.corpus import stopwords

            # A set gives O(1) membership tests; the list returned by the
            # corpus reader would be scanned linearly for every word.
            stop_words = frozenset(stopwords.words('english'))

        df = pd.read_csv(csv_file)
        for _, row in df.iterrows():
            text = row[text_field]
            # Skip rows without usable text (missing cells are not str).
            if not isinstance(text, str):
                continue
            class_label = row[class_field]
            if stop_words is not None:
                # Split on single spaces (not arbitrary whitespace) so the
                # original spacing is preserved when the words are rejoined.
                text = ' '.join(
                    w for w in text.split(" ") if w.lower() not in stop_words
                )
            self.data.append((text.replace("###", "[SEP]"), class_label))

    def __len__(self):
        """Returns the number of samples in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Returns the ``(text, class_label)`` sample at the given index."""
        # The sample is returned as stored: no tokenization, padding or
        # tensor conversion happens here; callers do that downstream.
        text, class_label = self.data[index]
        return text, class_label