# Source: csv_dataset.py from the sarwart/mlModels repository (main branch).
import pandas as pd
from torch.utils.data import Dataset

class TextClassificationDataset(Dataset):
    """PyTorch dataset of ``(text, class_label)`` pairs loaded from a CSV file.

    The whole CSV is read eagerly in ``__init__`` and kept in memory as a
    list of tuples in ``self.data``. Rows whose text value is not a string
    (for example empty cells, which pandas reads as NaN) are skipped, and
    every ``"###"`` in the text is replaced with ``"[SEP]"``.
    """

    def __init__(self, csv_file, text_field, class_field, remove_stop_words=False):
        """Load and preprocess the CSV.

        Args:
            csv_file (str): Path to the CSV file.
            text_field (str): Name of the column containing the text data.
            class_field (str): Name of the column containing the class labels.
            remove_stop_words (bool): When true, drop every space-separated
                token whose lowercase form is an NLTK English stop word.
                NLTK is imported only in this case.

        Raises:
            KeyError: If ``text_field`` or ``class_field`` is not a column
                of the CSV.
        """
        self.data = []

        stop_words = None
        if remove_stop_words:
            # Imported lazily so NLTK is only required when actually used.
            from nltk.corpus import stopwords
            # A frozenset gives O(1) membership tests; the corpus is a list.
            stop_words = frozenset(stopwords.words('english'))

        df = pd.read_csv(csv_file)

        # Walk the two columns in lockstep instead of df.iterrows(), which
        # builds a Series per row and may coerce values to a common dtype.
        for text, class_label in zip(df[text_field], df[class_field]):
            if not isinstance(text, str):
                continue

            if stop_words is not None:
                text = ' '.join(
                    w for w in text.split(" ") if w.lower() not in stop_words
                )

            self.data.append((text.replace("###", "[SEP]"), class_label))

    def __len__(self):
        """Return the number of samples kept from the CSV."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(text, class_label)`` pair stored at ``index``."""
        # No tokenization or tensor conversion happens here; the stored
        # values are returned as-is.
        return self.data[index]