Simple-Transformer/utils.py at main · Bengal1/Simple-Transformer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
"""
A collection of utility functions and classes to support the training,
evaluation, and preprocessing of a Transformer-based machine
translation model.

This module provides reusable components for common deep learning
tasks, including:
- Learning Rate Schedulers: The `NoamLR` scheduler, an
  implementation of the schedule from the "Attention Is All You Need"
  paper.
- Device Configuration: A function for selecting the optimal
  computation device (CUDA or CPU).
- Logging: Utilities for configuring application-wide logging
  behavior.
- Checkpointing: Functions for saving and loading model weights,
  optimizer, and learning rate scheduler states.
- Visualization: Functions to save training statistics to a CSV
  file and visualize them using Matplotlib.
- Data Preprocessing: A function to download and preprocess the
  IWSLT14 dataset, creating local JSON files for efficient data loading.

Although this module is primarily used as a library by the main training
script, it also holds the data preprocessing entry point: uncomment the
dataset-generation loop at the bottom of the file and run the module
directly to create the local dataset files.
"""
import os
import json
import torch
import random
import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
from typing import Sequence

# --- Public API ---
# Names exported by a star-import of this module. The underscore-prefixed
# plotting helpers and `make_iwslt14_local_file` are not listed here, so they
# must be imported explicitly by name.
__all__ = [
    "NoamLR",
    "early_stopping",
    "get_device",
    "set_seed",
    "save_model",
    "load_checkpoint",
    "save_stats_to_csv",
    "plot_metrics"
]

# --------------- Training Utilities ---------------- #
# --- Learning Rate Schedulers ---
class NoamLR(torch.optim.lr_scheduler._LRScheduler):
    """
    Implements the Noam learning rate schedule from 'Attention Is All You Need'.

    This scheduler increases the learning rate linearly for the first
    `warmup_steps` scheduler steps, and then decreases it proportionally to the
    inverse square root of the step number.

    Learning rate at step t (1-based) is computed as:
        lr = factor * model_size^{-0.5} * min(t^{-0.5}, t * warmup_steps^{-1.5})

    The optimizer's own base learning rates are not used: every parameter
    group receives the same scheduled value.

    Attributes:
        model_size (int): The dimensionality of the model (used for scaling the
            learning rate).
        warmup_steps (int): Number of steps to linearly increase the learning rate.
        factor (float): Constant multiplier applied to the whole schedule.
    """

    def __init__(self,
                 optimizer: torch.optim.Optimizer,
                 model_size: int = 512,
                 warmup_steps: int = 4000,
                 factor: float = 1.0,
                 last_epoch: int = -1):
        """Initializes the NoamLR scheduler.

        Args:
            optimizer (Optimizer): Wrapped optimizer.
            model_size (int, optional): Dimensionality of the model. Default is 512.
            warmup_steps (int, optional): Number of warm-up steps. Default is 4000.
            factor (float, optional): Scales the learning rate. Default is 1.0.
            last_epoch (int, optional): Index of the last completed scheduler
                step. Use -1 (the default) to start a fresh schedule; any other
                value resumes the schedule from that step.

        Raises:
            ValueError: If `model_size`, `warmup_steps` or `factor` is not
                strictly positive.
        """
        if model_size <= 0:
            raise ValueError("model_size must be a positive integer.")
        if warmup_steps <= 0:
            raise ValueError("warmup_steps must be a positive integer.")
        if factor <= 0:
            raise ValueError("factor must be a positive float.")

        # These must be set before the base constructor runs, because it
        # performs an initial step() that already calls get_lr().
        self.model_size = model_size
        self.warmup_steps = warmup_steps
        self.factor = factor
        super().__init__(optimizer, last_epoch)

    def get_lr(self) -> list[float]:
        """Computes the learning rate for the current step based on the Noam schedule.

        Returns:
            list: A list containing the learning rate for each parameter group.
        """
        # Derive the step from `last_epoch` rather than the private
        # `_step_count`: `_step_count` restarts at 1 on every construction, so
        # it would silently ignore a `last_epoch` passed to resume training.
        # With the default start `last_epoch` is 0 here, i.e. Noam step 1.
        step = max(1, self.last_epoch + 1)  # Avoid division by zero
        scale = self.factor * self.model_size ** -0.5
        # Linear warm-up term vs. inverse-square-root decay term; the smaller
        # one is active, and they cross at step == warmup_steps.
        lr = scale * min(step ** -0.5, step * (self.warmup_steps ** -1.5))
        return [lr for _ in self.base_lrs]


def early_stopping(
    metric_record: Sequence[float],
    patience: int = 5,
    delta: float = 1e-5,
    best_is_max: bool = True
) -> bool:
    """
    Determine whether training should stop early based on a metric's recent performance.

    The record is split into the last `patience` entries and everything
    before them. Training should stop when the best value in the recent
    window does not beat the best earlier value by more than `delta`.
    A metric that merely plateaus therefore triggers a stop.

    Args:
        metric_record (Sequence[float]): Sequence of metric values
            (e.g., BLEU score or validation loss), oldest first.
        patience (int, optional): Number of epochs to wait for improvement
            before suggesting early stopping. Must be positive. Default is 5.
        delta (float, optional): Minimum change in the metric to qualify as an
            improvement. Default is 1e-5.
        best_is_max (bool, optional): If True, higher metric values are better
            (e.g., BLEU). If False, lower metric values are better (e.g., loss).
            Default is True.

    Returns:
        bool: True if the metric did not improve sufficiently within the
            last `patience` epochs, indicating that training should stop.
            Always False while the record holds `patience` entries or fewer.

    Raises:
        ValueError: If `patience` is not a positive integer.
    """
    if patience <= 0:
        raise ValueError("patience must be a positive integer")

    if len(metric_record) <= patience:
        return False  # not enough history yet

    earlier = metric_record[:-patience]
    recent = metric_record[-patience:]

    # An improvement must exceed the previous best by more than `delta`;
    # equal or marginally better values count as "no improvement".
    if best_is_max:
        improved = max(recent) > max(earlier) + delta
    else:
        improved = min(recent) < min(earlier) - delta
    return not improved

# -------------- Device Configuration --------------- #
def get_device():
    """
    Pick the computation device and report the choice on stdout.

    CUDA is used whenever `torch.cuda.is_available()` reports a usable GPU;
    otherwise the CPU is selected. A one-line message naming the selected
    device is printed so the user can confirm which hardware is in use.

    Returns:
        torch.device: The selected device, either 'cuda' or 'cpu'.
    """
    # Guard clause: without CUDA there is nothing more to query.
    if not torch.cuda.is_available():
        print("Using CPU\n")
        return torch.device('cpu')

    gpu = torch.device('cuda')
    print(f"Using GPU: {torch.cuda.get_device_name(gpu)}\n")
    return gpu


def set_seed(seed_value: int = 1747219200):
    """
    Seed every random number generator used by the project.

    The same seed is applied to Python's `random` module, NumPy's global
    generator and PyTorch (CPU, plus all CUDA devices when CUDA is
    available), so that runs drawing random numbers from these libraries
    start from the same generator state.

    Args:
        seed_value (int): The integer value to use as the seed. Defaults to 1747219200.
    """
    # Same order as the libraries are listed above: random, NumPy, PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    # CUDA generators only need seeding when a GPU is actually present.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed_value)


# ------------------ Checkpointing ------------------ #
def save_model(
        epoch: int,
        model: torch.nn.Module,
        opt: torch.optim.Optimizer,
        scheduler: torch.optim.lr_scheduler._LRScheduler,
        loss: float,
        filepath: str = "model_checkpoint.pth"):
    """
    Write a training checkpoint to disk.

    The checkpoint is a dictionary with the keys 'epoch', 'model_state_dict',
    'optimizer_state_dict', 'scheduler_state_dict' and 'loss'. Failures while
    creating the directory or writing the file are logged, not raised.

    Args:
        epoch (int): Current epoch number
        model (nn.Module): Model to save
        opt (torch.optim.Optimizer): Optimizer state to save
        scheduler (torch.optim.lr_scheduler): Learning rate scheduler state to save
        loss (float): Current loss value
        filepath (str): Path to save the checkpoint
    """
    # The state is collected outside the try block, so an error raised by a
    # state_dict() call propagates to the caller instead of being logged.
    state = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=opt.state_dict(),
        scheduler_state_dict=scheduler.state_dict(),
        loss=loss,
    )
    try:
        # A bare filename has no directory component and needs no mkdir.
        parent_dir = os.path.dirname(filepath)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
            logging.debug(f"Ensured directory exists: {parent_dir}")

        torch.save(state, filepath)
        logging.info(f"Model checkpoint saved successfully at epoch {epoch} "
                     f"to {filepath}")
    except Exception as e:
        # Best effort: a failed save is reported but does not stop the caller.
        logging.error(f"Failed to save model checkpoint at epoch {epoch} "
                      f"to {filepath}: {e}")


def load_checkpoint(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    scheduler: torch.optim.lr_scheduler._LRScheduler,
    checkpoint_path: str = "model_checkpoint.pth",
    device: torch.device = torch.device("cpu")
) -> tuple[int, float | None]:
    """
    Loads model checkpoint, ignoring keys that are not present.
    """
    if not os.path.exists(checkpoint_path):
        logging.warning(f"No checkpoint found at '{checkpoint_path}'. "
                     f"Starting training from epoch 1.")
        return 1, None

    try:
        logging.info(f"Attempting to load checkpoint from: {checkpoint_path}")
        checkpoint = torch.load(checkpoint_path,
                                map_location=device,
                                weights_only=False)

        model.load_state_dict(checkpoint["model_state_dict"], strict=False)
        logging.info("Model state loaded. Positional encoding buffer was skipped "
                     "as intended.")

        if "optimizer_state_dict" in checkpoint:
            optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        else:
            logging.warning("Optimizer state not found in checkpoint.")

        if "scheduler_state_dict" in checkpoint:
            scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
        else:
            logging.warning("Scheduler state not found in checkpoint.")

        start_epoch = checkpoint["epoch"] + 1
        last_loss = checkpoint.get("loss", None)

        logging.info(f"Successfully resumed model from epoch {start_epoch}. "
                     f"Last loss: {last_loss if last_loss is not None else 'N/A'}")

        return start_epoch, last_loss

    except Exception as e:
        logging.error(f"Failed to load checkpoint from '{checkpoint_path}': "
                      f"{e}. Starting from epoch 1.")
        return 1, None


# ------------------- Statistics -------------------- #
def save_stats_to_csv(
        stats_record: dict[str, list[float]],
        file_path: str = None,
        epoch: int = None):
    """
    Saves training statistics to a CSV file.

    This function conditionally saves either the entire history of metrics
    or appends the latest epoch's metrics to an existing file.

    Args:
        stats_record (dict[str, list[float]]): A dictionary where each key is a
            metric name and each value is a list of metric values per epoch.
        file_path (str, optional): Path to the CSV file. If None, defaults to
            'training_stats.csv' in the current working directory.
        epoch (int, optional): If provided, the function appends only the latest
            values for the given epoch to the CSV. If None, it overwrites any
            existing file with the entire history from `stats_record`.

    Raises:
        ValueError: If `stats_record` contains no non-empty lists of metrics.
        OSError: If there's an issue creating the output directory or writing the
                 file.
    """
    target_path = file_path if file_path is not None else "training_stats.csv"

    available_data = {k: v for k, v in stats_record.items() if v}
    if not available_data:
        logging.error("No non-empty stats data provided to save. Aborting save "
                        "operation.")
        raise ValueError("Cannot save stats: No non-empty data found in "
                         "'stats_record'.")

    output_dir = os.path.dirname(target_path)
    if output_dir:  # Checks if output_dir is not an empty string
        try:
            os.makedirs(output_dir, exist_ok=True)
            logging.debug(f"Ensured output directory exists: '{output_dir}'")
        except Exception as e:
            logging.error(f"Failed to create directory '{output_dir}': {e}")
            raise OSError(
                f"Could not create output directory '{output_dir}'.") from e

    try:
        if epoch is None:   # Full save mode
            num_epochs = len(next(iter(available_data.values())))

            # Create a DataFrame for all historical data
            df = pd.DataFrame(
                {'epoch': list(range(1, num_epochs + 1)), **available_data})
            df.to_csv(target_path, index=False)
            logging.info(f"Full training statistics (epochs 1-{num_epochs}) saved "
                         f"to: '{target_path}'")
        else:   # Append mode
            new_data = {
                'epoch': [epoch],
                **{k: [v[-1]] for k, v in available_data.items()}
            }
            # Create a DataFrame for the new data point
            df = pd.DataFrame(new_data)

            write_header = not os.path.exists(target_path)

            df.to_csv(target_path, mode='a', header=write_header, index=False)
            logging.info(f"Epoch {epoch} statistics appended to: '{target_path}'")

    except Exception as e:
        logging.error(f"Failed to save training statistics to '{target_path}': {e}")
        raise OSError(f"Could not write to file '{target_path}'.") from e


# ------------------ Visualization ------------------ #
def _plot_losses(statistics: dict[str, list[float]]):
    """
    Draw training and validation loss curves on one shared figure.

    Args:
        statistics (dict): A dictionary that must provide:
            - 'train' (list): Training loss values per epoch.
            - 'validation' (list): Validation loss values per epoch.

    The x-axis shows epochs numbered from 1 (sized from the 'train' list) and
    the y-axis shows the loss. The figure is displayed with `plt.show()`.

    Raises:
        ValueError: If `statistics` doesn't hold 'train' or 'validation'.
    """
    has_both_keys = "train" in statistics and "validation" in statistics
    if not has_both_keys:
        logging.error("Input dictionary must contain 'train' and 'validation' keys "
                      "for _plot_losses.")
        raise ValueError("Input dictionary must contain 'train' and 'validation' "
                         "keys.")

    # --- Data Extraction ---
    train_curve = statistics['train']
    val_curve = statistics['validation']
    epoch_axis = range(1, len(train_curve) + 1)

    # --- Plotting Configuration ---
    plt.figure(figsize=(8, 5))
    curves = (
        (train_curve, '#1f77b4', 'Train Loss'),
        (val_curve, '#d62728', 'Validation Loss'),
    )
    for values, colour, curve_label in curves:
        plt.plot(epoch_axis, values, linestyle='-', color=colour,
                 label=curve_label, linewidth=2)

    # --- Chart Customization ---
    plt.title("Training & Validation Loss Over Epochs",
              fontsize=18, fontweight='bold')
    plt.xlabel("Epoch", fontsize=12)
    plt.ylabel("Loss", fontsize=12)
    plt.xticks(epoch_axis)  # one tick per epoch keeps the labels integral
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.6)

    # --- Display Plot ---
    plt.show()


def _plot_bleu(bleu_scores: list[float]):
    """
    Draw the BLEU score per epoch as a single green line.

    Args:
        bleu_scores (list): A list of BLEU scores, one per epoch.

    The x-axis shows epochs numbered from 1 and the y-axis shows the BLEU
    score. When `bleu_scores` is empty a warning is logged and nothing is
    drawn. Otherwise the figure is displayed with `plt.show()`.
    """
    if not bleu_scores:
        logging.warning("No BLEU scores provided to plot. Skipping BLEU plot.")
        return

    epoch_axis = list(range(1, len(bleu_scores) + 1))

    # --- Plotting Configuration ---
    plt.figure(figsize=(8, 5))
    plt.plot(epoch_axis, bleu_scores, label='BLEU Score', color='green', linewidth=2)

    # --- Chart Customization ---
    plt.xticks(epoch_axis)
    plt.xlabel("Epoch", fontsize=12)
    plt.ylabel("BLEU Score", fontsize=12)
    plt.title("BLEU Score Over Epochs", fontsize=18, fontweight='bold')
    plt.legend(fontsize=10)
    plt.grid(True, linestyle='--', alpha=0.5)
    # Layout is computed last, once every label is in place.
    plt.tight_layout()

    # --- Display Plot ---
    plt.show()


def plot_metrics(records: dict[str, list[float]]):
    """
    Plot whichever of the recorded metrics are available.

    Args:
        records (dict): A dictionary containing recorded metrics. Expected keys:
            - 'train' (list): Training loss values per epoch.
            - 'validation' (list): Validation loss values per epoch.
            - 'bleu' (list): BLEU scores per epoch.

    Plots are produced conditionally:
    - The loss plot needs both 'train' and 'validation' present and non-empty.
    - The BLEU plot needs 'bleu' present and non-empty.
    """
    # Losses are only meaningful as a pair, so both series are required.
    train_history = records.get('train')
    if train_history and records.get('validation'):
        _plot_losses(records)

    bleu_history = records.get('bleu')
    if bleu_history:
        _plot_bleu(bleu_history)


# ------------------ Data Preprocessing ------------------ #
def make_iwslt14_local_file(split: str,
                            debug: bool = False,
                            debug_size: int = 1000):
    """
    Export one split of the IWSLT14 dataset to a local JSON file.

    The split is loaded from the "ahazeemi/iwslt14-en-fr" dataset and written
    to `data/local_datasets/` as `iwslt14_<split>.json`, or
    `iwslt14_<split>_debug.json` in debug mode. The JSON maps the split name
    to an object holding the "en" and "fr" columns.

    Args:
        split (str): The dataset split to save ("train", "validation", or "test").
        debug (bool): If True, saves only a small subset for debugging.
        debug_size (int): Number of samples to keep in debug mode.

    Raises:
        ValueError: If `split` is not one of ['train', 'validation', 'test'].
    """
    allowed_splits = ("train", "validation", "test")
    if split not in allowed_splits:
        logging.error(f"Invalid dataset split provided: '{split}'. Must be 'train', "
                      f"'validation', or 'test'.")
        raise ValueError(f"Invalid 'split' argument: '{split}'.")

    data = load_dataset("ahazeemi/iwslt14-en-fr")[split]

    if debug:  # Debug mode
        if debug_size <= 0:
            # NOTE(review): the fallback is 100 although the signature default
            # is 1000 - confirm which value is intended.
            logging.warning(
                f"Debug size '{debug_size}' is invalid. Using default of 100.")
            debug_size = 100  # Fallback for invalid debug_size
        logging.info(f"Debug mode enabled: Selecting {debug_size} samples from "
                     f"'{split}' split.")
        # Never request more rows than the split actually holds.
        subset_len = min(debug_size, len(data))
        data = data.select(range(subset_len))
        logging.debug(f"Selected {subset_len} samples for debug mode.")

    logging.debug(
        f"Dataset loaded. Total samples in '{split}' split: {len(data)}")

    # Nest the two language columns under the split name.
    payload = {split: {lang: data[lang] for lang in ("en", "fr")}}

    name_suffix = "_debug" if debug else ""
    filename = f"iwslt14_{split}{name_suffix}.json"

    target_dir = "data/local_datasets"
    os.makedirs(target_dir, exist_ok=True)

    with open(os.path.join(target_dir, filename), "w", encoding="utf-8") as out:
        json.dump(payload, out, ensure_ascii=False, indent=4)

    mode_label = 'debug' if debug else 'full'
    print(f"{split} dataset saved as {filename} ({mode_label})")


"""
To generate the full and debug datasets for the train, validation, and
test splits of IWSLT14 Fr-En, uncomment the code below and run this module.
"""
# for sp in ["train", "validation", "test"]:
#     make_iwslt14_local_file(split=sp, debug=False)  # Full dataset
#     make_iwslt14_local_file(split=sp, debug=True)  # Debug datasets