From 84db079be5016a00338559ce1e91ef5febcceac4 Mon Sep 17 00:00:00 2001
From: Wouter Devriendt <wouterdevriendt@meta.com>
Date: Tue, 3 Mar 2026 22:58:46 -0800
Subject: [PATCH] fix: persistent disk cleanup and fallback behavior (UNTESTED)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

⚠️ WARNING: These changes are UNTESTED and need validation before merging.

Fix 1: Force-detach EBS volumes before deletion (reservation_expiry)
- Problem: Volume deletion fails with "VolumeInUse" if still attached
- Solution: Detach volume, wait for "available" state, then delete
- Impact: Prevents orphaned volumes when cleanup fails

Fix 2: Fail reservation when user explicitly requests persistent disk (reservation_processor)
- Problem: If user requested specific disk_name and it fails, silently fell back to EmptyDir
- Solution: Fail reservation if disk_name is set OR if error is "disk in use"
- Impact: Clear error messages instead of silent EmptyDir fallback

Changes:
- reservation_expiry/index.py: Add detach logic before delete (15 lines)
- reservation_processor/index.py: Check disk_name before fallback (8 lines)

Test plan needed:
- [ ] Test volume cleanup after expiry with snapshot
- [ ] Test explicit disk_name request with unavailable disk
- [ ] Test backward compatibility for old-style reservations without disk_name
- [ ] Verify no orphaned volumes remain after cleanup failures
---
 .../lambda/reservation_expiry/index.py           | 15 ++++++++++++++-
 .../lambda/reservation_processor/index.py        | 16 ++++++++--------
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/terraform-gpu-devservers/lambda/reservation_expiry/index.py b/terraform-gpu-devservers/lambda/reservation_expiry/index.py
index a2a04f1d..1fac5504 100644
--- a/terraform-gpu-devservers/lambda/reservation_expiry/index.py
+++ b/terraform-gpu-devservers/lambda/reservation_expiry/index.py
@@ -1576,9 +1576,22 @@ def cleanup_pod(pod_name: str, namespace: str = "gpu-dev", reservation_data: dic
                                 logger.warning(f"Error updating DynamoDB for snapshot completion: {update_error}")
                                 # Don't fail cleanup if DynamoDB update fails
 
-                        # Step 4: Delete the EBS volume after snapshot completes
+                        # Step 4: Detach and delete the EBS volume after snapshot completes
                         try:
                             logger.info(f"Deleting EBS volume {volume_id} after successful snapshot")
+                            # Detach first if still attached (prevents VolumeInUse errors)
+                            try:
+                                vol_desc = ec2_client.describe_volumes(VolumeIds=[volume_id])
+                                attachments = vol_desc['Volumes'][0].get('Attachments', [])
+                                if attachments:
+                                    logger.info(f"Volume {volume_id} still attached to {attachments[0].get('InstanceId')} - detaching first")
+                                    ec2_client.detach_volume(VolumeId=volume_id, Force=True)
+                                    waiter = ec2_client.get_waiter('volume_available')
+                                    waiter.wait(VolumeIds=[volume_id], WaiterConfig={'Delay': 5, 'MaxAttempts': 24})
+                                    logger.info(f"Volume {volume_id} detached successfully")
+                            except Exception as detach_error:
+                                logger.warning(f"Error detaching volume {volume_id}: {detach_error}")
+
                             ec2_client.delete_volume(VolumeId=volume_id)
                             logger.info(f"Successfully deleted volume {volume_id}")
 
diff --git a/terraform-gpu-devservers/lambda/reservation_processor/index.py b/terraform-gpu-devservers/lambda/reservation_processor/index.py
index d2b0bb17..a376ad85 100644
--- a/terraform-gpu-devservers/lambda/reservation_processor/index.py
+++ b/terraform-gpu-devservers/lambda/reservation_processor/index.py
@@ -2504,23 +2504,23 @@ def progress_callback(progress_message):
                     logger.warning(f"Stored warning for reservation {reservation_id}: {disk_warning}")
             except Exception as disk_error:
                 logger.error(f"Failed to set up persistent disk: {disk_error}")
-
-                # Check if this is a "disk in use" error - these should fail the reservation
                 error_msg = str(disk_error)
-                if "is currently in use" in error_msg or "already in use" in error_msg:
-                    # Don't fall back - fail the reservation with clear error
+
+                # If user explicitly requested a disk (disk_name set), never silently fall back
+                # Also fail for "disk in use" errors regardless
+                if disk_name or "is currently in use" in error_msg or "already in use" in error_msg:
                     update_reservation_status(
                         reservation_id,
                         "failed",
-                        failure_reason=error_msg
+                        failure_reason=f"Persistent disk setup failed: {error_msg}"
                     )
                     raise RuntimeError(f"Cannot create reservation: {error_msg}")
 
-                # For other errors, continue without persistent disk (backwards compatibility)
+                # Only fall back to non-persistent for old-style reservations without explicit disk_name
                 logger.warning(f"Falling back to non-persistent storage due to disk error: {disk_error}")
                 use_persistent_disk = False
-                persistent_volume_id = None  # Clear any volume that was set before the error
-                is_new_disk = True  # EmptyDir volume will need shell environment setup
+                persistent_volume_id = None
+                is_new_disk = True
                 update_reservation_status(
                     reservation_id,
                     "preparing",