From 84db079be5016a00338559ce1e91ef5febcceac4 Mon Sep 17 00:00:00 2001 From: Wouter Devriendt Date: Tue, 3 Mar 2026 22:58:46 -0800 Subject: [PATCH] fix: persistent disk cleanup and fallback behavior (UNTESTED) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ⚠️ WARNING: These changes are UNTESTED and need validation before merging. Fix 1: Force-detach EBS volumes before deletion (reservation_expiry) - Problem: Volume deletion fails with "VolumeInUse" if still attached - Solution: Detach volume, wait for "available" state, then delete - Impact: Prevents orphaned volumes when cleanup fails Fix 2: Fail reservation when user explicitly requests persistent disk (reservation_processor) - Problem: If user requested specific disk_name and it fails, silently fell back to EmptyDir - Solution: Fail reservation if disk_name is set OR if error is "disk in use" - Impact: Clear error messages instead of silent EmptyDir fallback Changes: - reservation_expiry/index.py: Add detach logic before delete (15 lines) - reservation_processor/index.py: Check disk_name before fallback (8 lines) Test plan needed: - [ ] Test volume cleanup after expiry with snapshot - [ ] Test explicit disk_name request with unavailable disk - [ ] Test backward compatibility for old-style reservations without disk_name - [ ] Verify no orphaned volumes remain after cleanup failures --- .../lambda/reservation_expiry/index.py | 15 ++++++++++++++- .../lambda/reservation_processor/index.py | 16 ++++++++-------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/terraform-gpu-devservers/lambda/reservation_expiry/index.py b/terraform-gpu-devservers/lambda/reservation_expiry/index.py index a2a04f1d..1fac5504 100644 --- a/terraform-gpu-devservers/lambda/reservation_expiry/index.py +++ b/terraform-gpu-devservers/lambda/reservation_expiry/index.py @@ -1576,9 +1576,22 @@ def cleanup_pod(pod_name: str, namespace: str = "gpu-dev", reservation_data: dic logger.warning(f"Error updating DynamoDB for snapshot completion: {update_error}") # Don't fail cleanup if DynamoDB update fails - # Step 4: Delete the EBS volume after snapshot completes + # Step 4: Detach and delete the EBS volume after snapshot completes try: logger.info(f"Deleting EBS volume {volume_id} after successful snapshot") + # Detach first if still attached (prevents VolumeInUse errors) + try: + vol_desc = ec2_client.describe_volumes(VolumeIds=[volume_id]) + attachments = vol_desc['Volumes'][0].get('Attachments', []) + if attachments: + logger.info(f"Volume {volume_id} still attached to {attachments[0].get('InstanceId')} - detaching first") + ec2_client.detach_volume(VolumeId=volume_id, Force=True) + waiter = ec2_client.get_waiter('volume_available') + waiter.wait(VolumeIds=[volume_id], WaiterConfig={'Delay': 5, 'MaxAttempts': 24}) + logger.info(f"Volume {volume_id} detached successfully") + except Exception as detach_error: + logger.warning(f"Error detaching volume {volume_id}: {detach_error}") + ec2_client.delete_volume(VolumeId=volume_id) logger.info(f"Successfully deleted volume {volume_id}") diff --git a/terraform-gpu-devservers/lambda/reservation_processor/index.py b/terraform-gpu-devservers/lambda/reservation_processor/index.py index d2b0bb17..a376ad85 100644 --- a/terraform-gpu-devservers/lambda/reservation_processor/index.py +++ b/terraform-gpu-devservers/lambda/reservation_processor/index.py @@ -2504,23 +2504,23 @@ def progress_callback(progress_message): logger.warning(f"Stored warning for reservation {reservation_id}: {disk_warning}") except Exception as disk_error: logger.error(f"Failed to set up persistent disk: {disk_error}") - - # Check if this is a "disk in use" error - these should fail the reservation error_msg = str(disk_error) - if "is currently in use" in error_msg or "already in use" in error_msg: - # Don't fall back - fail the reservation with clear error + + # If user explicitly requested a disk (disk_name set), never silently fall back + # Also fail for "disk in use" errors regardless + if disk_name or "is currently in use" in error_msg or "already in use" in error_msg: update_reservation_status( reservation_id, "failed", - failure_reason=error_msg + failure_reason=f"Persistent disk setup failed: {error_msg}" ) raise RuntimeError(f"Cannot create reservation: {error_msg}") - # For other errors, continue without persistent disk (backwards compatibility) + # Only fall back to non-persistent for old-style reservations without explicit disk_name logger.warning(f"Falling back to non-persistent storage due to disk error: {disk_error}") use_persistent_disk = False - persistent_volume_id = None # Clear any volume that was set before the error - is_new_disk = True # EmptyDir volume will need shell environment setup + persistent_volume_id = None + is_new_disk = True update_reservation_status( reservation_id, "preparing",