diff --git a/terraform-gpu-devservers/lambda/reservation_expiry/index.py b/terraform-gpu-devservers/lambda/reservation_expiry/index.py index a2a04f1d..1fac5504 100644 --- a/terraform-gpu-devservers/lambda/reservation_expiry/index.py +++ b/terraform-gpu-devservers/lambda/reservation_expiry/index.py @@ -1576,9 +1576,22 @@ def cleanup_pod(pod_name: str, namespace: str = "gpu-dev", reservation_data: dic logger.warning(f"Error updating DynamoDB for snapshot completion: {update_error}") # Don't fail cleanup if DynamoDB update fails - # Step 4: Delete the EBS volume after snapshot completes + # Step 4: Detach and delete the EBS volume after snapshot completes try: logger.info(f"Deleting EBS volume {volume_id} after successful snapshot") + # Detach first if still attached (prevents VolumeInUse errors) + try: + vol_desc = ec2_client.describe_volumes(VolumeIds=[volume_id]) + attachments = vol_desc['Volumes'][0].get('Attachments', []) + if attachments: + logger.info(f"Volume {volume_id} still attached to {attachments[0].get('InstanceId')} - detaching first") + ec2_client.detach_volume(VolumeId=volume_id, Force=True) + waiter = ec2_client.get_waiter('volume_available') + waiter.wait(VolumeIds=[volume_id], WaiterConfig={'Delay': 5, 'MaxAttempts': 24}) + logger.info(f"Volume {volume_id} detached successfully") + except Exception as detach_error: + logger.warning(f"Error detaching volume {volume_id}: {detach_error}") + ec2_client.delete_volume(VolumeId=volume_id) logger.info(f"Successfully deleted volume {volume_id}") diff --git a/terraform-gpu-devservers/lambda/reservation_processor/index.py b/terraform-gpu-devservers/lambda/reservation_processor/index.py index d2b0bb17..a376ad85 100644 --- a/terraform-gpu-devservers/lambda/reservation_processor/index.py +++ b/terraform-gpu-devservers/lambda/reservation_processor/index.py @@ -2504,23 +2504,23 @@ def progress_callback(progress_message): logger.warning(f"Stored warning for reservation {reservation_id}: {disk_warning}") except Exception as disk_error: logger.error(f"Failed to set up persistent disk: {disk_error}") - - # Check if this is a "disk in use" error - these should fail the reservation error_msg = str(disk_error) - if "is currently in use" in error_msg or "already in use" in error_msg: - # Don't fall back - fail the reservation with clear error + + # If user explicitly requested a disk (disk_name set), never silently fall back + # Also fail for "disk in use" errors regardless + if disk_name or "is currently in use" in error_msg or "already in use" in error_msg: update_reservation_status( reservation_id, "failed", - failure_reason=error_msg + failure_reason=f"Persistent disk setup failed: {error_msg}" ) raise RuntimeError(f"Cannot create reservation: {error_msg}") - # For other errors, continue without persistent disk (backwards compatibility) + # Only fall back to non-persistent for old-style reservations without explicit disk_name logger.warning(f"Falling back to non-persistent storage due to disk error: {disk_error}") use_persistent_disk = False - persistent_volume_id = None # Clear any volume that was set before the error - is_new_disk = True # EmptyDir volume will need shell environment setup + persistent_volume_id = None + is_new_disk = True update_reservation_status( reservation_id, "preparing",