Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion terraform-gpu-devservers/lambda/reservation_expiry/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1576,9 +1576,22 @@ def cleanup_pod(pod_name: str, namespace: str = "gpu-dev", reservation_data: dic
logger.warning(f"Error updating DynamoDB for snapshot completion: {update_error}")
# Don't fail cleanup if DynamoDB update fails

# Step 4: Delete the EBS volume after snapshot completes
# Step 4: Detach and delete the EBS volume after snapshot completes
try:
logger.info(f"Deleting EBS volume {volume_id} after successful snapshot")
# Detach first if still attached (prevents VolumeInUse errors)
try:
vol_desc = ec2_client.describe_volumes(VolumeIds=[volume_id])
attachments = vol_desc['Volumes'][0].get('Attachments', [])
if attachments:
logger.info(f"Volume {volume_id} still attached to {attachments[0].get('InstanceId')} - detaching first")
ec2_client.detach_volume(VolumeId=volume_id, Force=True)
waiter = ec2_client.get_waiter('volume_available')
waiter.wait(VolumeIds=[volume_id], WaiterConfig={'Delay': 5, 'MaxAttempts': 24})
logger.info(f"Volume {volume_id} detached successfully")
except Exception as detach_error:
logger.warning(f"Error detaching volume {volume_id}: {detach_error}")

ec2_client.delete_volume(VolumeId=volume_id)
logger.info(f"Successfully deleted volume {volume_id}")

Expand Down
16 changes: 8 additions & 8 deletions terraform-gpu-devservers/lambda/reservation_processor/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2504,23 +2504,23 @@ def progress_callback(progress_message):
logger.warning(f"Stored warning for reservation {reservation_id}: {disk_warning}")
except Exception as disk_error:
logger.error(f"Failed to set up persistent disk: {disk_error}")

# Check if this is a "disk in use" error - these should fail the reservation
error_msg = str(disk_error)
if "is currently in use" in error_msg or "already in use" in error_msg:
# Don't fall back - fail the reservation with clear error

# If user explicitly requested a disk (disk_name set), never silently fall back
# Also fail for "disk in use" errors regardless
if disk_name or "is currently in use" in error_msg or "already in use" in error_msg:
update_reservation_status(
reservation_id,
"failed",
failure_reason=error_msg
failure_reason=f"Persistent disk setup failed: {error_msg}"
)
raise RuntimeError(f"Cannot create reservation: {error_msg}")

# For other errors, continue without persistent disk (backwards compatibility)
# Only fall back to non-persistent for old-style reservations without explicit disk_name
logger.warning(f"Falling back to non-persistent storage due to disk error: {disk_error}")
use_persistent_disk = False
persistent_volume_id = None # Clear any volume that was set before the error
is_new_disk = True # EmptyDir volume will need shell environment setup
persistent_volume_id = None
is_new_disk = True
update_reservation_status(
reservation_id,
"preparing",
Expand Down