Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/scope/cloud/fal_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,16 @@ def setup(self):
print(f"GPU check failed: {e}")
raise

# Log CUDA environment so failures in plugin pipelines (e.g. flashvsr)
# that surface as "No CUDA GPUs are available" can be correlated with
# the worker configuration seen at startup time.
cvd = os.environ.get("CUDA_VISIBLE_DEVICES", "<not set>")
nv_vis = os.environ.get("NVIDIA_VISIBLE_DEVICES", "<not set>")
print(
f"CUDA env at startup: CUDA_VISIBLE_DEVICES={cvd!r} "
f"NVIDIA_VISIBLE_DEVICES={nv_vis!r}"
)

# Environment for scope - whitelist only necessary variables (security)
ENV_WHITELIST = [
# Required for process execution
Expand Down
56 changes: 56 additions & 0 deletions src/scope/server/pipeline_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,53 @@ def get_device() -> torch.device:
return torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _assert_cuda_accessible() -> None:
    """Fail fast with a diagnostic ``RuntimeError`` when CUDA cannot be used.

    Three checks run in order:

    1. ``torch.cuda.is_available()`` must be true.
    2. ``torch.cuda.device_count()`` must be non-zero.
    3. A one-element tensor must be allocatable on the ``"cuda"`` device.

    The test allocation makes a CUDA tensor now, so a broken setup is reported
    from this helper instead of from whatever code first touches the GPU
    later. Every error message carries the observed device count and the
    value of ``CUDA_VISIBLE_DEVICES`` (``'<not set>'`` when the variable is
    absent).

    NOTE(review): the stated motivation is fal.ai GPU workers (MIG
    partitions, unexpected ``CUDA_VISIBLE_DEVICES``) where
    ``is_available()`` reportedly passes but allocation then fails with
    "No CUDA GPUs are available". That scenario cannot be confirmed from
    this file -- verify against the PyTorch documentation.

    Raises:
        RuntimeError: if any of the three checks fails. A failure of the
            test allocation is chained to the underlying ``RuntimeError``.
    """
    import os

    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "<not set>")
    cuda_reported = torch.cuda.is_available()
    device_total = torch.cuda.device_count()

    # Guard 1: torch itself reports that CUDA is unavailable.
    if not cuda_reported:
        raise RuntimeError(
            f"No CUDA GPUs are available (device_count={device_total}, "
            f"CUDA_VISIBLE_DEVICES={visible_devices!r}). "
            "Check that the worker has a visible GPU and that "
            "CUDA_VISIBLE_DEVICES is set correctly."
        )

    # Guard 2: CUDA is reported as available, yet no device is enumerated.
    if device_total == 0:
        raise RuntimeError(
            "No CUDA GPUs are available (device_count=0, "
            f"CUDA_VISIBLE_DEVICES={visible_devices!r}). "
            "CUDA runtime is installed but no devices are visible."
        )

    # Guard 3: a real (tiny) allocation, so a device that is listed but
    # unusable is reported here with context rather than downstream.
    try:
        torch.zeros(1, device="cuda")
    except RuntimeError as alloc_error:
        raise RuntimeError(
            f"CUDA device_count={device_total} but test tensor allocation failed "
            f"(CUDA_VISIBLE_DEVICES={visible_devices!r}): {alloc_error}"
        ) from alloc_error


class PipelineNotAvailableException(Exception):
"""Exception raised when pipeline is not available for processing."""

Expand Down Expand Up @@ -733,6 +780,15 @@ def _load_pipeline_implementation(
logger.info(f"Loading plugin pipeline: {pipeline_id}")
if stage_callback:
stage_callback("Initializing pipeline...")

# Validate that CUDA is actually accessible before handing off to
# the plugin. Plugin __init__ methods often allocate CUDA tensors
# immediately (model loads, warmup passes) and the generic
# "No CUDA GPUs are available" error they produce is hard to trace.
# _assert_cuda_accessible() surfaces the problem early with extra
# diagnostic context (device_count, CUDA_VISIBLE_DEVICES).
_assert_cuda_accessible()

config_class = pipeline_class.get_config_class()
# Get defaults from schema fields
schema_defaults = {}
Expand Down
Loading