From 2130cad84b78e1a9bc2a1132cf282e13c221daa3 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 6 Mar 2026 15:34:29 +0500 Subject: [PATCH 1/4] Migrate attribute comments to docstrings --- .../_internal/core/models/backends/base.py | 5 +- src/dstack/_internal/core/models/common.py | 6 +- .../_internal/core/models/compute_groups.py | 8 +- src/dstack/_internal/core/models/config.py | 5 +- .../_internal/core/models/configurations.py | 11 +- src/dstack/_internal/core/models/fleets.py | 14 +- src/dstack/_internal/core/models/gateways.py | 28 ++- src/dstack/_internal/core/models/instances.py | 30 ++- src/dstack/_internal/core/models/placement.py | 3 +- .../_internal/core/models/repos/remote.py | 3 +- src/dstack/_internal/core/models/resources.py | 4 +- src/dstack/_internal/core/models/runs.py | 153 +++++++++------ src/dstack/_internal/core/models/volumes.py | 20 +- src/dstack/_internal/server/models.py | 175 +++++++++++------- .../_internal/server/schemas/health/dcgm.py | 11 +- src/dstack/_internal/server/schemas/runner.py | 23 ++- 16 files changed, 312 insertions(+), 187 deletions(-) diff --git a/src/dstack/_internal/core/models/backends/base.py b/src/dstack/_internal/core/models/backends/base.py index ba382a0b66..2e8eb898ee 100644 --- a/src/dstack/_internal/core/models/backends/base.py +++ b/src/dstack/_internal/core/models/backends/base.py @@ -32,7 +32,8 @@ class BackendType(str, enum.Enum): CLOUDRIFT = "cloudrift" CRUSOE = "crusoe" CUDO = "cudo" - DATACRUNCH = "datacrunch" # BackendType for backward compatibility + DATACRUNCH = "datacrunch" + """`DATACRUNCH` is kept as a `BackendType` for backward compatibility.""" DIGITALOCEAN = "digitalocean" DSTACK = "dstack" GCP = "gcp" @@ -40,7 +41,7 @@ class BackendType(str, enum.Enum): KUBERNETES = "kubernetes" LAMBDA = "lambda" LOCAL = "local" - REMOTE = "remote" # TODO: replace for LOCAL + REMOTE = "remote" NEBIUS = "nebius" OCI = "oci" RUNPOD = "runpod" diff --git a/src/dstack/_internal/core/models/common.py 
b/src/dstack/_internal/core/models/common.py index 6fcc6d0392..f55a032ba5 100644 --- a/src/dstack/_internal/core/models/common.py +++ b/src/dstack/_internal/core/models/common.py @@ -134,8 +134,10 @@ class RegistryAuth(FrozenCoreModel): class ApplyAction(str, Enum): - CREATE = "create" # resource is to be created or overridden - UPDATE = "update" # resource is to be updated in-place + CREATE = "create" + """`CREATE` means the resource is to be created or overridden.""" + UPDATE = "update" + """`UPDATE` means the resource is to be updated in-place.""" class NetworkMode(str, Enum): diff --git a/src/dstack/_internal/core/models/compute_groups.py b/src/dstack/_internal/core/models/compute_groups.py index 3fa967494d..55dc0d2385 100644 --- a/src/dstack/_internal/core/models/compute_groups.py +++ b/src/dstack/_internal/core/models/compute_groups.py @@ -24,12 +24,14 @@ class ComputeGroupProvisioningData(CoreModel): compute_group_id: str compute_group_name: str backend: BackendType - # In case backend provisions instance in another backend, - # it may set that backend as base_backend. base_backend: Optional[BackendType] = None + """`base_backend` may be set when a backend provisions an instance in another backend and needs + to record that backend as `base_backend`. + """ region: str job_provisioning_datas: List[JobProvisioningData] - backend_data: Optional[str] = None # backend-specific data in json + backend_data: Optional[str] = None + """`backend_data` stores backend-specific data in JSON.""" class ComputeGroup(CoreModel): diff --git a/src/dstack/_internal/core/models/config.py b/src/dstack/_internal/core/models/config.py index c6d0916672..a0497401d9 100644 --- a/src/dstack/_internal/core/models/config.py +++ b/src/dstack/_internal/core/models/config.py @@ -23,6 +23,7 @@ class RepoConfig(CoreModel): class GlobalConfig(CoreModel): projects: Annotated[List[ProjectConfig], Field(description="The list of projects")] = [] - # Not used since 0.20.0. 
Can be removed when most users update their `config.yml` (it's updated - # each time a project is added) repos: Annotated[list[RepoConfig], Field(exclude=True)] = [] + """`repos` is not used since 0.20.0. It can be removed when most users update their `config.yml` + because that file is updated each time a project is added. + """ diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 8c86fd5bd9..ac8d8d172b 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -101,10 +101,10 @@ def parse(cls, v: str) -> "PortMapping": class RepoExistsAction(str, Enum): - # Don't try to check out, terminate the run with an error (the default action since 0.20.0) ERROR = "error" - # Don't try to check out, skip the repo (the logic hardcoded in the pre-0.20.0 runner) + """`ERROR` means do not try to check out and terminate the run with an error. This is the default action since 0.20.0.""" SKIP = "skip" + """`SKIP` means do not try to check out and skip the repo. This is the logic hardcoded in the pre-0.20.0 runner.""" class RepoSpec(CoreModel): @@ -469,8 +469,8 @@ class BaseRunConfiguration(CoreModel): ), ), ] = None - # deprecated since 0.18.31; has no effect home_dir: str = "/root" + """`home_dir` is deprecated since 0.18.31 and has no effect.""" registry_auth: Annotated[ Optional[RegistryAuth], Field(description="Credentials for pulling a private Docker image") ] = None @@ -540,8 +540,11 @@ class BaseRunConfiguration(CoreModel): list[FilePathMapping], Field(description="The local to container file path mappings"), ] = [] - # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init` setup: CommandsList = [] + """ + `setup` is deprecated since 0.18.31. It has no effect for tasks and services; for + dev environments it runs right before `init`.
+ """ @validator("python", pre=True, always=True) def convert_python(cls, v, values) -> Optional[PythonVersion]: diff --git a/src/dstack/_internal/core/models/fleets.py b/src/dstack/_internal/core/models/fleets.py index c88f606640..a56296bbad 100644 --- a/src/dstack/_internal/core/models/fleets.py +++ b/src/dstack/_internal/core/models/fleets.py @@ -30,8 +30,8 @@ class FleetStatus(str, Enum): - # Currently all fleets are ACTIVE/TERMINATING/TERMINATED - # SUBMITTED/FAILED may be used if fleets require async processing + # Currently all fleets are ACTIVE, TERMINATING, or TERMINATED. + # SUBMITTED and FAILED may be used if fleets require async processing. SUBMITTED = "submitted" ACTIVE = "active" TERMINATING = "terminating" @@ -372,10 +372,11 @@ class FleetSpec(generate_dual_core_model(FleetSpecConfig)): configuration_path: Optional[str] = None profile: Profile autocreated: bool = False - # merged_profile stores profile parameters merged from profile and configuration. - # Read profile parameters from merged_profile instead of profile directly. - # TODO: make merged_profile a computed field after migrating to pydanticV2 + # TODO: make `merged_profile` a computed field after migrating to Pydantic v2. merged_profile: Annotated[Profile, Field(exclude=True)] = None + """`merged_profile` stores profile parameters merged from `profile` and `configuration`. + Read profile parameters from `merged_profile` instead of `profile` directly. 
+ """ @root_validator def _merged_profile(cls, values) -> Dict: @@ -416,7 +417,8 @@ class FleetPlan(CoreModel): offers: List[InstanceOfferWithAvailability] total_offers: int max_offer_price: Optional[float] = None - action: Optional[ApplyAction] = None # default value for backward compatibility + action: Optional[ApplyAction] = None + """`action` uses a default value for backward compatibility.""" def get_effective_spec(self) -> FleetSpec: if self.effective_spec is not None: diff --git a/src/dstack/_internal/core/models/gateways.py b/src/dstack/_internal/core/models/gateways.py index 7f09d3df18..b3fbadb844 100644 --- a/src/dstack/_internal/core/models/gateways.py +++ b/src/dstack/_internal/core/models/gateways.py @@ -102,27 +102,33 @@ class GatewaySpec(CoreModel): class Gateway(CoreModel): - # ID is only optional on the client side for compatibility with pre-0.20.7 servers. - # TODO(0.21): Make required. + # TODO(0.21): Make `id` required. id: Optional[uuid.UUID] = None + """`id` is only optional on the client side for compatibility with pre-0.20.7 servers.""" name: str configuration: GatewayConfiguration created_at: datetime.datetime status: GatewayStatus status_message: Optional[str] - # The ip address / hostname the user should set up the domain for. - # Could be the same as ip_address but also different, e.g. gateway behind ALB. hostname: Optional[str] - # The ip address of the gateway instance + """`hostname` is the IP address or hostname the user should set up the domain for. + Could be the same as `ip_address` but also different, for example a gateway behind ALB. + """ ip_address: Optional[str] + """`ip_address` is the IP address of the gateway instance.""" instance_id: Optional[str] wildcard_domain: Optional[str] default: bool - # TODO: Deprecated configuration fields duplicated on top-level - # for backward compatibility with 0.19.x clients that expect them required. 
- # Remove after 0.21 backend: Optional[BackendType] = None + """`backend` duplicates a configuration field on the top level for backward compatibility + with 0.19.x clients that expect it to be required. + Remove after 0.21. + """ region: Optional[str] = None + """`region` duplicates a configuration field on the top level for backward compatibility + with 0.19.x clients that expect it to be required. + Remove after 0.21. + """ class GatewayPlan(CoreModel): @@ -147,8 +153,10 @@ class GatewayComputeConfiguration(CoreModel): class GatewayProvisioningData(CoreModel): instance_id: str - ip_address: str # TODO: rename, Kubernetes uses domain names + # TODO: rename `ip_address`; Kubernetes uses domain names here. + ip_address: str region: str availability_zone: Optional[str] = None hostname: Optional[str] = None - backend_data: Optional[str] = None # backend-specific data in json + backend_data: Optional[str] = None + """`backend_data` stores backend-specific data in JSON.""" diff --git a/src/dstack/_internal/core/models/instances.py b/src/dstack/_internal/core/models/instances.py index 7eccee8b69..49e6281f16 100644 --- a/src/dstack/_internal/core/models/instances.py +++ b/src/dstack/_internal/core/models/instances.py @@ -23,9 +23,11 @@ class Gpu(CoreModel): name: str memory_mib: int - # Although it's declared as Optional, in fact it always has a value set by the root validator, - # that is, `assert gpu.vendor is not None` should be a safe type narrowing. vendor: Optional[gpuhunt.AcceleratorVendor] = None + """ + `vendor` is declared as optional, but the root validator always sets a value. + `assert gpu.vendor is not None` should be a safe type narrowing. 
+ """ @root_validator(pre=True) def validate_name_and_vendor(cls, values): @@ -54,13 +56,15 @@ class Resources(CoreModel): memory_mib: int gpus: List[Gpu] spot: bool - disk: Disk = Disk(size_mib=102400) # the default value (100GB) for backward compatibility + disk: Disk = Disk(size_mib=102400) + """`disk` defaults to 100GB for backward compatibility.""" cpu_arch: Optional[gpuhunt.CPUArchitecture] = None - # Deprecated: description is now generated client-side. TODO: remove in 0.21. + # TODO: remove `description` in 0.21. description: Annotated[ str, Field(description="Deprecated: generated client-side. Will be removed in 0.21."), ] = "" + """`description` is deprecated because it is now generated client-side.""" @root_validator def _description(cls, values) -> Dict: @@ -187,7 +191,8 @@ class RemoteConnectionInfo(CoreModel): class InstanceConfiguration(CoreModel): project_name: str instance_name: str - user: str # dstack user name + user: str + """`user` stores the dstack user name.""" ssh_keys: List[SSHKey] instance_id: Optional[str] = None reservation: Optional[str] = None @@ -208,7 +213,8 @@ class InstanceAvailability(Enum): AVAILABLE = "available" NOT_AVAILABLE = "not_available" NO_QUOTA = "no_quota" - NO_BALANCE = "no_balance" # For dstack Sky + NO_BALANCE = "no_balance" + """`NO_BALANCE` is used for dstack Sky.""" IDLE = "idle" BUSY = "busy" @@ -268,7 +274,8 @@ class InstanceTerminationReason(str, Enum): NO_OFFERS = "no_offers" MASTER_FAILED = "master_failed" MAX_INSTANCES_LIMIT = "max_instances_limit" - NO_BALANCE = "no_balance" # used in dstack Sky + NO_BALANCE = "no_balance" + """`NO_BALANCE` is used in dstack Sky.""" @classmethod def from_legacy_str(cls, v: str) -> "InstanceTerminationReason": @@ -332,14 +339,17 @@ class Instance(CoreModel): fleet_id: Optional[UUID] = None fleet_name: Optional[str] = None instance_num: int - job_name: Optional[str] = None # deprecated, always None (instance can have more than one job) + job_name: Optional[str] = None + 
"""`job_name` is deprecated and always `None` because an instance can have more than one job.""" hostname: Optional[str] = None status: InstanceStatus unreachable: bool = False health_status: HealthStatus = HealthStatus.HEALTHY - # termination_reason stores InstanceTerminationReason. - # str allows adding new enum members without breaking compatibility with old clients. termination_reason: Optional[str] = None + """ + `termination_reason` stores `InstanceTerminationReason`. + `str` allows adding new enum members without breaking compatibility with old clients. + """ termination_reason_message: Optional[str] = None created: datetime.datetime finished_at: Optional[datetime.datetime] = None diff --git a/src/dstack/_internal/core/models/placement.py b/src/dstack/_internal/core/models/placement.py index 93b0cf09d0..a0ce418bca 100644 --- a/src/dstack/_internal/core/models/placement.py +++ b/src/dstack/_internal/core/models/placement.py @@ -16,7 +16,8 @@ class PlacementGroupConfiguration(CoreModel): class PlacementGroupProvisioningData(CoreModel): - backend: BackendType # can be different from configuration backend + backend: BackendType + """`backend` can be different from the backend in `configuration`.""" backend_data: Optional[str] = None diff --git a/src/dstack/_internal/core/models/repos/remote.py b/src/dstack/_internal/core/models/repos/remote.py index d3c3b70906..3bfd34024d 100644 --- a/src/dstack/_internal/core/models/repos/remote.py +++ b/src/dstack/_internal/core/models/repos/remote.py @@ -236,7 +236,8 @@ class GitRepoURL: ssh_port: Optional[str] path: str - original_host: str # before SSH config lookup + original_host: str + """`original_host` stores the host value before SSH config lookup.""" @staticmethod def parse( diff --git a/src/dstack/_internal/core/models/resources.py b/src/dstack/_internal/core/models/resources.py index 02cbbdc9b8..81230afcf3 100644 --- a/src/dstack/_internal/core/models/resources.py +++ b/src/dstack/_internal/core/models/resources.py 
@@ -375,7 +375,7 @@ def schema_extra(schema: Dict[str, Any]): class ResourcesSpec(generate_dual_core_model(ResourcesSpecConfig)): - # TODO: Remove Range[int] in 0.20. Range[int] for backward compatibility only. + # TODO: remove `Range[int]` in 0.20. It is kept only for backward compatibility. cpu: Annotated[Union[CPUSpec, Range[int]], Field(description="The CPU requirements")] = ( CPUSpec() ) @@ -390,8 +390,8 @@ class ResourcesSpec(generate_dual_core_model(ResourcesSpecConfig)): "you may need to configure this" ), ] = None - # Optional for backward compatibility gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = DEFAULT_GPU_SPEC + """`gpu` is optional for backward compatibility.""" disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK def pretty_format(self) -> str: diff --git a/src/dstack/_internal/core/models/runs.py b/src/dstack/_internal/core/models/runs.py index 558b07e26e..74766c9638 100644 --- a/src/dstack/_internal/core/models/runs.py +++ b/src/dstack/_internal/core/models/runs.py @@ -126,27 +126,44 @@ def to_error(self) -> Optional[str]: class JobTerminationReason(str, Enum): - # Set by the server FAILED_TO_START_DUE_TO_NO_CAPACITY = "failed_to_start_due_to_no_capacity" + """`FAILED_TO_START_DUE_TO_NO_CAPACITY` is set by the server.""" INTERRUPTED_BY_NO_CAPACITY = "interrupted_by_no_capacity" + """`INTERRUPTED_BY_NO_CAPACITY` is set by the server.""" INSTANCE_UNREACHABLE = "instance_unreachable" + """`INSTANCE_UNREACHABLE` is set by the server.""" WAITING_INSTANCE_LIMIT_EXCEEDED = "waiting_instance_limit_exceeded" + """`WAITING_INSTANCE_LIMIT_EXCEEDED` is set by the server.""" WAITING_RUNNER_LIMIT_EXCEEDED = "waiting_runner_limit_exceeded" + """`WAITING_RUNNER_LIMIT_EXCEEDED` is set by the server.""" TERMINATED_BY_USER = "terminated_by_user" + """`TERMINATED_BY_USER` is set by the server.""" VOLUME_ERROR = "volume_error" + """`VOLUME_ERROR` is set by the server.""" GATEWAY_ERROR = 
"gateway_error" + """`GATEWAY_ERROR` is set by the server.""" SCALED_DOWN = "scaled_down" + """`SCALED_DOWN` is set by the server.""" DONE_BY_RUNNER = "done_by_runner" + """`DONE_BY_RUNNER` is set by the server.""" ABORTED_BY_USER = "aborted_by_user" + """`ABORTED_BY_USER` is set by the server.""" TERMINATED_BY_SERVER = "terminated_by_server" + """`TERMINATED_BY_SERVER` is set by the server.""" INACTIVITY_DURATION_EXCEEDED = "inactivity_duration_exceeded" + """`INACTIVITY_DURATION_EXCEEDED` is set by the server.""" TERMINATED_DUE_TO_UTILIZATION_POLICY = "terminated_due_to_utilization_policy" - # Set by the runner + """`TERMINATED_DUE_TO_UTILIZATION_POLICY` is set by the server.""" CONTAINER_EXITED_WITH_ERROR = "container_exited_with_error" + """`CONTAINER_EXITED_WITH_ERROR` is set by the runner.""" PORTS_BINDING_FAILED = "ports_binding_failed" + """`PORTS_BINDING_FAILED` is set by the runner.""" CREATING_CONTAINER_ERROR = "creating_container_error" + """`CREATING_CONTAINER_ERROR` is set by the runner.""" EXECUTOR_ERROR = "executor_error" + """`EXECUTOR_ERROR` is set by the runner.""" MAX_DURATION_EXCEEDED = "max_duration_exceeded" + """`MAX_DURATION_EXCEEDED` is set by the runner.""" def to_status(self) -> JobStatus: mapping = { @@ -210,9 +227,11 @@ class Requirements(CoreModel): max_price: Optional[float] = None spot: Optional[bool] = None reservation: Optional[str] = None - # Backends can use `multinode` to filter out offers if - # some offers support multinode and some do not. multinode: Optional[bool] = None + """ + multinode: Backends can use `multinode` to filter out offers when some offers support + multinode and some do not. 
+ """ def pretty_format(self, resources_only: bool = False): res = self.resources.pretty_format() @@ -241,7 +260,8 @@ class JobSSHKey(CoreModel): class ProbeSpec(CoreModel): - type: Literal["http"] # expect other probe types in the future, namely `exec` + type: Literal["http"] + """`type` currently expects `http`, but other probe types such as `exec` may be added later.""" url: str method: HTTPMethod = DEFAULT_PROBE_METHOD headers: list[HTTPHeaderSpec] = [] @@ -253,13 +273,16 @@ class ProbeSpec(CoreModel): class JobSpec(CoreModel): - replica_num: int = 0 # default value for backward compatibility + replica_num: int = 0 + """`replica_num` uses a default value for backward compatibility.""" job_num: int job_name: str - jobs_per_replica: int = 1 # default value for backward compatibility + jobs_per_replica: int = 1 + """`jobs_per_replica` uses a default value for backward compatibility.""" replica_group: str = DEFAULT_REPLICA_GROUP_NAME app_specs: Optional[List[AppSpec]] - user: Optional[UnixUser] = None # default value for backward compatibility + user: Optional[UnixUser] = None + """`user` uses a default value for backward compatibility.""" commands: List[str] env: Dict[str, str] home_dir: Optional[str] @@ -275,51 +298,62 @@ class JobSpec(CoreModel): volumes: Optional[List[MountPoint]] = None ssh_key: Optional[JobSSHKey] = None working_dir: Optional[str] - # `repo_data` is optional for client compatibility with pre-0.19.17 servers and for compatibility - # with jobs submitted before 0.19.17. All new jobs are expected to have non-None `repo_data`. - # For --no-repo runs, `repo_data` is `VirtualRunRepoData()`. repo_data: Annotated[Optional[AnyRunRepoData], Field(discriminator="repo_type")] = None - # `repo_code_hash` can be None because it is not used for the repo or because the job was - # submitted before 0.19.17. 
See `_get_repo_code_hash` on how to get the correct `repo_code_hash` - # TODO: drop this comment when supporting jobs submitted before 0.19.17 is no longer relevant. + """`repo_data` is optional for client compatibility with pre-0.19.17 servers and for jobs + submitted before 0.19.17. All new jobs are expected to have non-`None` `repo_data`. + For `--no-repo` runs, `repo_data` is `VirtualRunRepoData()`. + """ + # TODO: drop this compatibility note when support for jobs submitted before 0.19.17 is no longer relevant. repo_code_hash: Optional[str] = None - # `repo_dir` was added in 0.19.27. Default value is set for backward compatibility + """`repo_code_hash` can be `None` because it is not used for the repo or because the job was + submitted before 0.19.17. See `_get_repo_code_hash` for how to get the correct value. + """ repo_dir: str = LEGACY_REPO_DIR - # None for jobs without repo and any jobs submitted by pre-0.20.0 clients + """`repo_dir` was added in 0.19.27 and uses a default value for backward compatibility.""" repo_exists_action: Optional[RepoExistsAction] = None + """`repo_exists_action` is `None` for jobs without a repo and for jobs submitted by pre-0.20.0 clients.""" file_archives: list[FileArchiveMapping] = [] - # None for non-services and pre-0.19.19 services. See `get_service_port` service_port: Optional[int] = None + """`service_port` is `None` for non-services and pre-0.19.19 services. See `get_service_port`.""" probes: list[ProbeSpec] = [] class JobProvisioningData(CoreModel): backend: BackendType - # In case backend provisions instance in another backend, it may set that backend as base_backend. base_backend: Optional[BackendType] = None + """`base_backend` may be set when a backend provisions an instance in another backend and wants + to record that + backend as `base_backend`. + """ instance_type: InstanceType instance_id: str - # hostname may not be set immediately after instance provisioning. 
- # It is set to a public IP or, if public IPs are disabled, to a private IP. hostname: Optional[str] = None + """`hostname` may not be set immediately after instance provisioning. + It is set to a public IP or, if public IPs are disabled, to a private IP. + """ internal_ip: Optional[str] = None - # public_ip_enabled can used to distinguished instances with and without public IPs. - # hostname being None is not enough since it can be filled after provisioning. public_ip_enabled: bool = True - # instance_network a network address for multimode installation. Specified as `/` - # internal_ip will be selected from the specified network + """`public_ip_enabled` is used to distinguish instances with and without public IPs. + `hostname` being `None` is not enough because it can be filled after provisioning. + """ instance_network: Optional[str] = None + """`instance_network` stores the multinode installation network, specified as + `/`. `internal_ip` will be selected from the specified network. + """ region: str availability_zone: Optional[str] = None reservation: Optional[str] = None price: float username: str - # ssh_port be different from 22 for some backends. - # ssh_port may not be set immediately after instance provisioning ssh_port: Optional[int] = None - dockerized: bool # True if backend starts shim + """`ssh_port` may be different from 22 for some backends and may not be set immediately after + instance provisioning. + """ + dockerized: bool + """`dockerized` is `True` when the backend starts the shim.""" ssh_proxy: Optional[SSHConnectionParams] = None - backend_data: Optional[str] = None # backend-specific data in json + backend_data: Optional[str] = None + """`backend_data` stores backend-specific data in JSON.""" def get_base_backend(self) -> BackendType: if self.base_backend is not None: @@ -340,22 +374,29 @@ class JobRuntimeData(CoreModel): """ network_mode: NetworkMode - # GPU, CPU, memory resource shares.
None means all available (no limit) gpu: Optional[int] = None + """`gpu` stores the GPU resource share. `None` means all available with no limit.""" cpu: Optional[float] = None + """`cpu` stores the CPU resource share. `None` means all available with no limit.""" memory: Optional[Memory] = None - # container:host port mapping reported by shim. Empty dict if network_mode == NetworkMode.HOST - # None if data is not yet available (on vm-based backends and ssh instances) - # or not applicable (container-based backends) + """`memory` stores the memory resource share. `None` means all available with no limit.""" ports: Optional[dict[int, int]] = None - # List of volumes used by the job - volume_names: Optional[list[str]] = None # None for backward compatibility - # Virtual shared offer - offer: Optional[InstanceOfferWithAvailability] = None # None for backward compatibility - # Resolved working directory and OS username reported by the runner. - # None if the runner hasn't reported them yet or if it's an old runner. + """`ports` stores the container-to-host port mapping reported by shim. It is an empty dict if + `network_mode == NetworkMode.HOST`. `None` if data is not yet available + on VM-based backends and SSH instances, or not applicable on container-based backends. + """ + volume_names: Optional[list[str]] = None + """`volume_names` stores the list of volumes used by the job. It is `None` for backward compatibility.""" + offer: Optional[InstanceOfferWithAvailability] = None + """`offer` stores the virtual shared offer. It is `None` for backward compatibility.""" working_dir: Optional[str] = None + """`working_dir` stores the resolved working directory reported by the runner. + `None` if the runner has not reported it yet or if it is an old runner. + """ username: Optional[str] = None + """`username` stores the resolved OS username reported by the runner. + `None` if the runner has not reported it yet or if it is an old runner. 
+ """ class ClusterInfo(CoreModel): @@ -371,16 +412,19 @@ class Probe(CoreModel): class JobSubmission(CoreModel): id: UUID4 submission_num: int - deployment_num: int = 0 # default for compatibility with pre-0.19.14 servers + deployment_num: int = 0 + """`deployment_num` uses a default value for compatibility with pre-0.19.14 servers.""" submitted_at: datetime last_processed_at: datetime finished_at: Optional[datetime] = None inactivity_secs: Optional[int] = None status: JobStatus - status_message: str = "" # default for backward compatibility - # termination_reason stores JobTerminationReason. - # str allows adding new enum members without breaking compatibility with old clients. + status_message: str = "" + """`status_message` uses a default value for backward compatibility.""" termination_reason: Optional[str] = None + """`termination_reason` stores `JobTerminationReason`. + `str` allows adding new enum members without breaking compatibility with old clients. + """ termination_reason_message: Optional[str] = None exit_status: Optional[int] = None job_provisioning_data: Optional[JobProvisioningData] = None @@ -413,7 +457,7 @@ def schema_extra(schema: Dict[str, Any]): class RunSpec(generate_dual_core_model(RunSpecConfig)): - # TODO: run_name is redundant here since they already passed in configuration + # TODO: consider removing `run_name` here because it is already passed in `configuration`. run_name: Annotated[ Optional[str], Field(description="The run name. If not set, the run name is generated automatically."), @@ -452,9 +496,10 @@ class RunSpec(generate_dual_core_model(RunSpecConfig)): list[FileArchiveMapping], Field(description="The list of file archive ID to container path mappings."), ] = [] - # Server uses configuration.working_dir since 0.19.27 and ignores this field, but the field - # still exists for compatibility with old clients that send it. 
working_dir: Optional[str] = None + """`working_dir` is kept for compatibility with old clients that still send it, even though the + server uses `configuration.working_dir` since 0.19.27 and ignores this field. + """ configuration_path: Annotated[ Optional[str], Field( @@ -473,10 +518,11 @@ class RunSpec(generate_dual_core_model(RunSpecConfig)): " Can be empty only before the run is submitted." ), ] = None - # merged_profile stores profile parameters merged from profile and configuration. - # Read profile parameters from merged_profile instead of profile directly. - # TODO: make merged_profile a computed field after migrating to pydanticV2 + # TODO: make `merged_profile` a computed field after migrating to Pydantic v2. merged_profile: Annotated[Profile, Field(exclude=True)] = None + """`merged_profile` stores profile parameters merged from `profile` and `configuration`. + Read profile parameters from `merged_profile` instead of `profile` directly. + """ @root_validator def _merged_profile(cls, values) -> Dict: @@ -546,16 +592,19 @@ class Run(CoreModel): submitted_at: datetime last_processed_at: datetime status: RunStatus - status_message: str = "" # default for backward compatibility - # termination_reason stores RunTerminationReason. - # str allows adding new enum members without breaking compatibility with old clients. + status_message: str = "" + """`status_message` uses a default value for backward compatibility.""" termination_reason: Optional[str] = None + """`termination_reason` stores `RunTerminationReason`. + `str` allows adding new enum members without breaking compatibility with old clients. 
+ """ run_spec: RunSpec jobs: List[Job] latest_job_submission: Optional[JobSubmission] = None cost: float = 0 service: Optional[ServiceSpec] = None - deployment_num: int = 0 # default for compatibility with pre-0.19.14 servers + deployment_num: int = 0 + """`deployment_num` uses a default value for compatibility with pre-0.19.14 servers.""" error: Optional[str] = None deleted: Optional[bool] = None next_triggered_at: Optional[datetime] = None diff --git a/src/dstack/_internal/core/models/volumes.py b/src/dstack/_internal/core/models/volumes.py index 280ab14f10..701611402b 100644 --- a/src/dstack/_internal/core/models/volumes.py +++ b/src/dstack/_internal/core/models/volumes.py @@ -17,9 +17,10 @@ class VolumeStatus(str, Enum): SUBMITTED = "submitted" - # PROVISIONING is currently not used since on all backends supporting volumes, - # volumes become ACTIVE (ready to be used) almost immediately after provisioning. PROVISIONING = "provisioning" + """`PROVISIONING` is currently not used because on all backends supporting volumes, + volumes become `ACTIVE` almost immediately after provisioning. 
+ """ ACTIVE = "active" FAILED = "failed" @@ -88,12 +89,13 @@ class VolumeProvisioningData(CoreModel): volume_id: str size_gb: int availability_zone: Optional[str] = None - # price per month price: Optional[float] = None - # should be manually attached/detached + """`price` stores the monthly price.""" attachable: bool = True + """`attachable` shows whether the volume should be attached and detached manually.""" detachable: bool = True - backend_data: Optional[str] = None # backend-specific data in json + backend_data: Optional[str] = None + """`backend_data` stores backend-specific data in JSON.""" class VolumeAttachmentData(CoreModel): @@ -125,13 +127,15 @@ class Volume(CoreModel): status_message: Optional[str] = None deleted: bool deleted_at: Optional[datetime] = None - volume_id: Optional[str] = None # id of the volume in the cloud + volume_id: Optional[str] = None + """`volume_id` is the volume identifier in the cloud provider.""" provisioning_data: Optional[VolumeProvisioningData] = None cost: float = 0 attachments: Optional[List[VolumeAttachment]] = None - # attachment_data is deprecated in favor of attachments. - # It's only set for volumes that were attached before attachments. attachment_data: Optional[VolumeAttachmentData] = None + """`attachment_data` is deprecated in favor of `attachments`. + It is only set for volumes that were attached before attachments were introduced. + """ def get_attachment_data_for_instance(self, instance_id: str) -> Optional[VolumeAttachmentData]: if self.attachments is not None: diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index d1a30b941b..d74d3c01b3 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -84,9 +84,11 @@ class DecryptedString(generate_dual_core_model(DecryptedStringConfig)): This is useful so that application code can have custom handling of failed decrypts (e.g. ignoring). 
""" - # Do not read plaintext directly to avoid ignoring errors accidentally. - # Unpack with get_plaintext_or_error(). plaintext: Optional[str] + """ + `plaintext` should not be read directly to avoid ignoring errors accidentally. + Unpack with `get_plaintext_or_error()`. + """ decrypted: bool = True exc: Optional[Exception] = None @@ -211,20 +213,26 @@ class UserModel(BaseModel): name: Mapped[str] = mapped_column(String(50), unique=True) created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) token: Mapped[DecryptedString] = mapped_column(EncryptedString(200), unique=True) - # token_hash is needed for fast search by token when stored token is encrypted token_hash: Mapped[str] = mapped_column(String(2000), unique=True) + """`token_hash` is used for fast token lookup when the stored token is encrypted.""" global_role: Mapped[GlobalRole] = mapped_column(EnumAsString(GlobalRole, 100)) - # deactivated users cannot access API active: Mapped[bool] = mapped_column(Boolean, default=True) + """`active` controls whether the user can access the API.""" deleted: Mapped[bool] = mapped_column(Boolean, server_default=false()) - # `original_name` stores the name of a deleted user, while `name` is changed to a unique generated value. original_name: Mapped[Optional[str]] = mapped_column(String(50), nullable=True) + """`original_name` stores the deleted user's original name while `name` is changed to a unique + generated value. + """ - # SSH keys can be null for users created before 0.19.33. - # Keys for those users are being gradually generated on /get_my_user calls. - # TODO: make keys required in a future version. + # TODO: make these keys required in a future version. ssh_private_key: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + """`ssh_private_key` can be `null` for users created before 0.19.33. + Keys for those users are being gradually generated on `/get_my_user` calls. 
+ """ ssh_public_key: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + """`ssh_public_key` can be `null` for users created before 0.19.33. + Keys for those users are being gradually generated on `/get_my_user` calls. + """ email: Mapped[Optional[str]] = mapped_column(String(200), nullable=True, index=True) @@ -243,8 +251,10 @@ class ProjectModel(BaseModel): created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) is_public: Mapped[bool] = mapped_column(Boolean, default=False) deleted: Mapped[bool] = mapped_column(Boolean, default=False) - # `original_name` stores the name of a deleted project, while `name` is changed to a unique generated value. original_name: Mapped[Optional[str]] = mapped_column(String(50), nullable=True) + """`original_name` stores the deleted project's original name while `name` is changed to a unique + generated value. + """ owner_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("users.id", ondelete="CASCADE")) owner: Mapped[UserModel] = relationship(lazy="joined") @@ -264,14 +274,15 @@ class ProjectModel(BaseModel): foreign_keys=[default_gateway_id] ) - # TODO: Drop after the release without pools - # Note that multi-replica deployments can break if - # upgrading from an old version that uses pools to the version that drops pools from the DB. + # TODO: drop `default_pool_id` after the release without pools. default_pool_id: Mapped[Optional[UUIDType]] = mapped_column( ForeignKey("pools.id", use_alter=True, ondelete="SET NULL"), nullable=True, deferred=True, # Not loaded so it can be deleted in the next releases ) + """`default_pool_id` exists because multi-replica deployments can break when upgrading from an + old version that uses pools to the version that drops pools from the database. 
+ """ default_pool: Mapped[Optional["PoolModel"]] = relationship(foreign_keys=[default_pool_id]) @@ -286,8 +297,8 @@ class MemberModel(BaseModel): user_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("users.id", ondelete="CASCADE")) user: Mapped[UserModel] = relationship(lazy="joined") project_role: Mapped[ProjectRole] = mapped_column(EnumAsString(ProjectRole, 100)) - # member_num defines members ordering member_num: Mapped[Optional[int]] = mapped_column(Integer) + """`member_num` defines member ordering.""" class BackendModel(BaseModel): @@ -315,16 +326,18 @@ class RepoModel(BaseModel): ) project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) project: Mapped["ProjectModel"] = relationship() - # RepoModel.name stores repo_id name: Mapped[str] = mapped_column(String(100)) + """`name` stores `repo_id`.""" type: Mapped[RepoType] = mapped_column(EnumAsString(RepoType, 100)) info: Mapped[str] = mapped_column(Text) - # `creds` is deprecated, for newly initialized repos per-user `RepoCredsModel` should be used - # instead. As of 0.18.25, there is no plan to remove this field, it's used as a fallback when - # `RepoCredsModel` associated with the user is not found. creds: Mapped[Optional[str]] = mapped_column(String(5000)) + """ + `creds` is deprecated. Newly initialized repos should use per-user `RepoCredsModel` instead. + As of 0.18.25 there is no plan to remove this field; it is used as a fallback when + `RepoCredsModel` associated with the user is not found. 
+ """ class RepoCredsModel(BaseModel): @@ -354,7 +367,8 @@ class CodeModel(BaseModel): repo_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("repos.id", ondelete="CASCADE")) repo: Mapped["RepoModel"] = relationship() blob_hash: Mapped[str] = mapped_column(String(4000)) - blob: Mapped[Optional[bytes]] = mapped_column(LargeBinary) # None means blob is stored on s3 + blob: Mapped[Optional[bytes]] = mapped_column(LargeBinary) + """`blob` is stored on S3 when it is `None`.""" class FileArchiveModel(BaseModel): @@ -369,7 +383,8 @@ class FileArchiveModel(BaseModel): user_id: Mapped["UserModel"] = mapped_column(ForeignKey("users.id", ondelete="CASCADE")) user: Mapped["UserModel"] = relationship() blob_hash: Mapped[str] = mapped_column(Text) - blob: Mapped[Optional[bytes]] = mapped_column(LargeBinary) # None means blob is stored on s3 + blob: Mapped[Optional[bytes]] = mapped_column(LargeBinary) + """`blob` is stored on S3 when it is `None`.""" class RunModel(BaseModel): @@ -389,23 +404,26 @@ class RunModel(BaseModel): repo_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("repos.id", ondelete="CASCADE")) repo: Mapped["RepoModel"] = relationship() - # Runs reference fleets so that fleets cannot be deleted while they are used. - # A fleet can have no busy instances but still be used by a run (e.g. a service with 0 replicas). fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id")) + """`fleet_id` keeps runs attached to fleets so the fleets cannot be deleted while they are used. + A fleet can have no busy instances but still be used by a run, for example a service with + zero replicas. 
+ """ fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="runs") run_name: Mapped[str] = mapped_column(String(100)) submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime) last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime) next_triggered_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - # NOTE: `status` must be changed only via `switch_run_status()` status: Mapped[RunStatus] = mapped_column(EnumAsString(RunStatus, 100), index=True) + """`status` must be changed only via `switch_run_status()`.""" termination_reason: Mapped[Optional[RunTerminationReason]] = mapped_column( EnumAsString(RunTerminationReason, 100) ) - # resubmission_attempt counts consecutive transitions to pending without provisioning. - # Can be used to choose retry delay depending on the attempt number. resubmission_attempt: Mapped[int] = mapped_column(Integer, default=0) + """`resubmission_attempt` counts consecutive transitions to pending without provisioning. + It can be used to choose a retry delay based on the attempt number. + """ run_spec: Mapped[str] = mapped_column(Text) service_spec: Mapped[Optional[str]] = mapped_column(Text) priority: Mapped[int] = mapped_column(Integer, default=0) @@ -439,9 +457,10 @@ class JobModel(BaseModel): ) run: Mapped["RunModel"] = relationship() - # Jobs need to reference fleets because we may choose an optimal fleet for a master job - # but not yet create an instance for it. fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id")) + """`fleet_id` keeps jobs attached to fleets because we may choose an optimal fleet for a master + job but not yet create an instance for it. 
+ """ fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="jobs") run_name: Mapped[str] = mapped_column(String(100)) @@ -450,26 +469,29 @@ class JobModel(BaseModel): submission_num: Mapped[int] = mapped_column(Integer) submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime) last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime) - # NOTE: `status` must be changed only via `switch_job_status()` status: Mapped[JobStatus] = mapped_column(EnumAsString(JobStatus, 100), index=True) + """`status` must be changed only via `switch_job_status()`.""" termination_reason: Mapped[Optional[JobTerminationReason]] = mapped_column( EnumAsString(JobTerminationReason, 100) ) termination_reason_message: Mapped[Optional[str]] = mapped_column(Text) - # `disconnected_at` stores the first time of connectivity issues with the instance. - # Resets every time connectivity is restored. disconnected_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + """`disconnected_at` stores the first time connectivity issues were seen with the instance. + It resets every time connectivity is restored. 
+ """ exit_status: Mapped[Optional[int]] = mapped_column(Integer) job_spec_data: Mapped[str] = mapped_column(Text) job_provisioning_data: Mapped[Optional[str]] = mapped_column(Text) runner_timestamp: Mapped[Optional[int]] = mapped_column(BigInteger) - inactivity_secs: Mapped[Optional[int]] = mapped_column(Integer) # 0 - active, None - N/A - # `removed` is used to ensure that the instance is killed after the job is finished + inactivity_secs: Mapped[Optional[int]] = mapped_column(Integer) + """`inactivity_secs` uses `0` for active jobs and `None` when inactivity is not applicable.""" remove_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + """`remove_at` is used to ensure the instance is killed after the job is finished.""" volumes_detached_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - # `instance_assigned` means instance assignment was done. - # if `instance_assigned` is True and `instance` is None, no instance was assigned. instance_assigned: Mapped[bool] = mapped_column(Boolean, default=False) + """`instance_assigned` shows whether instance assignment has already been attempted. + If `instance_assigned` is `True` and `instance` is `None`, no instance was assigned. + """ instance_id: Mapped[Optional[uuid.UUID]] = mapped_column( ForeignKey("instances.id", ondelete="CASCADE") ) @@ -481,15 +503,16 @@ class JobModel(BaseModel): probes: Mapped[list["ProbeModel"]] = relationship( back_populates="job", order_by="ProbeModel.probe_num" ) - # Whether the replica is registered to receive service requests. - # Always `False` for non-service runs. registered: Mapped[bool] = mapped_column(Boolean, server_default=false()) - # `waiting_master_job` is `True` for non-master jobs that have to wait - # for master processing before they can be processed. - # This allows updating all replica jobs even when only master is locked, - # e.g. to provision instances for all jobs when processing master. 
- # If not set, all jobs should be processed only one-by-one. + """`registered` shows whether the replica is registered to receive service requests. + It is always `False` for non-service runs. + """ waiting_master_job: Mapped[Optional[bool]] = mapped_column(Boolean) + """`waiting_master_job` is `True` for non-master jobs that have to wait for master processing before + they can be processed. This allows updating all replica jobs even when only master is locked, + for example to provision instances for all jobs when processing master. If not set, all jobs + should be processed only one-by-one. + """ class GatewayModel(PipelineModelMixin, BaseModel): @@ -501,9 +524,11 @@ class GatewayModel(PipelineModelMixin, BaseModel): name: Mapped[str] = mapped_column(String(100)) region: Mapped[str] = mapped_column(String(100)) wildcard_domain: Mapped[Optional[str]] = mapped_column(String(100)) - # `configuration` is optional for compatibility with pre-0.18.2 gateways. - # Use `get_gateway_configuration` to construct `configuration` for old gateways. configuration: Mapped[Optional[str]] = mapped_column(Text) + """ + configuration: Optional for compatibility with pre-0.18.2 gateways. + Use `get_gateway_configuration` to construct `configuration` for old gateways. + """ created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) status: Mapped[GatewayStatus] = mapped_column(EnumAsString(GatewayStatus, 100)) status_message: Mapped[Optional[str]] = mapped_column(Text) @@ -537,9 +562,10 @@ class GatewayComputeModel(BaseModel): instance_id: Mapped[str] = mapped_column(String(100)) ip_address: Mapped[str] = mapped_column(String(100)) hostname: Mapped[Optional[str]] = mapped_column(String(100)) - # `configuration` is optional for compatibility with pre-0.18.2 gateways. - # Use `get_gateway_compute_configuration` to construct `configuration` for old gateways. 
configuration: Mapped[Optional[str]] = mapped_column(Text) + """`configuration` is optional for compatibility with pre-0.18.2 gateways. + Use `get_gateway_compute_configuration` to construct `configuration` for old gateways. + """ backend_data: Mapped[Optional[str]] = mapped_column(Text) region: Mapped[str] = mapped_column(String(100)) @@ -548,12 +574,12 @@ class GatewayComputeModel(BaseModel): ) backend: Mapped[Optional["BackendModel"]] = relationship() - # The key to authorize the server with the gateway ssh_private_key: Mapped[str] = mapped_column(Text) + """`ssh_private_key` is the key used to authorize the server with the gateway.""" ssh_public_key: Mapped[str] = mapped_column(Text) - # active means the server should maintain connection to gateway. active: Mapped[bool] = mapped_column(Boolean, default=True) + """`active` means the server should maintain a connection to the gateway.""" deleted: Mapped[bool] = mapped_column(Boolean, server_default=false()) app_updated_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) @@ -594,8 +620,8 @@ class FleetModel(PipelineModelMixin, BaseModel): deleted: Mapped[bool] = mapped_column(Boolean, default=False) deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - # NOTE: `status` must be changed only via `switch_fleet_status()` status: Mapped[FleetStatus] = mapped_column(EnumAsString(FleetStatus, 100), index=True) + """`status` must be changed only via `switch_fleet_status()`.""" status_message: Mapped[Optional[str]] = mapped_column(Text) spec: Mapped[str] = mapped_column(Text) @@ -611,9 +637,10 @@ class FleetModel(PipelineModelMixin, BaseModel): UUIDType(binary=False), index=True ) - # `consolidation_attempt` counts how many times in a row fleet needed consolidation. - # Allows increasing delays between attempts. 
consolidation_attempt: Mapped[int] = mapped_column(Integer, server_default="0") + """`consolidation_attempt` counts how many times in a row the fleet needed consolidation. + It allows increasing delays between attempts. + """ last_consolidated_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) __table_args__ = ( @@ -646,7 +673,7 @@ class InstanceModel(PipelineModelMixin, BaseModel): project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) project: Mapped["ProjectModel"] = relationship(foreign_keys=[project_id]) - # TODO: Drop after the release without pools + # TODO: drop `pool_id` after the release without pools. pool_id: Mapped[Optional[uuid.UUID]] = mapped_column( ForeignKey("pools.id"), deferred=True, # Not loaded so it can be deleted in the next releases @@ -662,38 +689,36 @@ class InstanceModel(PipelineModelMixin, BaseModel): compute_group_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("compute_groups.id")) compute_group: Mapped[Optional["ComputeGroupModel"]] = relationship(back_populates="instances") - # NOTE: `status` must be changed only via `switch_instance_status()` status: Mapped[InstanceStatus] = mapped_column(EnumAsString(InstanceStatus, 100), index=True) + """`status` must be changed only via `switch_instance_status()`.""" unreachable: Mapped[bool] = mapped_column(Boolean) - # VM started_at: Mapped[Optional[datetime]] = mapped_column( NaiveDateTime, default=get_current_datetime ) + """`started_at` is used only for VM instances.""" finished_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - # create instance - # TODO: Introduce a field that would store all resolved instance profile parameters, etc, (similar to job_spec). - # Currently, profile parameters are parsed every time they are accessed (e.g. see profile.retry). + # TODO: introduce a field that stores all resolved instance profile parameters, similar to `job_spec`. 
profile: Mapped[Optional[str]] = mapped_column(Text) + """`profile` stores raw profile data. Profile parameters are currently parsed every time they are + accessed, for example through `profile.retry`. + """ requirements: Mapped[Optional[str]] = mapped_column(Text) instance_configuration: Mapped[Optional[str]] = mapped_column(Text) termination_policy: Mapped[Optional[TerminationPolicy]] = mapped_column(String(100)) - # TODO: Suggestion: do not assign DEFAULT_FLEET_TERMINATION_IDLE_TIME as the default here - # (make Optional instead; also instead of -1) + # TODO: consider not assigning `DEFAULT_FLEET_TERMINATION_IDLE_TIME` here and making this optional. termination_idle_time: Mapped[int] = mapped_column( Integer, default=DEFAULT_FLEET_TERMINATION_IDLE_TIME ) + """`termination_idle_time` stores the idle timeout used for termination decisions.""" - # Deprecated last_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime, deferred=True) + """`last_retry_at` is deprecated.""" - # instance termination handling termination_deadline: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - # dstack versions prior to 0.20.1 represented instance termination reasons as raw strings. - # Such strings may still be stored in the database, so we are using a wide column (4000 chars) - # and a fallback deserializer to convert them to relevant enum members. + """`termination_deadline` is used for instance termination handling.""" termination_reason: Mapped[Optional[InstanceTerminationReason]] = mapped_column( EnumAsString( InstanceTerminationReason, @@ -701,9 +726,13 @@ class InstanceModel(PipelineModelMixin, BaseModel): fallback_deserializer=InstanceTerminationReason.from_legacy_str, ) ) + """`termination_reason` may need legacy deserialization because dstack versions prior to 0.20.1 represented instance termination + reasons as raw strings. 
Such strings may still be stored in the database, so this uses a + wide column and a fallback deserializer to convert them to relevant enum members. + """ termination_reason_message: Mapped[Optional[str]] = mapped_column(String(4000)) - # Deprecated since 0.19.22, not used health_status: Mapped[Optional[str]] = mapped_column(String(4000), deferred=True) + """`health_status` is deprecated since 0.19.22 and is no longer used.""" health: Mapped[HealthStatus] = mapped_column( EnumAsString(HealthStatus, 100), default=HealthStatus.HEALTHY ) @@ -713,8 +742,8 @@ class InstanceModel(PipelineModelMixin, BaseModel): backend: Mapped[Optional[BackendType]] = mapped_column(EnumAsString(BackendType, 100)) backend_data: Mapped[Optional[str]] = mapped_column(Text) - # Not set for cloud fleets that haven't been provisioning offer: Mapped[Optional[str]] = mapped_column(Text) + """`offer` is not set for cloud fleets that have not started provisioning.""" region: Mapped[Optional[str]] = mapped_column(String(2000)) price: Mapped[Optional[float]] = mapped_column(Float) @@ -722,8 +751,8 @@ class InstanceModel(PipelineModelMixin, BaseModel): remote_connection_info: Mapped[Optional[str]] = mapped_column(Text) - # NULL means `auto` (only during provisioning, when ready it's not NULL) total_blocks: Mapped[Optional[int]] = mapped_column(Integer) + """`total_blocks` uses `NULL` to mean `auto` during provisioning; once ready it is not `NULL`.""" busy_blocks: Mapped[int] = mapped_column(Integer, default=0) jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance") @@ -785,19 +814,19 @@ class VolumeModel(PipelineModelMixin, BaseModel): deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) to_be_deleted: Mapped[bool] = mapped_column(Boolean, server_default=false()) - # NOTE: `status` must be changed only via `switch_volume_status()` status: Mapped[VolumeStatus] = mapped_column(EnumAsString(VolumeStatus, 100), index=True) + """`status` must be changed only via 
`switch_volume_status()`.""" status_message: Mapped[Optional[str]] = mapped_column(Text) configuration: Mapped[str] = mapped_column(Text) volume_provisioning_data: Mapped[Optional[str]] = mapped_column(Text) - # auto_cleanup_enabled is set for all new models but old models may not have it. auto_cleanup_enabled: Mapped[Optional[bool]] = mapped_column(Boolean) + """`auto_cleanup_enabled` is set for all new models, but old models may not have it.""" attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(back_populates="volume") - # Deprecated in favor of VolumeAttachmentModel.attachment_data volume_attachment_data: Mapped[Optional[str]] = mapped_column(Text) + """`volume_attachment_data` is deprecated in favor of `VolumeAttachmentModel.attachment_data`.""" __table_args__ = ( Index( @@ -832,7 +861,7 @@ class PlacementGroupModel(PipelineModelMixin, BaseModel): fleet_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("fleets.id")) fleet: Mapped["FleetModel"] = relationship(foreign_keys=[fleet_id]) - # TODO: rename `fleet_deleted` -> `to_be_deleted` + # TODO: rename `fleet_deleted` to `to_be_deleted`. 
fleet_deleted: Mapped[bool] = mapped_column(Boolean, default=False) created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) @@ -908,9 +937,10 @@ class JobMetricsPoint(BaseModel): memory_usage_bytes: Mapped[int] = mapped_column(BigInteger) memory_working_set_bytes: Mapped[int] = mapped_column(BigInteger) - # json-encoded lists of metric values of len(gpus) length gpus_memory_usage_bytes: Mapped[str] = mapped_column(Text) + """`gpus_memory_usage_bytes` stores a JSON-encoded list of metric values with length `len(gpus)`.""" gpus_util_percent: Mapped[str] = mapped_column(Text) + """`gpus_util_percent` stores a JSON-encoded list of metric values with length `len(gpus)`.""" class JobPrometheusMetrics(BaseModel): @@ -920,8 +950,8 @@ class JobPrometheusMetrics(BaseModel): job: Mapped["JobModel"] = relationship() collected_at: Mapped[datetime] = mapped_column(NaiveDateTime) - # Raw Prometheus text response text: Mapped[str] = mapped_column(Text) + """`text` stores the raw Prometheus text response.""" class ProbeModel(BaseModel): @@ -936,7 +966,8 @@ class ProbeModel(BaseModel): job_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("jobs.id"), primary_key=True) job: Mapped["JobModel"] = relationship(back_populates="probes") - probe_num: Mapped[int] = mapped_column(Integer) # index in JobSpec.probes + probe_num: Mapped[int] = mapped_column(Integer) + """`probe_num` is the index in `JobSpec.probes`.""" due: Mapped[datetime] = mapped_column(NaiveDateTime) success_streak: Mapped[int] = mapped_column(BigInteger) active: Mapped[bool] = mapped_column(Boolean) diff --git a/src/dstack/_internal/server/schemas/health/dcgm.py b/src/dstack/_internal/server/schemas/health/dcgm.py index f6aeaa40e5..cf8f5ce506 100644 --- a/src/dstack/_internal/server/schemas/health/dcgm.py +++ b/src/dstack/_internal/server/schemas/health/dcgm.py @@ -32,17 +32,20 @@ class DCGMHealthIncident(CoreModel): See: 
https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/health.go#L68-L73 """ - # dcgmIncidentInfo_t system: int + """`system` comes from `dcgmIncidentInfo_t`.""" health: DCGMHealthResult + """`health` comes from `dcgmIncidentInfo_t`.""" - # dcgmDiagErrorDetail_t error_message: str + """`error_message` comes from `dcgmDiagErrorDetail_t`.""" error_code: int + """`error_code` comes from `dcgmDiagErrorDetail_t`.""" - # dcgmGroupEntityPair_t - entity_group_id: int # dcgmGroupEntityPair_t + entity_group_id: int + """`entity_group_id` comes from `dcgmGroupEntityPair_t`.""" entity_id: int + """`entity_id` comes from `dcgmGroupEntityPair_t`.""" class DCGMHealthResponse(CoreModel): diff --git a/src/dstack/_internal/server/schemas/runner.py b/src/dstack/_internal/server/schemas/runner.py index 89649ddda6..43e9ddbb82 100644 --- a/src/dstack/_internal/server/schemas/runner.py +++ b/src/dstack/_internal/server/schemas/runner.py @@ -28,7 +28,8 @@ class JobStateEvent(CoreModel): class LogEvent(CoreModel): - timestamp: int # milliseconds + timestamp: int + """`timestamp` is stored in milliseconds.""" message: bytes @validator("message", pre=True) @@ -43,7 +44,8 @@ class PullResponse(CoreModel): job_logs: List[LogEvent] runner_logs: List[LogEvent] last_updated: int - no_connections_secs: Optional[int] = None # Optional for compatibility with old runners + no_connections_secs: Optional[int] = None + """`no_connections_secs` is optional for compatibility with old runners.""" class JobInfoResponse(CoreModel): @@ -101,8 +103,7 @@ class SubmitBody(CoreModel): cluster_info: Annotated[Optional[ClusterInfo], Field(include=True)] secrets: Annotated[Optional[Dict[str, str]], Field(include=True)] repo_credentials: Annotated[Optional[RemoteRepoCreds], Field(include=True)] - # run_spec is deprecated in favor of run.run_spec - # TODO: Remove once we no longer support instances deployed with 0.19.8 or earlier. 
+ # TODO: remove `run_spec` once instances deployed with 0.19.8 or earlier are no longer supported. run_spec: Annotated[ RunSpec, Field( @@ -115,6 +116,7 @@ class SubmitBody(CoreModel): }, ), ] + """`run_spec` is deprecated in favor of `run.run_spec`.""" class HealthcheckResponse(CoreModel): @@ -143,7 +145,8 @@ class ComponentStatus(str, Enum): class ComponentInfo(CoreModel): - name: str # Not using ComponentName enum for compatibility of newer shim with older server + name: str + """`name` does not use `ComponentName` so newer shim versions remain compatible with the older server.""" version: str status: ComponentStatus @@ -203,8 +206,10 @@ class TaskListItem(CoreModel): class TaskListResponse(CoreModel): - ids: Optional[list[str]] = None # returned by pre-0.19.26 shim - tasks: Optional[list[TaskListItem]] = None # returned by 0.19.26+ shim + ids: Optional[list[str]] = None + """`ids` is returned by pre-0.19.26 shim versions.""" + tasks: Optional[list[TaskListItem]] = None + """`tasks` is returned by shim versions 0.19.26 and newer.""" class TaskInfoResponse(CoreModel): @@ -212,8 +217,10 @@ class TaskInfoResponse(CoreModel): status: TaskStatus termination_reason: str termination_message: str - # default value for backward compatibility with 0.18.34, could be removed after a few releases ports: Optional[list[PortMapping]] = [] + """`ports` uses a default value for backward compatibility with 0.18.34. + It can be removed after a few releases. 
+ """ class TaskSubmitRequest(CoreModel): From f7457b2fa566a2bdec1e5e10c97c12f3d49e63ee Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 6 Mar 2026 15:45:51 +0500 Subject: [PATCH 2/4] Minor fixes --- src/dstack/_internal/core/models/runs.py | 28 ++++-------------------- 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/src/dstack/_internal/core/models/runs.py b/src/dstack/_internal/core/models/runs.py index 74766c9638..bfef8bc786 100644 --- a/src/dstack/_internal/core/models/runs.py +++ b/src/dstack/_internal/core/models/runs.py @@ -126,44 +126,27 @@ def to_error(self) -> Optional[str]: class JobTerminationReason(str, Enum): + # Set by the server FAILED_TO_START_DUE_TO_NO_CAPACITY = "failed_to_start_due_to_no_capacity" - """`FAILED_TO_START_DUE_TO_NO_CAPACITY` is set by the server.""" INTERRUPTED_BY_NO_CAPACITY = "interrupted_by_no_capacity" - """`INTERRUPTED_BY_NO_CAPACITY` is set by the server.""" INSTANCE_UNREACHABLE = "instance_unreachable" - """`INSTANCE_UNREACHABLE` is set by the server.""" WAITING_INSTANCE_LIMIT_EXCEEDED = "waiting_instance_limit_exceeded" - """`WAITING_INSTANCE_LIMIT_EXCEEDED` is set by the server.""" WAITING_RUNNER_LIMIT_EXCEEDED = "waiting_runner_limit_exceeded" - """`WAITING_RUNNER_LIMIT_EXCEEDED` is set by the server.""" TERMINATED_BY_USER = "terminated_by_user" - """`TERMINATED_BY_USER` is set by the server.""" VOLUME_ERROR = "volume_error" - """`VOLUME_ERROR` is set by the server.""" GATEWAY_ERROR = "gateway_error" - """`GATEWAY_ERROR` is set by the server.""" SCALED_DOWN = "scaled_down" - """`SCALED_DOWN` is set by the server.""" DONE_BY_RUNNER = "done_by_runner" - """`DONE_BY_RUNNER` is set by the server.""" ABORTED_BY_USER = "aborted_by_user" - """`ABORTED_BY_USER` is set by the server.""" TERMINATED_BY_SERVER = "terminated_by_server" - """`TERMINATED_BY_SERVER` is set by the server.""" INACTIVITY_DURATION_EXCEEDED = "inactivity_duration_exceeded" - """`INACTIVITY_DURATION_EXCEEDED` is set by the server.""" 
TERMINATED_DUE_TO_UTILIZATION_POLICY = "terminated_due_to_utilization_policy" - """`TERMINATED_DUE_TO_UTILIZATION_POLICY` is set by the server.""" + # Set by the runner CONTAINER_EXITED_WITH_ERROR = "container_exited_with_error" - """`CONTAINER_EXITED_WITH_ERROR` is set by the runner.""" PORTS_BINDING_FAILED = "ports_binding_failed" - """`PORTS_BINDING_FAILED` is set by the runner.""" CREATING_CONTAINER_ERROR = "creating_container_error" - """`CREATING_CONTAINER_ERROR` is set by the runner.""" EXECUTOR_ERROR = "executor_error" - """`EXECUTOR_ERROR` is set by the runner.""" MAX_DURATION_EXCEEDED = "max_duration_exceeded" - """`MAX_DURATION_EXCEEDED` is set by the runner.""" def to_status(self) -> JobStatus: mapping = { @@ -228,9 +211,7 @@ class Requirements(CoreModel): spot: Optional[bool] = None reservation: Optional[str] = None multinode: Optional[bool] = None - """ - multinode: Backends can use `multinode` to filter out offers when some offers support - multinode and some do not. + """Backends can use `multinode` to filter out offers when some offers support multinode and some do not. """ def pretty_format(self, resources_only: bool = False): @@ -322,8 +303,7 @@ class JobProvisioningData(CoreModel): backend: BackendType base_backend: Optional[BackendType] = None """`base_backend` may be set when a backend provisions an instance in another backend and wants - to record that - backend as `base_backend`. + to record that backend as `base_backend`. """ instance_type: InstanceType instance_id: str From 3c3c396cf9ca7e10a09b5851ae8097090fbedb0b Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 6 Mar 2026 15:49:08 +0500 Subject: [PATCH 3/4] Update AGENTS.md --- AGENTS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/AGENTS.md b/AGENTS.md index 336b97bb5b..bb1a7aac0f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -20,6 +20,7 @@ - Keep primary/public functions before local helper functions in a module section. 
- Keep private classes, exceptions, and similar implementation-specific types close to the private functions that use them unless they are shared more broadly in the module. - Prefer pydantic-style models in `core/models`. +- Document attributes when the note adds behavior, compatibility, or semantic context that is not obvious from the name and type. Use attribute docstrings without leading newline. - Tests use `test_*.py` modules and `test_*` functions; fixtures live near usage. ## Testing Guidelines From 7b6a3225d316f3ea451c90ef2d8926bdca728b3a Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 6 Mar 2026 15:52:10 +0500 Subject: [PATCH 4/4] Remove leading newlines --- src/dstack/_internal/core/models/instances.py | 6 ++---- src/dstack/_internal/server/models.py | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/dstack/_internal/core/models/instances.py b/src/dstack/_internal/core/models/instances.py index 49e6281f16..11a1aca518 100644 --- a/src/dstack/_internal/core/models/instances.py +++ b/src/dstack/_internal/core/models/instances.py @@ -24,8 +24,7 @@ class Gpu(CoreModel): name: str memory_mib: int vendor: Optional[gpuhunt.AcceleratorVendor] = None - """ - `vendor` is declared as optional, but the root validator always sets a value. + """`vendor` is declared as optional, but the root validator always sets a value. `assert gpu.vendor is not None` should be a safe type narrowing. """ @@ -346,8 +345,7 @@ class Instance(CoreModel): unreachable: bool = False health_status: HealthStatus = HealthStatus.HEALTHY termination_reason: Optional[str] = None - """ - `termination_reason` stores `InstanceTerminationReason`. + """`termination_reason` stores `InstanceTerminationReason`. `str` allows adding new enum members without breaking compatibility with old clients.
""" termination_reason_message: Optional[str] = None diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index d74d3c01b3..da733054fa 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -525,8 +525,7 @@ class GatewayModel(PipelineModelMixin, BaseModel): region: Mapped[str] = mapped_column(String(100)) wildcard_domain: Mapped[Optional[str]] = mapped_column(String(100)) configuration: Mapped[Optional[str]] = mapped_column(Text) - """ - configuration: Optional for compatibility with pre-0.18.2 gateways. + """`configuration` is Optional for compatibility with pre-0.18.2 gateways. Use `get_gateway_configuration` to construct `configuration` for old gateways. """ created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime)