diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index 29228d42..6d9b7c73 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -176,7 +176,7 @@ jobs: - distribution: k3s version: 1.32 nodes: 1 - - distribution: gke + - distribution: eks version: 1.32 nodes: 3 config: @@ -315,7 +315,7 @@ jobs: - distribution: k3s version: 1.32 nodes: 1 - - distribution: gke + - distribution: eks version: 1.32 nodes: 3 steps: diff --git a/.gitignore b/.gitignore index 6695e663..beb9709d 100644 --- a/.gitignore +++ b/.gitignore @@ -63,5 +63,12 @@ applications/wg-easy/release/ applications/flipt/release/ applications/flipt/chart/Chart.lock -**/.claude/settings.local.json +.claude/ +CLAUDE.md .worktrees/ + +# Gas Town workspace artifacts +.beads/ +.claude/ +.runtime/ +CLAUDE.md diff --git a/applications/mlflow/README.md b/applications/mlflow/README.md index b8fd16e0..3998737a 100644 --- a/applications/mlflow/README.md +++ b/applications/mlflow/README.md @@ -32,6 +32,7 @@ helm install mlflow oci://registry.replicated.com/mlflow/stable - [MLflow Helm Chart Documentation](./charts/mlflow/README.md) - Installation and configuration details - [Configuration Reference](./charts/mlflow/README_CONFIG.md) - Detailed configuration options +- [Auto-Update Behavior](./docs/auto-update.md) - How KOTS auto-deploy works with multi-chart weight ordering - [Development Guide](./DEVELOPMENT.md) - Guide for development including containerized environment ## For Developers @@ -97,6 +98,28 @@ This solution offers flexibility in how you store MLflow data: See the [Configuration Reference](./charts/mlflow/README_CONFIG.md) for detailed setup instructions. +## Preflight Checks + +KOTS installations run automated preflight checks to validate the target environment before deploying. These checks catch common issues early and provide actionable remediation guidance. 
+ +| Check | Type | What it validates | +|-------|------|-------------------| +| Kubernetes Version | Cluster | Kubernetes 1.21+ required, 1.28+ recommended | +| CPU Capacity | Node Resources | At least 4 CPU cores across all nodes | +| Storage Class | Storage | A default storage class exists for PostgreSQL and MinIO PVCs | +| Registry & Image Availability | Air-gap | Critical container images (mlflow, postgresql, minio) are accessible in the configured registry | + +### Air-Gap Image Validation + +The registry preflight check validates that critical container images are available before installation begins. In air-gap environments, this confirms all images from the airgap bundle were successfully pushed to the local registry. In online environments, it verifies network access to upstream registries (ghcr.io, quay.io). + +Images validated: +- `mlflow` — MLflow tracking server +- `cloudnative-pg/postgresql` — PostgreSQL database for metadata storage +- `minio` — S3-compatible object storage for artifacts + +If this check fails in an air-gap environment, re-push the airgap bundle to the local registry. In online environments, verify that the cluster has outbound network access to the image registries. 
+ ## Getting Started ### Prerequisites diff --git a/applications/mlflow/Taskfile.yml b/applications/mlflow/Taskfile.yml index 3f65a07a..23fd8a63 100644 --- a/applications/mlflow/Taskfile.yml +++ b/applications/mlflow/Taskfile.yml @@ -15,7 +15,7 @@ vars: PORT: 5000 # Chart configuration - CHARTS: mlflow infra + CHARTS: mlflow infra postgres-support # Environment detection CI: @@ -39,7 +39,9 @@ vars: sh: helm show chart ./charts/mlflow | grep '^version:' | cut -d ' ' -f 2 INFRA_VERSION: sh: helm show chart ./charts/infra | grep '^version:' | cut -d ' ' -f 2 - + POSTGRES_SUPPORT_VERSION: + sh: helm show chart ./charts/postgres-support | grep '^version:' | cut -d ' ' -f 2 + # Release configuration # APP_NAME can be overridden by setting REPLICATED_APP environment variable APP_NAME: '{{.REPLICATED_APP | default "diamon-mlflow"}}' @@ -338,10 +340,12 @@ tasks: # Get chart versions MLFLOW_VERSION="{{.MLFLOW_VERSION}}" INFRA_VERSION="{{.INFRA_VERSION}}" - + POSTGRES_SUPPORT_VERSION="{{.POSTGRES_SUPPORT_VERSION}}" + echo "Working with chart versions:" echo "MLflow chart version: $MLFLOW_VERSION" echo "Infra chart version: $INFRA_VERSION" + echo "Postgres Support chart version: $POSTGRES_SUPPORT_VERSION" # Update MLflow HelmChart manifest MLFLOW_HELMCHART="{{.RELEASE_DIR}}/mlflow-chart.yaml" @@ -374,7 +378,21 @@ tasks: else echo "⚠️ Infra HelmChart not found at $INFRA_HELMCHART" fi - + + # Update Postgres Support HelmChart manifest + PG_SUPPORT_HELMCHART="{{.RELEASE_DIR}}/postgres-support-chart.yaml" + if [ -f "$PG_SUPPORT_HELMCHART" ]; then + echo "Updating version in $PG_SUPPORT_HELMCHART to $POSTGRES_SUPPORT_VERSION..." 
+ + if command -v yq &> /dev/null; then + yq eval ".spec.chart.chartVersion = \"$POSTGRES_SUPPORT_VERSION\"" -i "$PG_SUPPORT_HELMCHART" + else + sed -i.bak "s/chartVersion:.*/chartVersion: \"$POSTGRES_SUPPORT_VERSION\"/" "$PG_SUPPORT_HELMCHART" && rm "${PG_SUPPORT_HELMCHART}.bak" + fi + else + echo "⚠️ Postgres Support HelmChart not found at $PG_SUPPORT_HELMCHART" + fi + echo "✅ Release manifest versions updated successfully." - cmd: task versions:verify || echo "⚠️ Version check failed after update. Please verify manually." ignore_error: true diff --git a/applications/mlflow/charts/mlflow/templates/deployment.yaml b/applications/mlflow/charts/mlflow/templates/deployment.yaml index a4cdf623..10868547 100644 --- a/applications/mlflow/charts/mlflow/templates/deployment.yaml +++ b/applications/mlflow/charts/mlflow/templates/deployment.yaml @@ -214,6 +214,8 @@ spec: {{- end }} {{- end }} volumeMounts: + - name: tmp + mountPath: /tmp {{- if .Values.mlflow.extraPipPackages }} - name: pip-packages mountPath: /pip-packages @@ -226,7 +228,11 @@ spec: {{- with .Values.mlflow.extraVolumeMounts }} {{- toYaml . | nindent 8 }} {{- end }} - {{- with .Values.mlflow.resources }} + {{- $resources := .Values.mlflow.resources | default dict }} + {{- if and .Values.gpu.enabled .Values.gpu.resources }} + {{- $resources = mergeOverwrite (deepCopy $resources) .Values.gpu.resources }} + {{- end }} + {{- with $resources }} resources: {{- toYaml . | nindent 10 }} {{- end }} @@ -256,6 +262,8 @@ spec: {{ toYaml . | nindent 6 }} {{- end }} volumes: + - name: tmp + emptyDir: {} {{- if .Values.mlflow.extraPipPackages }} - name: pip-packages emptyDir: {} @@ -276,7 +284,11 @@ spec: hostAliases: {{- toYaml . 
| nindent 8 }} {{- end }} - {{- with .Values.mlflow.nodeSelector }} + {{- $nodeSelector := .Values.mlflow.nodeSelector | default dict }} + {{- if and .Values.gpu.enabled .Values.gpu.nodeSelector }} + {{- $nodeSelector = merge (deepCopy .Values.gpu.nodeSelector) $nodeSelector }}{{- /* deepCopy: merge mutates its first argument; GPU keys still win on conflict */ -}} + {{- end }} + {{- with $nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} {{- end }} @@ -313,7 +325,11 @@ spec: topologySpreadConstraints: {{- toYaml . | nindent 8 }} {{- end }} - {{- with .Values.mlflow.tolerations }} + {{- $tolerations := .Values.mlflow.tolerations | default list }} + {{- if and .Values.gpu.enabled .Values.gpu.tolerations }} + {{- $tolerations = concat $tolerations .Values.gpu.tolerations }} + {{- end }} + {{- with $tolerations }} tolerations: {{- toYaml . | nindent 8 }} {{- end }} diff --git a/applications/mlflow/charts/mlflow/templates/license-configmap.yaml b/applications/mlflow/charts/mlflow/templates/license-configmap.yaml new file mode 100644 index 00000000..22b43a30 --- /dev/null +++ b/applications/mlflow/charts/mlflow/templates/license-configmap.yaml @@ -0,0 +1,15 @@ +{{- if or .Values.license.tier .Values.license.maxUsers }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "mlflow.fullname" . }}-license + labels: + {{- include "mlflow.labels" . | nindent 4 }} +data: + {{- with .Values.license.tier }} + LICENSE_TIER: {{ . | quote }} + {{- end }} + {{- with .Values.license.maxUsers }} + LICENSE_MAX_USERS: {{ . | quote }} + {{- end }} +{{- end }} diff --git a/applications/mlflow/charts/mlflow/templates/networkpolicy.yaml b/applications/mlflow/charts/mlflow/templates/networkpolicy.yaml new file mode 100644 index 00000000..f5b5c0e1 --- /dev/null +++ b/applications/mlflow/charts/mlflow/templates/networkpolicy.yaml @@ -0,0 +1,63 @@ +{{- if .Values.networkPolicy.enabled }} +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{ include "mlflow.fullname" . }} + labels: + {{- include "mlflow.labels" . 
| nindent 4 }} +spec: + podSelector: + matchLabels: + {{- include "mlflow.selectorLabels" . | nindent 6 }} + policyTypes: + - Ingress + - Egress + ingress: + {{- if .Values.networkPolicy.allowSameNamespace }} + # Allow all traffic from pods in the same namespace + - from: + - podSelector: {} + {{- end }} + {{- if .Values.networkPolicy.ingress.ingressNginx.enabled }} + # Allow ingress-nginx to reach mlflow on the tracking server port + - from: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: {{ .Values.networkPolicy.ingress.ingressNginx.namespace }} + ports: + - protocol: TCP + port: {{ .Values.mlflow.service.port }} + {{- end }} + egress: + {{- if .Values.networkPolicy.allowSameNamespace }} + # Allow all traffic to pods in the same namespace + - to: + - podSelector: {} + {{- end }} + {{- if .Values.networkPolicy.egress.dns.enabled }} + # Allow DNS resolution + - to: + - namespaceSelector: {} + ports: + - protocol: UDP + port: {{ .Values.networkPolicy.egress.dns.port }} + - protocol: TCP + port: {{ .Values.networkPolicy.egress.dns.port }} + {{- end }} + {{- if .Values.networkPolicy.egress.postgres.enabled }} + # Allow mlflow to reach postgres + - to: + - podSelector: {} + ports: + - protocol: TCP + port: {{ .Values.networkPolicy.egress.postgres.port }} + {{- end }} + {{- if .Values.networkPolicy.egress.minio.enabled }} + # Allow mlflow to reach minio + - to: + - podSelector: {} + ports: + - protocol: TCP + port: {{ .Values.networkPolicy.egress.minio.port }} + {{- end }} +{{- end }} diff --git a/applications/mlflow/charts/mlflow/values.yaml b/applications/mlflow/charts/mlflow/values.yaml index 4bb56250..345dfdaf 100644 --- a/applications/mlflow/charts/mlflow/values.yaml +++ b/applications/mlflow/charts/mlflow/values.yaml @@ -47,11 +47,22 @@ mlflow: # -- Pod Labels for the mlflow deployment podLabels: {} # -- Configure the Security Context for the Pod - podSecurityContext: {} + podSecurityContext: + runAsNonRoot: true + runAsUser: 1000 + 
runAsGroup: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault # -- Set the resource requests / limits for the container. resources: {} # -- Configure the Security Context for the Container - containerSecurityContext: {} + containerSecurityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL # -- Specify probes for the container # [[ref]](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/) probes: @@ -325,10 +336,74 @@ mlflow: # -- Specify whether to ignore TLS ignoreTls: false +# -- License entitlements passed from KOTS Config via HelmChart CR values +license: + # -- License tier (e.g. "community", "team", "enterprise") + tier: "" + # -- Maximum number of users allowed by the license + maxUsers: "" + +# -- GPU node scheduling configuration for ML/AI workloads +# When enabled, merges GPU-specific nodeSelector, tolerations, and resource +# limits into the mlflow deployment alongside any existing scheduling config. +# +# This pattern uses NVIDIA GPU resources by default. 
To adapt for other providers: +# - AMD ROCm: change resource key to "amd.com/gpu", update nodeSelector/tolerations +# to match your AMD GPU node labels (e.g., "amd.com/gpu.present: 'true'") +# - Intel: use "gpu.intel.com/i915" as the resource key +# - Cloud-specific: GKE uses "cloud.google.com/gke-accelerator", EKS uses +# "k8s.amazonaws.com/accelerator", AKS uses "kubernetes.azure.com/accelerator" +gpu: + # -- Enable GPU node scheduling + enabled: false + # -- Node selector for GPU-capable nodes + # Common labels: "nvidia.com/gpu.present: 'true'" (NVIDIA GPU Operator), + # "cloud.google.com/gke-accelerator: nvidia-tesla-t4" (GKE) + nodeSelector: {} + # nvidia.com/gpu.present: "true" + # -- Tolerations for GPU node taints + # Most GPU node pools use taints to prevent non-GPU workloads from scheduling + tolerations: [] + # - key: nvidia.com/gpu + # operator: Exists + # effect: NoSchedule + # -- Resource limits for GPU allocation + resources: + limits: {} + # nvidia.com/gpu: "1" + replicated: # -- Specifies whetherto enable the Replicated SDK enabled: true +# -- NetworkPolicy configuration for restricting pod traffic +networkPolicy: + # -- Specifies whether to create a NetworkPolicy resource + enabled: false + # -- Allow ingress from pods in the same namespace + allowSameNamespace: true + # -- Ingress rules + ingress: + # -- Allow ingress-nginx to reach mlflow on the tracking server port + ingressNginx: + enabled: true + # -- Namespace where the ingress controller lives + namespace: ingress-nginx + # -- Egress rules + egress: + # -- Allow mlflow to reach postgres on port 5432 + postgres: + enabled: true + port: 5432 + # -- Allow mlflow to reach minio on port 9000 + minio: + enabled: true + port: 9000 + # -- Allow DNS resolution (required for service discovery) + dns: + enabled: true + port: 53 + minio: enabled: true secrets: diff --git a/applications/mlflow/charts/postgres-support/Chart.yaml b/applications/mlflow/charts/postgres-support/Chart.yaml new file mode 
100644 index 00000000..105f3b2c --- /dev/null +++ b/applications/mlflow/charts/postgres-support/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +name: postgres-support +description: Support bundle specs for CloudnativePG PostgreSQL +type: application +version: 0.1.0 diff --git a/applications/mlflow/charts/postgres-support/templates/_supportbundle.tpl b/applications/mlflow/charts/postgres-support/templates/_supportbundle.tpl new file mode 100644 index 00000000..05b8248a --- /dev/null +++ b/applications/mlflow/charts/postgres-support/templates/_supportbundle.tpl @@ -0,0 +1,111 @@ +{{- define "postgres-support.supportbundle" -}} +apiVersion: troubleshoot.sh/v1beta2 +kind: SupportBundle +metadata: + name: postgres-supportbundle +spec: + collectors: + - logs: + name: cnpg-operator-logs + namespace: {{ .Release.Namespace }} + selector: + - app.kubernetes.io/name=cloudnative-pg + limits: + maxAge: 720h + maxLines: 10000 + - logs: + name: postgres-cluster-logs + namespace: {{ .Release.Namespace }} + selector: + - cnpg.io/cluster + limits: + maxAge: 720h + maxLines: 10000 + - exec: + name: pg-isready-check + namespace: {{ .Release.Namespace }} + selector: + - cnpg.io/cluster + - role=primary + command: ["pg_isready"] + args: ["-U", "postgres"] + timeout: 10s + - exec: + name: cnpg-cluster-status + namespace: {{ .Release.Namespace }} + selector: + - cnpg.io/cluster + - role=primary + command: ["psql"] + args: + - "-U" + - "postgres" + - "-c" + - "SELECT version(); SELECT pg_is_in_recovery(); SELECT count(*) AS active_connections FROM pg_stat_activity;" + timeout: 10s + - clusterResources: {} + - copy: + name: postgres-config + namespace: {{ .Release.Namespace }} + selector: + - cnpg.io/cluster + - role=primary + containerPath: /controller/run.json + containerName: postgres + analyzers: + - textAnalyze: + checkName: CloudnativePG Operator Running + fileName: cnpg-operator-logs/*.log + regex: "Starting manager" + outcomes: + - pass: + when: "true" + message: CloudnativePG operator 
is running + - fail: + when: "false" + message: CloudnativePG operator may not be running - check operator pod logs + - textAnalyze: + checkName: PostgreSQL Accepting Connections + fileName: pg-isready-check/*/pg_isready-*.txt + regex: "accepting connections" + outcomes: + - pass: + when: "true" + message: PostgreSQL is accepting connections + - fail: + when: "false" + message: PostgreSQL is not accepting connections - check cluster pod logs + - textAnalyze: + checkName: PostgreSQL Not in Recovery + fileName: cnpg-cluster-status/*/psql-*.txt + regex: 'pg_is_in_recovery[\s|+-]*\bf\b' # psql prints the value on a later line than the column header, so the pattern must span newlines + outcomes: + - pass: + when: "true" + message: Primary PostgreSQL instance is not in recovery mode + - warn: + when: "false" + message: Primary PostgreSQL instance may be in recovery mode + - textAnalyze: + checkName: PostgreSQL WAL Errors + fileName: postgres-cluster-logs/*.log + regex: "FATAL|PANIC|could not write to WAL" + outcomes: + - fail: + when: "true" + message: PostgreSQL logs contain FATAL/PANIC errors or WAL write failures + - pass: + when: "false" + message: No critical PostgreSQL errors detected in logs + - textAnalyze: + checkName: CNPG Failover Events + fileName: cnpg-operator-logs/*.log + regex: "Initiating failover|failover completed" + outcomes: + - warn: + when: "true" + message: CloudnativePG failover events detected - review operator logs for details + - pass: + when: "false" + message: No failover events detected +{{- end -}} diff --git a/applications/mlflow/charts/postgres-support/templates/secret-supportbundle.yaml b/applications/mlflow/charts/postgres-support/templates/secret-supportbundle.yaml new file mode 100644 index 00000000..dd2ddf89 --- /dev/null +++ b/applications/mlflow/charts/postgres-support/templates/secret-supportbundle.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Release.Name }}-postgres-supportbundle + labels: + troubleshoot.sh/kind: support-bundle +type: Opaque +stringData: + support-bundle-spec: | +{{ include 
"postgres-support.supportbundle" . | indent 4 }} diff --git a/applications/mlflow/docs/auto-update.md b/applications/mlflow/docs/auto-update.md new file mode 100644 index 00000000..940b318d --- /dev/null +++ b/applications/mlflow/docs/auto-update.md @@ -0,0 +1,90 @@ +# MLflow Auto-Update Behavior + +This document describes how KOTS automatic updates work with MLflow's multi-chart HelmChart weight ordering, including edge cases and recommended settings. + +## How Auto-Deploy Works + +KOTS can automatically deploy new releases based on semantic versioning. The `semverAutoDeploy` field in the Application manifest controls this behavior: + +| Value | Behavior | +|-------|----------| +| `disabled` | No automatic deployment (manual deploy only) | +| `auto-patch` | Automatically deploy patch updates (e.g., 1.0.1 → 1.0.2) | +| `auto-minor-patch` | Automatically deploy minor and patch updates (e.g., 1.0.x → 1.1.x) | +| `auto-major-minor-patch` | Automatically deploy all semver updates | + +This application uses `auto-patch` as a balanced default: critical fixes deploy automatically while feature releases require manual review. + +## Multi-Chart Weight Ordering + +MLflow uses three HelmChart resources with different weights to control installation and upgrade order. The two listed below carry the operators and the application; the third, `postgres-support` (weight -5, support bundle specs for CloudnativePG PostgreSQL), deploys between them: + +| Chart | Weight | Purpose | +|-------|--------|---------| +| `infra` | -10 | Infrastructure operators (CloudnativePG, MinIO Operator) | +| `mlflow` | 10 | Application chart (MLflow server, database clusters, MinIO tenants) | + +KOTS deploys charts in ascending weight order. Lower weights deploy first. This ordering is critical because: + +1. **Install**: The infra chart installs CRD-providing operators (CloudnativePG, MinIO Operator) before the mlflow chart creates custom resources that depend on those CRDs. +2. **Upgrade**: Operator upgrades (new CRD versions, controller changes) complete before application resources are reconciled against the updated operators. 
+ +When `semverAutoDeploy` triggers an automatic update, KOTS respects this weight ordering. The infra chart upgrades first (`--wait --timeout 600s`), and only after it succeeds does the mlflow chart upgrade begin. + +## Edge Cases + +### CRD Changes During Operator Upgrades + +When a new release updates a CRD-providing operator (e.g., CloudnativePG), the infra chart upgrade installs updated CRDs before the mlflow chart reconciles. Helm's `--wait` flag on the infra chart ensures the operator pod is running and ready before proceeding. However, if the operator needs time to migrate existing custom resources to a new CRD version, the 600-second timeout may not be sufficient for large clusters. + +**Mitigation**: For clusters with many PostgreSQL clusters or MinIO tenants, consider increasing the `--timeout` value in `infra-chart.yaml` or using `disabled` auto-deploy to control upgrade timing. + +### Config Field Changes Requiring Re-Deploy + +Some KOTS config changes (e.g., switching from embedded to external PostgreSQL) alter which charts are deployed via HelmChart `exclude` conditions. Auto-deploy only triggers on new release versions, not config changes. When a config change requires re-deploy: + +1. The admin saves new config values in the KOTS Admin Console +2. KOTS generates a new version from the config change +3. The admin must manually deploy this version, even with auto-deploy enabled + +Auto-deploy does not apply to config-triggered versions — only to upstream releases from the vendor. + +### Semver Rollback Behavior + +KOTS does not automatically deploy versions older than the currently deployed version, even if the channel sequence is higher. For example, if version 1.2.0 is deployed and the vendor publishes 1.1.5 (a backport), auto-deploy skips it because 1.1.5 < 1.2.0 in semver ordering. + +To deploy an older version, `allowRollback` must be enabled and the rollback must be triggered manually from the Admin Console. 
+ +### Required Versions + +If a release is marked as `isRequired` by the vendor, KOTS will not skip it during auto-deploy. Required versions are always deployed in sequence before any later version. This means a required release that introduces a breaking migration will be applied even if a newer patch is also available. + +### Conditional Chart Exclusion + +The infra chart has an `exclude` condition that skips it when external PostgreSQL is configured. When auto-deploy triggers an update in this configuration: + +- Only the mlflow chart deploys (weight ordering is irrelevant with a single chart) +- The operator CRDs remain at their last-installed version +- Ensure external database compatibility is validated before enabling auto-deploy in this configuration + +## Recommended Settings + +### Production Channels + +```yaml +semverAutoDeploy: auto-patch +``` + +Patch-only auto-deploy is recommended for production. This ensures critical bug fixes and security patches are applied automatically while feature releases (minor/major) require explicit review and testing. + +### Development / Staging Channels + +```yaml +semverAutoDeploy: auto-minor-patch +``` + +Development and staging environments benefit from more aggressive auto-deploy to catch integration issues early. Minor version bumps often introduce new features that need validation before reaching production. + +### Air-Gap Environments + +Auto-deploy has no effect in air-gap environments. New releases must be manually uploaded to the Admin Console as airgap bundles. The `semverAutoDeploy` field is ignored when the instance cannot reach the update server. 
diff --git a/applications/mlflow/docs/backup-restore.md b/applications/mlflow/docs/backup-restore.md new file mode 100644 index 00000000..8d92bdb0 --- /dev/null +++ b/applications/mlflow/docs/backup-restore.md @@ -0,0 +1,145 @@ +# MLflow Backup and Restore + +This document describes the backup and restore procedures for MLflow deployed via KOTS with embedded PostgreSQL (CloudnativePG) and MinIO object storage. + +## How KOTS Snapshots Work + +KOTS uses [Velero](https://velero.io/) to create point-in-time snapshots of the application. When `allowSnapshots` is enabled in the Application manifest, the KOTS Admin Console exposes backup and restore controls under the **Snapshots** tab. + +A KOTS snapshot captures: + +- All Kubernetes resources in the application namespace (Deployments, Services, Secrets, ConfigMaps, CRDs, etc.) +- Persistent Volume Claims (PVCs) and their data via Velero's volume snapshot or file-system backup plugins + +### Stateful Volumes + +MLflow has two categories of stateful PVCs that contain critical data: + +| Component | Managed By | PVC Pattern | Data | +|-----------|-----------|-------------|------| +| PostgreSQL | CloudnativePG Operator | `-postgres-` | MLflow experiment metadata, run parameters, metrics | +| MinIO | MinIO Operator | `data-minio-pool-0-` | MLflow model artifacts, datasets, logged files | + +Both operators dynamically provision PVCs. Velero includes all PVCs in the application namespace by default when taking a KOTS snapshot. + +## Prerequisites + +- Velero installed with a compatible storage provider (AWS S3, GCP, Azure, or MinIO as a backup target) +- A configured Velero `BackupStorageLocation` pointing to an external object store (do **not** use the in-cluster MinIO as the backup target) +- The KOTS Admin Console preflight check for Velero should pass before taking backups + +## Full Backup Procedure + +### Via KOTS Admin Console + +1. Open the Admin Console and navigate to **Snapshots** > **Full Snapshots** +2. 
Click **Start a snapshot** +3. Wait for the snapshot to reach **Completed** status +4. Verify the snapshot shows the expected PVC count (PostgreSQL + MinIO volumes) + +### Via KOTS CLI + +```bash +# Create a full snapshot (application + admin console) +kubectl kots backup --namespace <namespace> + +# List existing backups +kubectl kots backup ls --namespace <namespace> +``` + +## Restore Procedure + +Restoring MLflow requires attention to operator ordering. The CloudnativePG and MinIO operators must be running before their managed resources (Cluster CRs, Tenant CRs) are restored, or the restored custom resources will have no controller to reconcile them. + +### Restore Steps + +1. **Ensure operators are installed first.** If restoring to a fresh cluster (disaster recovery), install Embedded Cluster or deploy the infrastructure chart (`infra`) before restoring the application. The infra chart installs the CloudnativePG and MinIO operators. + +2. **Initiate the restore** from the KOTS Admin Console or CLI: + + ```bash + # List available backups + kubectl kots backup ls --namespace <namespace> + + # Restore from a specific backup + kubectl kots restore --from-backup <backup-name> --namespace <namespace> + ``` + +3. **Wait for operators to reconcile.** After restore completes: + - The CloudnativePG operator detects the restored `Cluster` CR and reconciles the PostgreSQL instances against the restored PVC data + - The MinIO operator detects the restored `Tenant` CR and reconciles the MinIO pool against the restored PVC data + +4. 
**Monitor pod readiness:** + + ```bash + # Check PostgreSQL cluster status + kubectl get clusters.postgresql.cnpg.io -n <namespace> + kubectl get pods -l cnpg.io/cluster -n <namespace> + + # Check MinIO tenant status + kubectl get tenants.minio.min.io -n <namespace> + kubectl get pods -l v1.min.io/tenant -n <namespace> + + # Check MLflow deployment + kubectl get deployment mlflow -n <namespace> + ``` + +### Operator Ordering Considerations + +| Scenario | Operator State | Action Required | +|----------|---------------|----------------| +| Restore to existing cluster | Operators already running | No special action; restore proceeds normally | +| Restore to fresh EC install | Operators installed by EC | Ensure EC install completes before restore | +| Restore to fresh KOTS install | Operators in infra chart | Ensure infra chart (weight: -10) deploys first | + +If operators are not present when CRs are restored, the CRs will exist but remain unreconciled. In this case, reinstall the infra chart and the operators will pick up the existing CRs. + +## Verification Steps Post-Restore + +Run these checks after a restore to confirm data integrity: + +### 1. PostgreSQL Connectivity + +```bash +# Verify the CNPG cluster reports as healthy +kubectl get clusters.postgresql.cnpg.io -n <namespace> -o jsonpath='{.items[0].status.phase}' +# Expected: "Cluster in healthy state" + +# Connect and verify data +kubectl exec -it <postgres-pod> -n <namespace> -- psql -U mlflow -d mlflow -c "SELECT count(*) FROM experiments;" +``` + +### 2. MinIO Object Access + +```bash +# Port-forward to MinIO +kubectl port-forward svc/minio -n <namespace> 9000:9000 & + +# Verify bucket contents (requires mc CLI) +mc alias set local http://localhost:9000 <access-key> <secret-key> +mc ls local/mlflow/ +``` + +### 3. MLflow Application Health + +```bash +# Verify MLflow pod is running +kubectl get pods -l app.kubernetes.io/name=mlflow -n <namespace> + +# Check MLflow can read experiments +kubectl port-forward svc/mlflow -n <namespace> 5000:5000 & +curl -s http://localhost:5000/api/2.0/mlflow/experiments/search | head -c 200 +``` + +### 4. 
KOTS Admin Console Status + +After restore, the Admin Console should show: +- Application status: **Ready** +- All status informers green (`deployment/mlflow`, `services/mlflow`) + +## Limitations + +- **External PostgreSQL / S3**: If using external database or object storage (not embedded), those services are outside the KOTS snapshot scope. Back them up independently using your provider's backup tooling. +- **Backup target**: Do not configure Velero to store backups in the same MinIO instance that is being backed up. Use an external storage location. +- **Concurrent writes during backup**: For maximum consistency, consider scaling down the MLflow deployment before taking a snapshot, though Velero's file-system backup is crash-consistent. +- **Large artifacts**: MinIO PVC backups can be large if significant model artifacts are stored. Ensure the Velero backup storage location has sufficient capacity. diff --git a/applications/mlflow/release/kots-app.yaml b/applications/mlflow/release/kots-app.yaml index 8ebca90f..44bced35 100644 --- a/applications/mlflow/release/kots-app.yaml +++ b/applications/mlflow/release/kots-app.yaml @@ -9,6 +9,8 @@ spec: # MLflow MLflow is an open-source platform for managing the end-to-end machine learning lifecycle. 
allowRollback: false + allowSnapshots: true + semverAutoDeploy: auto-patch statusInformers: - deployment/mlflow - services/mlflow diff --git a/applications/mlflow/release/kots-config.yaml b/applications/mlflow/release/kots-config.yaml index 1d84fa01..0b41bd04 100644 --- a/applications/mlflow/release/kots-config.yaml +++ b/applications/mlflow/release/kots-config.yaml @@ -127,6 +127,21 @@ spec: when: '{{repl ConfigOptionEquals "postgres_type" "external_postgres"}}' type: text default: postgres + # License settings (read-only, sourced from license entitlements) + - name: license_settings + title: License + description: License entitlements (read-only, set by your license) + items: + - name: license_tier + title: License Tier + type: text + value: '{{repl LicenseFieldValue "tier"}}' + readonly: true + - name: license_max_users + title: Max Users + type: text + value: '{{repl LicenseFieldValue "max_users"}}' + readonly: true # Object Store Settings - name: objectstore_settings title: Object Storage diff --git a/applications/mlflow/release/kots-preflight.yaml b/applications/mlflow/release/kots-preflight.yaml index 695aad87..c1917555 100644 --- a/applications/mlflow/release/kots-preflight.yaml +++ b/applications/mlflow/release/kots-preflight.yaml @@ -3,6 +3,13 @@ kind: Preflight metadata: name: mlflow spec: + collectors: + - registryImages: + collectorName: critical-images + images: + - 'repl{{ HasLocalRegistry | ternary (print LocalRegistryHost "/" LocalRegistryNamespace "/mlflow:v3.3.2") "ghcr.io/mlflow/mlflow:v3.3.2" }}' + - 'repl{{ HasLocalRegistry | ternary (print LocalRegistryHost "/" LocalRegistryNamespace "/postgresql:15.2") "ghcr.io/cloudnative-pg/postgresql:15.2" }}' + - 'repl{{ HasLocalRegistry | ternary (print LocalRegistryHost "/" LocalRegistryNamespace "/minio:RELEASE.2024-05-01T01-11-10Z") "quay.io/minio/minio:RELEASE.2024-05-01T01-11-10Z" }}' analyzers: - clusterVersion: outcomes: @@ -28,9 +35,45 @@ spec: - pass: message: There are at least 2 cores in the 
cluster - storageClass: - checkName: Check for default storage class + checkName: Storage class for PVC-backed services outcomes: - fail: - message: No default storage class found + message: | + No default storage class found. PostgreSQL and MinIO require persistent + volume claims for data storage. Configure a default storage class before + installing. - pass: - message: Default storage class found + message: Default storage class found for PostgreSQL and MinIO persistent volumes. + - customResourceDefinition: + checkName: Velero is installed for snapshot support + customResourceDefinitionName: backups.velero.io + outcomes: + - warn: + when: "< 1" + message: >- + Velero is not installed. KOTS snapshots for backup and restore + of PostgreSQL and MinIO data volumes will not be available. + Install Velero with a compatible storage provider to enable snapshots. + - pass: + when: ">= 1" + message: Velero is installed and available for KOTS snapshots. + - registryImages: + checkName: Registry and Image Availability + collectorName: critical-images + outcomes: + - fail: + when: "missing > 0" + message: | + One or more required container images are missing from the registry. + In air-gap environments, ensure all images from the airgap bundle have + been pushed to the local registry. In online environments, verify network + access to ghcr.io and quay.io. + Required images: mlflow, cloudnative-pg/postgresql, minio + - warn: + when: "errors > 0" + message: | + Failed to verify one or more container images in the registry. This may + indicate registry authentication issues or network connectivity problems. + Verify the registry is accessible from the cluster. + - pass: + message: All required container images are available in the registry. 
diff --git a/applications/mlflow/release/mlflow-chart.yaml b/applications/mlflow/release/mlflow-chart.yaml index e224a720..5dea8ddf 100644 --- a/applications/mlflow/release/mlflow-chart.yaml +++ b/applications/mlflow/release/mlflow-chart.yaml @@ -31,6 +31,9 @@ spec: accessKeyId: repl{{ ConfigOption "embedded_s3_access_key" }} secretAccessKey: repl{{ ConfigOption "embedded_s3_secret_key" }} podAntiAffinityTopologyKey: "kubernetes.io/hostname" + license: # entitlement values are quoted so they always reach the chart as strings + tier: 'repl{{ ConfigOption "license_tier" }}' + maxUsers: 'repl{{ ConfigOption "license_max_users" }}' postgres: auth: password: repl{{ ConfigOption "embedded_postgres_password"}} diff --git a/applications/mlflow/release/postgres-support-chart.yaml b/applications/mlflow/release/postgres-support-chart.yaml new file mode 100644 index 00000000..bad100a4 --- /dev/null +++ b/applications/mlflow/release/postgres-support-chart.yaml @@ -0,0 +1,12 @@ +apiVersion: kots.io/v1beta2 +kind: HelmChart +metadata: + name: postgres-support +spec: + chart: + name: postgres-support + chartVersion: 0.1.0 + exclude: 'repl{{ ConfigOptionEquals `postgres_type` `external_postgres` }}' + weight: -5 + values: {} + builder: {}