replicatedhq · kriscoleman · May 25, 2026 · May 25, 2026 · May 25, 2026 · May 25, 2026
diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml
@@ -176,7 +176,7 @@ jobs:
           - distribution: k3s
             version: 1.32
             nodes: 1
-          - distribution: gke
+          - distribution: eks
             version: 1.32
             nodes: 3
         config:
@@ -315,7 +315,7 @@ jobs:
           - distribution: k3s
             version: 1.32
             nodes: 1
-          - distribution: gke
+          - distribution: eks
             version: 1.32
             nodes: 3
     steps:

diff --git a/.gitignore b/.gitignore
@@ -63,5 +63,10 @@ applications/wg-easy/release/
 applications/flipt/release/
 applications/flipt/chart/Chart.lock
 
-**/.claude/settings.local.json
+.claude/
+CLAUDE.md
 .worktrees/
+
+# Gas Town workspace artifacts
+.beads/
+.runtime/
diff --git a/applications/mlflow/README.md b/applications/mlflow/README.md
@@ -32,6 +32,7 @@ helm install mlflow oci://registry.replicated.com/mlflow/stable
 
 - [MLflow Helm Chart Documentation](./charts/mlflow/README.md) - Installation and configuration details
 - [Configuration Reference](./charts/mlflow/README_CONFIG.md) - Detailed configuration options
+- [Auto-Update Behavior](./docs/auto-update.md) - How KOTS auto-deploy works with multi-chart weight ordering
 - [Development Guide](./DEVELOPMENT.md) - Guide for development including containerized environment
 
 ## For Developers
@@ -97,6 +98,28 @@ This solution offers flexibility in how you store MLflow data:
 
 See the [Configuration Reference](./charts/mlflow/README_CONFIG.md) for detailed setup instructions.
 
+## Preflight Checks
+
+KOTS installations run automated preflight checks to validate the target environment before deploying. These checks catch common issues early and provide actionable remediation guidance.
+
+| Check | Type | What it validates |
+|-------|------|-------------------|
+| Kubernetes Version | Cluster | Kubernetes 1.21+ required, 1.28+ recommended |
+| CPU Capacity | Node Resources | At least 4 CPU cores across all nodes |
+| Storage Class | Storage | A default storage class exists for PostgreSQL and MinIO PVCs |
+| Registry & Image Availability | Air-gap | Critical container images (mlflow, postgresql, minio) are accessible in the configured registry |
+
+### Air-Gap Image Validation
+
+The registry preflight check validates that critical container images are available before installation begins. In air-gap environments, this confirms all images from the airgap bundle were successfully pushed to the local registry. In online environments, it verifies network access to upstream registries (ghcr.io, quay.io).
+
+Images validated:
+- `mlflow` — MLflow tracking server
+- `cloudnative-pg/postgresql` — PostgreSQL database for metadata storage
+- `minio` — S3-compatible object storage for artifacts
+
+If this check fails in an air-gap environment, re-push the airgap bundle to the local registry. In online environments, verify that the cluster has outbound network access to the image registries.
+
 ## Getting Started
 
 ### Prerequisites

diff --git a/applications/mlflow/Taskfile.yml b/applications/mlflow/Taskfile.yml
@@ -15,7 +15,7 @@ vars:
   PORT: 5000
 
   # Chart configuration
-  CHARTS: mlflow infra
+  CHARTS: mlflow infra postgres-support
 
   # Environment detection
   CI:
@@ -39,7 +39,9 @@ vars:
     sh: helm show chart ./charts/mlflow | grep '^version:' | cut -d ' ' -f 2
   INFRA_VERSION:
     sh: helm show chart ./charts/infra | grep '^version:' | cut -d ' ' -f 2
-
+  POSTGRES_SUPPORT_VERSION:
+    sh: helm show chart ./charts/postgres-support | grep '^version:' | cut -d ' ' -f 2
+
   # Release configuration
   # APP_NAME can be overridden by setting REPLICATED_APP environment variable 
   APP_NAME: '{{.REPLICATED_APP | default "diamon-mlflow"}}'
@@ -338,10 +340,12 @@ tasks:
         # Get chart versions
         MLFLOW_VERSION="{{.MLFLOW_VERSION}}"
         INFRA_VERSION="{{.INFRA_VERSION}}"
-
+        POSTGRES_SUPPORT_VERSION="{{.POSTGRES_SUPPORT_VERSION}}"
+
         echo "Working with chart versions:"
         echo "MLflow chart version: $MLFLOW_VERSION"
         echo "Infra chart version: $INFRA_VERSION"
+        echo "Postgres Support chart version: $POSTGRES_SUPPORT_VERSION"
 
         # Update MLflow HelmChart manifest
         MLFLOW_HELMCHART="{{.RELEASE_DIR}}/mlflow-chart.yaml"
@@ -374,7 +378,21 @@ tasks:
         else
           echo "⚠️ Infra HelmChart not found at $INFRA_HELMCHART"
         fi
-
+
+        # Update Postgres Support HelmChart manifest
+        PG_SUPPORT_HELMCHART="{{.RELEASE_DIR}}/postgres-support-chart.yaml"
+        if [ -f "$PG_SUPPORT_HELMCHART" ]; then
+          echo "Updating version in $PG_SUPPORT_HELMCHART to $POSTGRES_SUPPORT_VERSION..."
+
+          if command -v yq &> /dev/null; then
+            yq eval ".spec.chart.chartVersion = \"$POSTGRES_SUPPORT_VERSION\"" -i "$PG_SUPPORT_HELMCHART"
+          else
+            sed -i.bak "s/chartVersion:.*/chartVersion: \"$POSTGRES_SUPPORT_VERSION\"/" "$PG_SUPPORT_HELMCHART" && rm "${PG_SUPPORT_HELMCHART}.bak"
+          fi
+        else
+          echo "⚠️ Postgres Support HelmChart not found at $PG_SUPPORT_HELMCHART"
+        fi
+
         echo "✅ Release manifest versions updated successfully."
       - cmd: task versions:verify || echo "⚠️ Version check failed after update. Please verify manually."
         ignore_error: true

diff --git a/applications/mlflow/charts/mlflow/templates/deployment.yaml b/applications/mlflow/charts/mlflow/templates/deployment.yaml
@@ -214,6 +214,8 @@ spec:
         {{- end }}
         {{- end }}
         volumeMounts:
+        - name: tmp
+          mountPath: /tmp
         {{- if .Values.mlflow.extraPipPackages }}
         - name: pip-packages
           mountPath: /pip-packages
@@ -226,7 +228,11 @@ spec:
         {{- with .Values.mlflow.extraVolumeMounts }}
           {{- toYaml . | nindent 8 }}
         {{- end }}
-        {{- with .Values.mlflow.resources }}
+        {{- $resources := .Values.mlflow.resources | default dict }}
+        {{- if and .Values.gpu.enabled .Values.gpu.resources }}
+        {{- $resources = mergeOverwrite (deepCopy $resources) .Values.gpu.resources }}
+        {{- end }}
+        {{- with $resources }}
         resources:
           {{- toYaml . | nindent 10 }}
         {{- end }}
@@ -256,6 +262,8 @@ spec:
         {{ toYaml . | nindent 6 }}
       {{- end }}
       volumes:
+      - name: tmp
+        emptyDir: {}
       {{- if .Values.mlflow.extraPipPackages }}
       - name: pip-packages
         emptyDir: {}
@@ -276,7 +284,11 @@ spec:
       hostAliases:
         {{- toYaml . | nindent 8 }}
       {{- end }}
-      {{- with .Values.mlflow.nodeSelector }}
+      {{- $nodeSelector := .Values.mlflow.nodeSelector | default dict }}
+      {{- if and .Values.gpu.enabled .Values.gpu.nodeSelector }}
+      {{- /* merge mutates its first argument: deepCopy keeps .Values.gpu.nodeSelector intact; GPU keys still win on conflict */}}{{- $nodeSelector = merge (deepCopy .Values.gpu.nodeSelector) $nodeSelector }}
+      {{- end }}
+      {{- with $nodeSelector }}
       nodeSelector:
         {{- toYaml . | nindent 8 }}
       {{- end }}
@@ -313,7 +325,11 @@ spec:
       topologySpreadConstraints:
         {{- toYaml . | nindent 8 }}
       {{- end }}
-      {{- with .Values.mlflow.tolerations }}
+      {{- $tolerations := .Values.mlflow.tolerations | default list }}
+      {{- if and .Values.gpu.enabled .Values.gpu.tolerations }}
+      {{- $tolerations = concat $tolerations .Values.gpu.tolerations }}
+      {{- end }}
+      {{- with $tolerations }}
       tolerations:
         {{- toYaml . | nindent 8 }}
       {{- end }}
diff --git a/applications/mlflow/charts/mlflow/templates/license-configmap.yaml b/applications/mlflow/charts/mlflow/templates/license-configmap.yaml
@@ -0,0 +1,15 @@
+{{- if or .Values.license.tier .Values.license.maxUsers }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "mlflow.fullname" . }}-license
+  labels:
+    {{- include "mlflow.labels" . | nindent 4 }}
+data:
+  {{- with .Values.license.tier }}
+  LICENSE_TIER: {{ . | quote }}
+  {{- end }}
+  {{- with .Values.license.maxUsers }}
+  LICENSE_MAX_USERS: {{ . | quote }}
+  {{- end }}
+{{- end }}
diff --git a/applications/mlflow/charts/mlflow/templates/networkpolicy.yaml b/applications/mlflow/charts/mlflow/templates/networkpolicy.yaml
@@ -0,0 +1,63 @@
+{{- if .Values.networkPolicy.enabled }}
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: {{ include "mlflow.fullname" . }}
+  labels:
+    {{- include "mlflow.labels" . | nindent 4 }}
+spec:
+  podSelector:
+    matchLabels:
+      {{- include "mlflow.selectorLabels" . | nindent 6 }}
+  policyTypes:
+    - Ingress
+    - Egress
+  ingress:
+    {{- if .Values.networkPolicy.allowSameNamespace }}
+    # Allow all traffic from pods in the same namespace
+    - from:
+        - podSelector: {}
+    {{- end }}
+    {{- if .Values.networkPolicy.ingress.ingressNginx.enabled }}
+    # Allow ingress-nginx to reach mlflow on the tracking server port
+    - from:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: {{ .Values.networkPolicy.ingress.ingressNginx.namespace }}
+      ports:
+        - protocol: TCP
+          port: {{ .Values.mlflow.service.port }}
+    {{- end }}
+  egress:
+    {{- if .Values.networkPolicy.allowSameNamespace }}
+    # Allow all traffic to pods in the same namespace
+    - to:
+        - podSelector: {}
+    {{- end }}
+    {{- if .Values.networkPolicy.egress.dns.enabled }}
+    # Allow DNS resolution (pods in any namespace, DNS port only)
+    - to:
+        - namespaceSelector: {}
+      ports:
+        - protocol: UDP
+          port: {{ .Values.networkPolicy.egress.dns.port }}
+        - protocol: TCP
+          port: {{ .Values.networkPolicy.egress.dns.port }}
+    {{- end }}
+    {{- if .Values.networkPolicy.egress.postgres.enabled }}
+    # Allow mlflow to reach postgres (any pod in this namespace, postgres port only)
+    - to:
+        - podSelector: {}
+      ports:
+        - protocol: TCP
+          port: {{ .Values.networkPolicy.egress.postgres.port }}
+    {{- end }}
+    {{- if .Values.networkPolicy.egress.minio.enabled }}
+    # Allow mlflow to reach minio (any pod in this namespace, minio port only)
+    - to:
+        - podSelector: {}
+      ports:
+        - protocol: TCP
+          port: {{ .Values.networkPolicy.egress.minio.port }}
+    {{- end }}
+{{- end }}
diff --git a/applications/mlflow/charts/mlflow/values.yaml b/applications/mlflow/charts/mlflow/values.yaml
@@ -47,11 +47,22 @@ mlflow:
   # -- Pod Labels for the mlflow deployment
   podLabels: {}
   # -- Configure the Security Context for the Pod
-  podSecurityContext: {}
+  podSecurityContext:
+    runAsNonRoot: true
+    runAsUser: 1000
+    runAsGroup: 1000
+    fsGroup: 1000
+    seccompProfile:
+      type: RuntimeDefault
   # -- Set the resource requests / limits for the container.
   resources: {}
   # -- Configure the Security Context for the Container
-  containerSecurityContext: {}
+  containerSecurityContext:
+    readOnlyRootFilesystem: true
+    allowPrivilegeEscalation: false
+    capabilities:
+      drop:
+        - ALL
   # -- Specify probes for the container
   # [[ref]](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/)
   probes:
@@ -325,10 +336,74 @@ mlflow:
         # -- Specify whether to ignore TLS
         ignoreTls: false
 
+# -- License entitlements passed from KOTS Config via HelmChart CR values
+license:
+  # -- License tier (e.g. "community", "team", "enterprise")
+  tier: ""
+  # -- Maximum number of users allowed by the license
+  maxUsers: ""
+
+# -- GPU node scheduling configuration for ML/AI workloads
+# When enabled, merges GPU-specific nodeSelector, tolerations, and resource
+# limits into the mlflow deployment alongside any existing scheduling config.
+#
+# This pattern uses NVIDIA GPU resources by default. To adapt for other providers:
+#   - AMD ROCm: change resource key to "amd.com/gpu", update nodeSelector/tolerations
+#     to match your AMD GPU node labels (e.g., "amd.com/gpu.present: 'true'")
+#   - Intel: use "gpu.intel.com/i915" as the resource key
+#   - Cloud node labels (nodeSelector keys, not resource keys): GKE "cloud.google.com/gke-accelerator",
+#     EKS "k8s.amazonaws.com/accelerator", AKS "kubernetes.azure.com/accelerator"
+gpu:
+  # -- Enable GPU node scheduling
+  enabled: false
+  # -- Node selector for GPU-capable nodes
+  # Common labels: "nvidia.com/gpu.present: 'true'" (NVIDIA GPU Operator),
+  # "cloud.google.com/gke-accelerator: nvidia-tesla-t4" (GKE)
+  nodeSelector: {}
+  #   nvidia.com/gpu.present: "true"
+  # -- Tolerations for GPU node taints
+  # Most GPU node pools use taints to prevent non-GPU workloads from scheduling
+  tolerations: []
+  #   - key: nvidia.com/gpu
+  #     operator: Exists
+  #     effect: NoSchedule
+  # -- Resource limits for GPU allocation
+  resources:
+    limits: {}
+    #   nvidia.com/gpu: "1"
+
 replicated:
   # -- Specifies whetherto enable the Replicated SDK
   enabled: true
 
+# -- NetworkPolicy configuration for restricting pod traffic
+networkPolicy:
+  # -- Specifies whether to create a NetworkPolicy resource
+  enabled: false
+  # -- Allow all ingress from and egress to pods in the same namespace
+  allowSameNamespace: true
+  # -- Ingress rules
+  ingress:
+    # -- Allow ingress-nginx to reach mlflow on the tracking server port
+    ingressNginx:
+      enabled: true
+      # -- Namespace where the ingress controller lives
+      namespace: ingress-nginx
+  # -- Egress rules
+  egress:
+    # -- Allow mlflow to reach postgres on port 5432
+    postgres:
+      enabled: true
+      port: 5432
+    # -- Allow mlflow to reach minio on port 9000
+    minio:
+      enabled: true
+      port: 9000
+    # -- Allow DNS resolution (required for service discovery)
+    dns:
+      enabled: true
+      port: 53
+
 minio:
   enabled: true
   secrets:

diff --git a/applications/mlflow/charts/postgres-support/Chart.yaml b/applications/mlflow/charts/postgres-support/Chart.yaml
@@ -0,0 +1,5 @@
+apiVersion: v2
+name: postgres-support
+description: Support bundle specs for CloudNativePG PostgreSQL
+type: application
+version: 0.1.0