diff --git a/applications/mlflow/Taskfile.yml b/applications/mlflow/Taskfile.yml
index 3f65a07a..23fd8a63 100644
--- a/applications/mlflow/Taskfile.yml
+++ b/applications/mlflow/Taskfile.yml
@@ -15,7 +15,7 @@ vars:
   PORT: 5000
 
   # Chart configuration
-  CHARTS: mlflow infra
+  CHARTS: mlflow infra postgres-support
 
   # Environment detection
   CI:
@@ -39,7 +39,9 @@ vars:
     sh: helm show chart ./charts/mlflow | grep '^version:' | cut -d ' ' -f 2
   INFRA_VERSION:
     sh: helm show chart ./charts/infra | grep '^version:' | cut -d ' ' -f 2
-  
+  POSTGRES_SUPPORT_VERSION:
+    sh: helm show chart ./charts/postgres-support | grep '^version:' | cut -d ' ' -f 2
+
   # Release configuration
   # APP_NAME can be overridden by setting REPLICATED_APP environment variable
   APP_NAME: '{{.REPLICATED_APP | default "diamon-mlflow"}}'
@@ -338,10 +340,12 @@ tasks:
         # Get chart versions
         MLFLOW_VERSION="{{.MLFLOW_VERSION}}"
         INFRA_VERSION="{{.INFRA_VERSION}}"
-        
+        POSTGRES_SUPPORT_VERSION="{{.POSTGRES_SUPPORT_VERSION}}"
+
         echo "Working with chart versions:"
         echo "MLflow chart version: $MLFLOW_VERSION"
         echo "Infra chart version: $INFRA_VERSION"
+        echo "Postgres Support chart version: $POSTGRES_SUPPORT_VERSION"
 
         # Update MLflow HelmChart manifest
         MLFLOW_HELMCHART="{{.RELEASE_DIR}}/mlflow-chart.yaml"
@@ -374,7 +378,21 @@ tasks:
         else
           echo "⚠️ Infra HelmChart not found at $INFRA_HELMCHART"
         fi
-        
+
+        # Update Postgres Support HelmChart manifest
+        PG_SUPPORT_HELMCHART="{{.RELEASE_DIR}}/postgres-support-chart.yaml"
+        if [ -f "$PG_SUPPORT_HELMCHART" ]; then
+          echo "Updating version in $PG_SUPPORT_HELMCHART to $POSTGRES_SUPPORT_VERSION..."
+
+          if command -v yq &> /dev/null; then
+            yq eval ".spec.chart.chartVersion = \"$POSTGRES_SUPPORT_VERSION\"" -i "$PG_SUPPORT_HELMCHART"
+          else
+            sed -i.bak "s/chartVersion:.*/chartVersion: \"$POSTGRES_SUPPORT_VERSION\"/" "$PG_SUPPORT_HELMCHART" && rm "${PG_SUPPORT_HELMCHART}.bak"
+          fi
+        else
+          echo "⚠️ Postgres Support HelmChart not found at $PG_SUPPORT_HELMCHART"
+        fi
+
         echo "✅ Release manifest versions updated successfully."
       - cmd: task versions:verify || echo "⚠️ Version check failed after update. Please verify manually."
         ignore_error: true
diff --git a/applications/mlflow/charts/postgres-support/Chart.yaml b/applications/mlflow/charts/postgres-support/Chart.yaml
new file mode 100644
index 00000000..105f3b2c
--- /dev/null
+++ b/applications/mlflow/charts/postgres-support/Chart.yaml
@@ -0,0 +1,5 @@
+apiVersion: v2
+name: postgres-support
+description: Support bundle specs for CloudnativePG PostgreSQL
+type: application
+version: 0.1.0
diff --git a/applications/mlflow/charts/postgres-support/templates/_supportbundle.tpl b/applications/mlflow/charts/postgres-support/templates/_supportbundle.tpl
new file mode 100644
index 00000000..05b8248a
--- /dev/null
+++ b/applications/mlflow/charts/postgres-support/templates/_supportbundle.tpl
@@ -0,0 +1,130 @@
+{{/*
+postgres-support.supportbundle
+
+Renders a troubleshoot.sh/v1beta2 SupportBundle spec covering the CloudnativePG
+operator and the PostgreSQL cluster pods in the release namespace. The chart's
+support-bundle Secret embeds the rendered text as a string.
+
+Notes:
+- namespace is quoted so an all-digit namespace is still read as a string.
+- Both exec collectors set collectorName so the stdout file name is fixed; the
+  analyzers assume the documented exec collector layout
+  <name>/<namespace>/<pod>/<collectorName>-stdout.txt (TODO: confirm against a
+  generated bundle).
+- The recovery regex uses \s to span lines: psql prints the column header, the
+  dashed rule and the value on separate lines, and "." does not match a
+  newline.
+*/}}
+{{- define "postgres-support.supportbundle" -}}
+apiVersion: troubleshoot.sh/v1beta2
+kind: SupportBundle
+metadata:
+  name: postgres-supportbundle
+spec:
+  collectors:
+    - logs:
+        name: cnpg-operator-logs
+        namespace: {{ .Release.Namespace | quote }}
+        selector:
+          - app.kubernetes.io/name=cloudnative-pg
+        limits:
+          maxAge: 720h
+          maxLines: 10000
+    - logs:
+        name: postgres-cluster-logs
+        namespace: {{ .Release.Namespace | quote }}
+        selector:
+          - cnpg.io/cluster
+        limits:
+          maxAge: 720h
+          maxLines: 10000
+    - exec:
+        collectorName: pg-isready
+        name: pg-isready-check
+        namespace: {{ .Release.Namespace | quote }}
+        selector:
+          - cnpg.io/cluster
+          - role=primary
+        command: ["pg_isready"]
+        args: ["-U", "postgres"]
+        timeout: 10s
+    - exec:
+        collectorName: cluster-status
+        name: cnpg-cluster-status
+        namespace: {{ .Release.Namespace | quote }}
+        selector:
+          - cnpg.io/cluster
+          - role=primary
+        command: ["psql"]
+        args:
+          - "-U"
+          - "postgres"
+          - "-c"
+          - "SELECT version(); SELECT pg_is_in_recovery(); SELECT count(*) AS active_connections FROM pg_stat_activity;"
+        timeout: 10s
+    - clusterResources: {}
+    - copy:
+        name: postgres-config
+        namespace: {{ .Release.Namespace | quote }}
+        selector:
+          - cnpg.io/cluster
+          - role=primary
+        containerPath: /controller/run.json
+        containerName: postgres
+  analyzers:
+    - textAnalyze:
+        checkName: CloudnativePG Operator Running
+        fileName: cnpg-operator-logs/*.log
+        regex: "Starting manager"
+        outcomes:
+          - pass:
+              when: "true"
+              message: CloudnativePG operator is running
+          - fail:
+              when: "false"
+              message: CloudnativePG operator may not be running - check operator pod logs
+    - textAnalyze:
+        checkName: PostgreSQL Accepting Connections
+        fileName: pg-isready-check/*/*/pg-isready-stdout.txt
+        regex: "accepting connections"
+        outcomes:
+          - pass:
+              when: "true"
+              message: PostgreSQL is accepting connections
+          - fail:
+              when: "false"
+              message: PostgreSQL is not accepting connections - check cluster pod logs
+    - textAnalyze:
+        checkName: PostgreSQL Not in Recovery
+        fileName: cnpg-cluster-status/*/*/cluster-status-stdout.txt
+        regex: 'pg_is_in_recovery\s+-+\s+f\b'
+        outcomes:
+          - pass:
+              when: "true"
+              message: Primary PostgreSQL instance is not in recovery mode
+          - warn:
+              when: "false"
+              message: Primary PostgreSQL instance may be in recovery mode
+    - textAnalyze:
+        checkName: PostgreSQL WAL Errors
+        fileName: postgres-cluster-logs/*.log
+        regex: "FATAL|PANIC|could not write to WAL"
+        outcomes:
+          - fail:
+              when: "true"
+              message: PostgreSQL logs contain FATAL/PANIC errors or WAL write failures
+          - pass:
+              when: "false"
+              message: No critical PostgreSQL errors detected in logs
+    - textAnalyze:
+        checkName: CNPG Failover Events
+        fileName: cnpg-operator-logs/*.log
+        regex: "Initiating failover|failover completed"
+        outcomes:
+          - warn:
+              when: "true"
+              message: CloudnativePG failover events detected - review operator logs for details
+          - pass:
+              when: "false"
+              message: No failover events detected
+{{- end -}}
diff --git a/applications/mlflow/charts/postgres-support/templates/secret-supportbundle.yaml b/applications/mlflow/charts/postgres-support/templates/secret-supportbundle.yaml
new file mode 100644
index 00000000..dd2ddf89
--- /dev/null
+++ b/applications/mlflow/charts/postgres-support/templates/secret-supportbundle.yaml
@@ -0,0 +1,14 @@
+{{- /*
+Secret carrying the rendered SupportBundle spec under the support-bundle-spec
+key, labelled troubleshoot.sh/kind=support-bundle.
+*/ -}}
+apiVersion: v1
+kind: Secret
+metadata:
+  name: {{ .Release.Name }}-postgres-supportbundle
+  labels:
+    troubleshoot.sh/kind: support-bundle
+type: Opaque
+stringData:
+  support-bundle-spec: |
+{{ include "postgres-support.supportbundle" . | indent 4 }}
diff --git a/applications/mlflow/release/postgres-support-chart.yaml b/applications/mlflow/release/postgres-support-chart.yaml
new file mode 100644
index 00000000..bad100a4
--- /dev/null
+++ b/applications/mlflow/release/postgres-support-chart.yaml
@@ -0,0 +1,14 @@
+apiVersion: kots.io/v1beta2
+kind: HelmChart
+metadata:
+  name: postgres-support
+spec:
+  chart:
+    name: postgres-support
+    # Version below is rewritten by the Taskfile release-manifest update step.
+    chartVersion: 0.1.0
+  # Skip this chart when the postgres_type config option equals external_postgres.
+  exclude: 'repl{{ ConfigOptionEquals `postgres_type` `external_postgres` }}'
+  weight: -5
+  values: {}
+  builder: {}