diff --git a/.github/workflows/README.md b/.github/workflows/README.md
new file mode 100644
index 0000000..6168356
--- /dev/null
+++ b/.github/workflows/README.md
@@ -0,0 +1,81 @@
+# Manual Cloud Deploy Workflows
+
+This repo uses three manual deployment workflows and one reusable validation workflow:
+
+- `deploy-aws-rds.yml`
+- `deploy-gcp-postgres.yml`
+- `azure-postgres-opentofu.yml`
+- `managed-db-validate.yml` (reusable via `workflow_call`)
+
+Deploy workflows are run manually from the Actions tab.
+
+AWS and GCP also support trusted PR comment triggers:
+
+- AWS: `/deploy-aws-rds [target|command] [command]`
+- GCP: `/deploy-gcp-pg [target|command] [command]`
+
+## Deploy Inputs
+
+AWS and GCP workflows support `target` (`pg15`-`pg18` or `all`) and `command` (`plan`, `apply`, `destroy`).
+
+Azure workflow supports:
+
+- `action`: `plan`, `apply`, `destroy`
+- `postgres_version`: `pg15`, `pg16`, `pg17`, `pg18`
+- `personal_ip`: optional (falls back to secret)
+
+## Secrets
+
+### AWS
+
+- `AWS_ACCESS_KEY_ID`
+- `AWS_SECRET_ACCESS_KEY`
+- `AWS_ALLOWED_CIDR_BLOCK`
+- `AWS_DB_PASSWORD`
+
+### GCP
+
+- `GCP_SA_KEY`
+- `DEPLOY_PERSONAL_IP_CIDR` (unless provided as workflow input)
+- `GCP_DB_PASSWORD`
+
+### Azure
+
+- OIDC: `AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, `AZURE_SUBSCRIPTION_ID`
+  - or service principal JSON: `AZURE_CREDENTIALS`
+- `AZURE_PERSONAL_IP` (unless provided as workflow input)
+- `AZURE_DB_PASSWORD`
+
+### Shared deploy controls
+
+- `DEPLOY_TRIGGER_USER` (a repository variable, not a secret; the AWS/GCP workflows read it as `vars.DEPLOY_TRIGGER_USER` for manual and comment-triggered deploy checks)
+
+## Validation Workflow
+
+`managed-db-validate.yml` installs `pgFirstAid.sql`, recreates `view_pgFirstAid_managed.sql`, and runs integration tests (including pgTAP coverage through the integration test harness).
+
+It supports three connection modes:
+
+- `direct`: caller passes `pg_host`
+- `aws`: resolves host from `aws_db_identifier`
+- `gcp`: resolves host from `gcp_project_id` + `gcp_instance_name`
+
+Current wiring:
+
+- Azure apply calls `managed-db-validate.yml` automatically after deploy.
+- AWS apply calls `managed-db-validate.yml` for each selected version after deploy.
+- GCP apply calls `managed-db-validate.yml` for each selected version after deploy.
+
+## Secret Handling
+
+- DB passwords are passed to OpenTofu as `TF_VAR_db_password`.
+- Password variables in the OpenTofu stacks are marked `sensitive = true`.
+- Workflows use step-level environment variables and masking for secret values used in shell steps.
+- Avoid printing secret values in custom debug statements.
+
+## Recommended Run Order
+
+1. Run `plan`
+2. Run `apply`
+3. Confirm validation results
+4. Run `destroy` when done with test resources
diff --git a/.github/workflows/azure-postgres-opentofu.yml b/.github/workflows/azure-postgres-opentofu.yml
new file mode 100644
index 0000000..32f519d
--- /dev/null
+++ b/.github/workflows/azure-postgres-opentofu.yml
@@ -0,0 +1,168 @@
+# Manually dispatched OpenTofu plan/apply/destroy for a single Azure
+# PostgreSQL version. After an apply, the connection details are handed to the
+# reusable managed-db-validate workflow.
+name: Azure PostgreSQL OpenTofu
+
+on:
+  workflow_dispatch:
+    inputs:
+      action:
+        description: "OpenTofu action"
+        required: true
+        type: choice
+        default: plan
+        options:
+          - plan
+          - apply
+          - destroy
+      postgres_version:
+        description: "Target PostgreSQL version"
+        required: true
+        type: choice
+        default: pg18
+        options:
+          - pg15
+          - pg16
+          - pg17
+          - pg18
+      personal_ip:
+        description: "IP allowed to connect (example: 203.0.113.10). Leave blank to use AZURE_PERSONAL_IP secret."
+        required: false
+        type: string
+
+concurrency:
+  group: azure-postgres-${{ inputs.postgres_version }}
+  cancel-in-progress: false
+
+jobs:
+  opentofu:
+    name: ${{ inputs.action }} ${{ inputs.postgres_version }}
+    runs-on: [self-hosted, linux, pgfirstaid-ci]
+    outputs:
+      pg_host: ${{ steps.capture_connection.outputs.pg_host }}
+      pg_port: ${{ steps.capture_connection.outputs.pg_port }}
+      pg_user: ${{ steps.capture_connection.outputs.pg_user }}
+      pg_database: ${{ steps.capture_connection.outputs.pg_database }}
+    permissions:
+      contents: read
+      id-token: write
+    defaults:
+      run:
+        working-directory: testing/azure/deploy/${{ inputs.postgres_version }}
+
+    env:
+      TF_IN_AUTOMATION: "true"
+      # The secrets context is not available in step-level `if:` expressions,
+      # so the OIDC-vs-service-principal decision is computed once here and
+      # read back through the env context.
+      AZURE_USE_OIDC: ${{ secrets.AZURE_CLIENT_ID != '' && secrets.AZURE_TENANT_ID != '' && secrets.AZURE_SUBSCRIPTION_ID != '' }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          clean: false
+
+      - name: Azure login (OIDC)
+        if: ${{ env.AZURE_USE_OIDC == 'true' }}
+        uses: azure/login@v2
+        with:
+          client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+
+      - name: Azure login (service principal JSON)
+        if: ${{ env.AZURE_USE_OIDC != 'true' }}
+        uses: azure/login@v2
+        with:
+          creds: ${{ secrets.AZURE_CREDENTIALS }}
+
+      - name: Setup OpenTofu
+        uses: opentofu/setup-opentofu@v1
+
+      - name: Resolve personal IP
+        shell: bash
+        env:
+          # Passed through the environment rather than interpolated into the
+          # script so the input text is never parsed as shell code.
+          INPUT_PERSONAL_IP: ${{ inputs.personal_ip }}
+          AZURE_PERSONAL_IP: ${{ secrets.AZURE_PERSONAL_IP }}
+        run: |
+          PERSONAL_IP="$INPUT_PERSONAL_IP"
+          if [ -z "$PERSONAL_IP" ]; then
+            PERSONAL_IP="$AZURE_PERSONAL_IP"
+          fi
+
+          if [ -z "$PERSONAL_IP" ]; then
+            echo "::error::No personal IP provided. Set input 'personal_ip' or secret 'AZURE_PERSONAL_IP'."
+            exit 1
+          fi
+
+          echo "TF_VAR_personal_ip=$PERSONAL_IP" >> "$GITHUB_ENV"
+
+      - name: Resolve DB password
+        shell: bash
+        env:
+          AZURE_DB_PASSWORD: ${{ secrets.AZURE_DB_PASSWORD }}
+        run: |
+          DB_PASSWORD="$AZURE_DB_PASSWORD"
+
+          if [ -z "$DB_PASSWORD" ]; then
+            echo "::error::Missing secret 'AZURE_DB_PASSWORD'."
+            exit 1
+          fi
+
+          # Register the mask before exporting the value to later steps.
+          echo "::add-mask::$DB_PASSWORD"
+          echo "TF_VAR_db_password=$DB_PASSWORD" >> "$GITHUB_ENV"
+
+      - name: OpenTofu init
+        run: tofu init -input=false
+
+      - name: OpenTofu validate
+        run: tofu validate
+
+      # Runs for both plan and apply; apply consumes the saved tfplan file.
+      - name: OpenTofu plan
+        if: ${{ inputs.action == 'plan' || inputs.action == 'apply' }}
+        run: tofu plan -input=false -out=tfplan
+
+      - name: OpenTofu apply
+        if: ${{ inputs.action == 'apply' }}
+        run: tofu apply -input=false -auto-approve tfplan
+
+      - name: Show connection details
+        if: ${{ inputs.action == 'apply' }}
+        run: |
+          echo "Server: $(tofu output -raw server_name)"
+          echo "FQDN: $(tofu output -raw server_fqdn)"
+          echo "Database: $(tofu output -raw database_name)"
+
+      # Published as job outputs and consumed by the validate job.
+      - name: Capture connection outputs
+        id: capture_connection
+        if: ${{ inputs.action == 'apply' }}
+        run: |
+          echo "pg_host=$(tofu output -raw server_fqdn)" >> "$GITHUB_OUTPUT"
+          echo "pg_port=5432" >> "$GITHUB_OUTPUT"
+          echo "pg_user=$(tofu output -raw db_user)" >> "$GITHUB_OUTPUT"
+          echo "pg_database=$(tofu output -raw database_name)" >> "$GITHUB_OUTPUT"
+
+      - name: OpenTofu destroy
+        if: ${{ inputs.action == 'destroy' }}
+        run: tofu destroy -input=false -auto-approve
+
+  # Runs the reusable validation workflow against the server created by apply.
+  validate:
+    if: ${{ inputs.action == 'apply' }}
+    needs: opentofu
+    uses: ./.github/workflows/managed-db-validate.yml
+    with:
+      pg_host: ${{ needs.opentofu.outputs.pg_host }}
+      pg_port: ${{ needs.opentofu.outputs.pg_port }}
+      pg_user: ${{ needs.opentofu.outputs.pg_user }}
+      pg_database: ${{ needs.opentofu.outputs.pg_database }}
+      pg_sslmode: require
+      test_view_mode: managed
+    secrets:
+      pg_password: ${{ secrets.AZURE_DB_PASSWORD }}
diff --git a/.github/workflows/deploy-aws-rds.yml
b/.github/workflows/deploy-aws-rds.yml
new file mode 100644
index 0000000..7ea6824
--- /dev/null
+++ b/.github/workflows/deploy-aws-rds.yml
@@ -0,0 +1,277 @@
+# OpenTofu plan/apply/destroy for the AWS RDS test stacks. Started either by
+# manual dispatch or by a `/deploy-aws-rds [target|command] [command]` comment
+# on a pull request; a successful apply is followed by database validation.
+name: Deploy AWS RDS PostgreSQL
+
+on:
+  workflow_dispatch:
+    inputs:
+      target:
+        description: "Deploy target under testing/aws/deploy"
+        required: true
+        type: choice
+        options:
+          - all
+          - pg15
+          - pg16
+          - pg17
+          - pg18
+        default: all
+      command:
+        description: "OpenTofu command"
+        required: true
+        type: choice
+        options:
+          - plan
+          - apply
+          - destroy
+        default: plan
+  issue_comment:
+    types: [created]
+
+permissions:
+  contents: read
+  pull-requests: read
+
+jobs:
+  # Gate: only the user named in vars.DEPLOY_TRIGGER_USER may start a deploy,
+  # and a comment trigger additionally requires a repository association.
+  resolve:
+    if: |
+      (
+        github.event_name == 'workflow_dispatch' &&
+        github.actor == vars.DEPLOY_TRIGGER_USER
+      ) ||
+      (
+        github.event_name == 'issue_comment' &&
+        github.event.issue.pull_request != null &&
+        startsWith(github.event.comment.body, '/deploy-aws-rds') &&
+        github.event.comment.user.login == vars.DEPLOY_TRIGGER_USER &&
+        (
+          github.event.comment.author_association == 'OWNER' ||
+          github.event.comment.author_association == 'MEMBER' ||
+          github.event.comment.author_association == 'COLLABORATOR'
+        )
+      )
+    name: Resolve deploy arguments
+    runs-on: ubuntu-latest
+
+    outputs:
+      command: ${{ steps.args.outputs.command }}
+      versions_json: ${{ steps.args.outputs.versions_json }}
+      checkout_ref: ${{ steps.pr.outputs.head_sha || github.sha }}
+
+    steps:
+      - name: Validate trigger user is configured
+        run: |
+          if [ -z "${{ vars.DEPLOY_TRIGGER_USER }}" ]; then
+            echo "::error::Repository variable DEPLOY_TRIGGER_USER is not set."
+            exit 1
+          fi
+
+      - name: Resolve deploy arguments
+        id: args
+        env:
+          # User-supplied text reaches the script only as environment
+          # variables, so it is never parsed as shell code.
+          COMMENT_BODY: ${{ github.event.comment.body }}
+          INPUT_TARGET: ${{ inputs.target }}
+          INPUT_COMMAND: ${{ inputs.command }}
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            target="$INPUT_TARGET"
+            command="$INPUT_COMMAND"
+          else
+            comment="$COMMENT_BODY"
+            read -r trigger arg1 arg2 _ <<<"${comment}"
+
+            if [ "${trigger}" != "/deploy-aws-rds" ]; then
+              echo "::error::Comment must start with '/deploy-aws-rds'"
+              exit 1
+            fi
+
+            if [ -z "${arg1}" ]; then
+              target="all"
+              command="plan"
+            elif [ "${arg1}" = "plan" ] || [ "${arg1}" = "apply" ] || [ "${arg1}" = "destroy" ]; then
+              target="all"
+              command="${arg1}"
+            else
+              target="${arg1}"
+              if [ -z "${arg2}" ]; then
+                command="plan"
+              else
+                command="${arg2}"
+              fi
+            fi
+          fi
+
+          case "${target}" in
+            all|pg15|pg16|pg17|pg18) ;;
+            *)
+              echo "::error::Invalid target: ${target}"
+              exit 1
+              ;;
+          esac
+
+          case "${command}" in
+            plan|apply|destroy) ;;
+            *)
+              echo "::error::Invalid command: ${command}"
+              exit 1
+              ;;
+          esac
+
+          if [ "${target}" = "all" ]; then
+            versions_json='["pg15","pg16","pg17","pg18"]'
+          else
+            versions_json="[\"${target}\"]"
+          fi
+
+          echo "target=${target}" >> "$GITHUB_OUTPUT"
+          echo "command=${command}" >> "$GITHUB_OUTPUT"
+          echo "versions_json=${versions_json}" >> "$GITHUB_OUTPUT"
+
+      - name: Resolve pull request head
+        id: pr
+        if: github.event_name == 'issue_comment'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const pull_number = context.payload.issue.number;
+            const { data: pr } = await github.rest.pulls.get({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              pull_number,
+            });
+
+            // A deleted head repository, or one that differs from the base
+            // repository, is treated as a fork. `repo.fork` alone is not
+            // enough: it is also true for same-repo PRs when this repository
+            // is itself a fork.
+            const headRepo = pr.head.repo;
+            const isFork = !headRepo || headRepo.full_name !== pr.base.repo.full_name;
+
+            core.setOutput('head_sha', pr.head.sha);
+            core.setOutput('is_fork', String(isFork));
+
+      - name: Block fork pull requests
+        if: github.event_name == 'issue_comment' && steps.pr.outputs.is_fork == 'true'
+        run: |
+          echo "::error::Comment-triggered deploy is blocked for fork PRs."
+          exit 1
+
+      - name: Show resolved inputs
+        run: |
+          echo "Target: ${{ steps.args.outputs.target }}"
+          echo "Versions: ${{ steps.args.outputs.versions_json }}"
+          echo "Command: ${{ steps.args.outputs.command }}"
+
+  # One matrix leg per selected PostgreSQL version.
+  deploy:
+    name: ${{ needs.resolve.outputs.command }} ${{ matrix.postgres_version }}
+    runs-on: ubuntu-latest
+    needs: resolve
+    strategy:
+      fail-fast: false
+      matrix:
+        postgres_version: ${{ fromJSON(needs.resolve.outputs.versions_json) }}
+
+    env:
+      AWS_REGION: us-west-2
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ needs.resolve.outputs.checkout_ref }}
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ env.AWS_REGION }}
+
+      - name: Setup OpenTofu
+        uses: opentofu/setup-opentofu@v1
+        with:
+          tofu_version: latest
+
+      - name: Resolve OpenTofu variables
+        env:
+          AWS_ALLOWED_CIDR_BLOCK: ${{ secrets.AWS_ALLOWED_CIDR_BLOCK }}
+          AWS_DB_PASSWORD: ${{ secrets.AWS_DB_PASSWORD }}
+        run: |
+          if [ -z "$AWS_ALLOWED_CIDR_BLOCK" ]; then
+            echo "::error::Missing secret AWS_ALLOWED_CIDR_BLOCK"
+            exit 1
+          fi
+
+          if [ -z "$AWS_DB_PASSWORD" ]; then
+            echo "::error::Missing secret AWS_DB_PASSWORD"
+            exit 1
+          fi
+
+          echo "::add-mask::$AWS_DB_PASSWORD"
+          echo "TF_VAR_allowed_cidr_block=$AWS_ALLOWED_CIDR_BLOCK" >> "$GITHUB_ENV"
+          echo "TF_VAR_db_password=$AWS_DB_PASSWORD" >> "$GITHUB_ENV"
+
+      - name: OpenTofu init
+        working-directory: testing/aws/deploy/${{ matrix.postgres_version }}
+        run: tofu init -input=false
+
+      - name: OpenTofu validate
+        working-directory: testing/aws/deploy/${{ matrix.postgres_version }}
+        run: tofu validate
+
+      - name: OpenTofu plan
+        if: needs.resolve.outputs.command == 'plan' || needs.resolve.outputs.command == 'apply'
+        working-directory: testing/aws/deploy/${{ matrix.postgres_version }}
+        run: tofu plan -input=false
+
+      - name: OpenTofu apply
+        if: needs.resolve.outputs.command == 'apply'
+        working-directory: testing/aws/deploy/${{ matrix.postgres_version }}
+        run: tofu apply -auto-approve -input=false
+
+      - name: OpenTofu destroy
+        if: needs.resolve.outputs.command == 'destroy'
+        working-directory: testing/aws/deploy/${{ matrix.postgres_version }}
+        run: tofu destroy -auto-approve -input=false
+
+      - name: Show endpoint output
+        if: needs.resolve.outputs.command == 'apply'
+        working-directory: testing/aws/deploy/${{ matrix.postgres_version }}
+        run: |
+          endpoint="$(tofu output -raw endpoint)"
+          {
+            echo "### Deployment Output"
+            echo
+            echo "- Version: ${{ matrix.postgres_version }}"
+            echo "- Endpoint: ${endpoint}"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+  # After an apply, runs the reusable validation workflow per version.
+  validate:
+    if: needs.resolve.outputs.command == 'apply'
+    name: validate ${{ matrix.postgres_version }}
+    needs: [resolve, deploy]
+    strategy:
+      fail-fast: false
+      matrix:
+        postgres_version: ${{ fromJSON(needs.resolve.outputs.versions_json) }}
+    uses: ./.github/workflows/managed-db-validate.yml
+    with:
+      cloud_provider: aws
+      aws_region: us-west-2
+      aws_db_identifier: ${{ matrix.postgres_version }}
+      pg_port: "5432"
+      pg_user: randoneering
+      pg_database: pgFirstAid
+      pg_sslmode: require
+      test_view_mode: managed
+    secrets:
+      pg_password: ${{ secrets.AWS_DB_PASSWORD }}
+      aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+      aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
diff --git a/.github/workflows/deploy-gcp-postgres.yml b/.github/workflows/deploy-gcp-postgres.yml
new file mode 100644
index 0000000..ee72675
--- /dev/null
+++ b/.github/workflows/deploy-gcp-postgres.yml
@@ -0,0 +1,285 @@
+name: Deploy GCP Cloud SQL PostgreSQL
+
+on:
+  workflow_dispatch:
+    inputs:
+      target:
+        description: "Deploy target under testing/gcp/deploy"
+        required: true
+        type: choice
+        options:
+          - all
+          - pg15
+          - pg16
+          - pg17
+          - pg18
+        default: all
+      command:
+        description: "OpenTofu command"
+        required: true
+        type: choice
+        options:
+          - plan
+          - apply
+          -
destroy
+        default: plan
+      personal_ip_cidr:
+        description: "Your CIDR for Cloud SQL access (optional). Leave blank to use secret DEPLOY_PERSONAL_IP_CIDR"
+        required: false
+        type: string
+        default: ""
+  issue_comment:
+    types: [created]
+
+permissions:
+  contents: read
+  pull-requests: read
+
+jobs:
+  # Gate: only the user named in vars.DEPLOY_TRIGGER_USER may start a deploy,
+  # and a comment trigger additionally requires a repository association.
+  resolve:
+    if: |
+      (
+        github.event_name == 'workflow_dispatch' &&
+        github.actor == vars.DEPLOY_TRIGGER_USER
+      ) ||
+      (
+        github.event_name == 'issue_comment' &&
+        github.event.issue.pull_request != null &&
+        startsWith(github.event.comment.body, '/deploy-gcp-pg') &&
+        github.event.comment.user.login == vars.DEPLOY_TRIGGER_USER &&
+        (
+          github.event.comment.author_association == 'OWNER' ||
+          github.event.comment.author_association == 'MEMBER' ||
+          github.event.comment.author_association == 'COLLABORATOR'
+        )
+      )
+    name: Resolve deploy arguments
+    runs-on: ubuntu-latest
+
+    outputs:
+      command: ${{ steps.args.outputs.command }}
+      versions_json: ${{ steps.args.outputs.versions_json }}
+      runner_ip_cidr: ${{ steps.args.outputs.runner_ip_cidr }}
+      # NOTE(review): job outputs whose value contains a secret are redacted
+      # and not passed to dependent jobs, so this is likely empty downstream
+      # when the CIDR comes from the DEPLOY_PERSONAL_IP_CIDR secret. Confirm,
+      # and if so resolve the secret in the consuming job instead.
+      personal_ip_cidr: ${{ steps.args.outputs.personal_ip_cidr }}
+      checkout_ref: ${{ steps.pr.outputs.head_sha || github.sha }}
+
+    steps:
+      - name: Validate trigger user is configured
+        run: |
+          if [ -z "${{ vars.DEPLOY_TRIGGER_USER }}" ]; then
+            echo "::error::Repository variable DEPLOY_TRIGGER_USER is not set."
+            exit 1
+          fi
+
+      - name: Resolve deploy arguments
+        id: args
+        env:
+          DEPLOY_PERSONAL_IP_CIDR: ${{ secrets.DEPLOY_PERSONAL_IP_CIDR }}
+          # User-supplied text reaches the script only as environment
+          # variables, so it is never parsed as shell code.
+          COMMENT_BODY: ${{ github.event.comment.body }}
+          INPUT_TARGET: ${{ inputs.target }}
+          INPUT_COMMAND: ${{ inputs.command }}
+          INPUT_PERSONAL_IP_CIDR: ${{ inputs.personal_ip_cidr }}
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            target="$INPUT_TARGET"
+            command="$INPUT_COMMAND"
+            personal_ip_input="$INPUT_PERSONAL_IP_CIDR"
+          else
+            comment="$COMMENT_BODY"
+            read -r trigger arg1 arg2 _ <<<"${comment}"
+
+            if [ "${trigger}" != "/deploy-gcp-pg" ]; then
+              echo "::error::Comment must start with '/deploy-gcp-pg'"
+              exit 1
+            fi
+
+            if [ -z "${arg1}" ]; then
+              target="all"
+              command="plan"
+            elif [ "${arg1}" = "plan" ] || [ "${arg1}" = "apply" ] || [ "${arg1}" = "destroy" ]; then
+              target="all"
+              command="${arg1}"
+            else
+              target="${arg1}"
+              if [ -z "${arg2}" ]; then
+                command="plan"
+              else
+                command="${arg2}"
+              fi
+            fi
+
+            personal_ip_input=""
+          fi
+
+          case "${target}" in
+            all|pg15|pg16|pg17|pg18) ;;
+            *)
+              echo "::error::Invalid target: ${target}"
+              exit 1
+              ;;
+          esac
+
+          case "${command}" in
+            plan|apply|destroy) ;;
+            *)
+              echo "::error::Invalid command: ${command}"
+              exit 1
+              ;;
+          esac
+
+          if [ "${target}" = "all" ]; then
+            versions_json='["pg15","pg16","pg17","pg18"]'
+          else
+            versions_json="[\"${target}\"]"
+          fi
+
+          # Public address of this runner, expressed as a single-host CIDR.
+          runner_ip="$(curl -fsSL https://checkip.amazonaws.com | tr -d '\n')"
+          runner_ip_cidr="${runner_ip}/32"
+
+          # The workflow input wins over the repository secret.
+          if [ -n "${personal_ip_input}" ]; then
+            personal_ip_cidr="${personal_ip_input}"
+          elif [ -n "$DEPLOY_PERSONAL_IP_CIDR" ]; then
+            personal_ip_cidr="$DEPLOY_PERSONAL_IP_CIDR"
+          else
+            echo "::error::Set input personal_ip_cidr or repository secret DEPLOY_PERSONAL_IP_CIDR"
+            exit 1
+          fi
+
+          echo "target=${target}" >> "$GITHUB_OUTPUT"
+          echo "command=${command}" >> "$GITHUB_OUTPUT"
+          echo "versions_json=${versions_json}" >> "$GITHUB_OUTPUT"
+          echo "runner_ip_cidr=${runner_ip_cidr}" >> "$GITHUB_OUTPUT"
+          echo "personal_ip_cidr=${personal_ip_cidr}" >> "$GITHUB_OUTPUT"
+
+      - name: Resolve pull request head
+        id: pr
+        if: github.event_name == 'issue_comment'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const pull_number = context.payload.issue.number;
+            const { data: pr } = await github.rest.pulls.get({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              pull_number,
+            });
+
+            // A deleted head repository, or one that differs from the base
+            // repository, is treated as a fork. `repo.fork` alone is not
+            // enough: it is also true for same-repo PRs when this repository
+            // is itself a fork.
+            const headRepo = pr.head.repo;
+            const isFork = !headRepo || headRepo.full_name !== pr.base.repo.full_name;
+
+            core.setOutput('head_sha', pr.head.sha);
+            core.setOutput('is_fork', String(isFork));
+
+      - name: Block fork pull requests
+        if: github.event_name == 'issue_comment' && steps.pr.outputs.is_fork == 'true'
+        run: |
+          echo "::error::Comment-triggered deploy is blocked for fork PRs."
+          exit 1
+
+      - name: Show resolved inputs
+        run: |
+          echo "Target: ${{ steps.args.outputs.target }}"
+          echo "Versions: ${{ steps.args.outputs.versions_json }}"
+          echo "Command: ${{ steps.args.outputs.command }}"
+          echo "Runner IP CIDR: ${{ steps.args.outputs.runner_ip_cidr }}"
+
+  # One matrix leg per selected PostgreSQL version.
+  deploy:
+    name: ${{ needs.resolve.outputs.command }} ${{ matrix.postgres_version }}
+    runs-on: ubuntu-latest
+    needs: resolve
+    strategy:
+      fail-fast: false
+      matrix:
+        postgres_version: ${{ fromJSON(needs.resolve.outputs.versions_json) }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ needs.resolve.outputs.checkout_ref }}
+
+      - name: Authenticate to Google Cloud
+        uses: google-github-actions/auth@v2
+        with:
+          credentials_json: ${{ secrets.GCP_SA_KEY }}
+          create_credentials_file: true
+          export_environment_variables: true
+
+      - name: Setup OpenTofu
+        uses: opentofu/setup-opentofu@v1
+        with:
+          tofu_version: latest
+
+      - name: Resolve OpenTofu variables
+        env:
+          GCP_DB_PASSWORD: ${{ secrets.GCP_DB_PASSWORD }}
+        run: |
+          if [ -z "$GCP_DB_PASSWORD" ]; then
+            echo "::error::Missing secret GCP_DB_PASSWORD"
+            exit 1
+          fi
+
+          echo "::add-mask::$GCP_DB_PASSWORD"
+          echo "TF_VAR_db_password=$GCP_DB_PASSWORD" >> "$GITHUB_ENV"
+
+      - name: OpenTofu init
+        working-directory: testing/gcp/deploy/${{ matrix.postgres_version }}
+        run: tofu init -input=false
+
+      - name: Write CI tfvars
+        working-directory:
testing/gcp/deploy/${{ matrix.postgres_version }} + run: | + cat > ci.auto.tfvars <> "$GITHUB_STEP_SUMMARY" + + validate: + if: needs.resolve.outputs.command == 'apply' + name: validate ${{ matrix.postgres_version }} + needs: [resolve, deploy] + strategy: + fail-fast: false + matrix: + postgres_version: ${{ fromJSON(needs.resolve.outputs.versions_json) }} + uses: ./.github/workflows/managed-db-validate.yml + with: + cloud_provider: gcp + gcp_project_id: pgfirstaid + gcp_instance_name: ${{ format('pgfirstaid-{0}', matrix.postgres_version) }} + pg_port: "5432" + pg_user: randoneering + pg_database: pgFirstAid + pg_sslmode: require + test_view_mode: managed + secrets: + pg_password: ${{ secrets.GCP_DB_PASSWORD }} + gcp_sa_key: ${{ secrets.GCP_SA_KEY }} diff --git a/.github/workflows/integration-pg-matrix.yml b/.github/workflows/integration-pg-matrix.yml index 8adc7eb..23c7e2d 100644 --- a/.github/workflows/integration-pg-matrix.yml +++ b/.github/workflows/integration-pg-matrix.yml @@ -2,6 +2,7 @@ name: Python + pgTAP Integration (PG15-PG18) on: pull_request: + types: [opened, synchronize] paths: - pgFirstAid.sql - view_pgFirstAid.sql @@ -9,7 +10,6 @@ on: - testing/integration/** - testing/pgTAP/** - .github/workflows/integration-pg-matrix.yml - workflow_dispatch: concurrency: group: integration-${{ github.ref }} diff --git a/.github/workflows/managed-db-validate.yml b/.github/workflows/managed-db-validate.yml new file mode 100644 index 0000000..eb85e73 --- /dev/null +++ b/.github/workflows/managed-db-validate.yml @@ -0,0 +1,171 @@ +name: Managed DB Validate + +on: + workflow_call: + inputs: + cloud_provider: + required: false + type: string + default: direct + pg_host: + required: false + type: string + default: "" + pg_port: + required: false + type: string + default: "5432" + pg_user: + required: true + type: string + pg_database: + required: true + type: string + pg_sslmode: + required: false + type: string + default: require + test_view_mode: + required: false + 
type: string
+        default: managed
+      aws_region:
+        required: false
+        type: string
+        default: us-west-2
+      aws_db_identifier:
+        required: false
+        type: string
+        default: ""
+      gcp_project_id:
+        required: false
+        type: string
+        default: ""
+      gcp_instance_name:
+        required: false
+        type: string
+        default: ""
+    secrets:
+      pg_password:
+        required: true
+      aws_access_key_id:
+        required: false
+      aws_secret_access_key:
+        required: false
+      gcp_sa_key:
+        required: false
+
+jobs:
+  # Installs pgFirstAid into the target database and runs the integration
+  # suite against it, using the standard libpq PG* environment variables.
+  validate:
+    runs-on: [self-hosted, linux, pgfirstaid-ci]
+    permissions:
+      contents: read
+    defaults:
+      run:
+        working-directory: testing/integration
+    env:
+      PGHOST: ${{ inputs.pg_host }}
+      PGPORT: ${{ inputs.pg_port }}
+      PGUSER: ${{ inputs.pg_user }}
+      PGPASSWORD: ${{ secrets.pg_password }}
+      PGDATABASE: ${{ inputs.pg_database }}
+      PGSSLMODE: ${{ inputs.pg_sslmode }}
+      PGFA_TEST_VIEW_MODE: ${{ inputs.test_view_mode }}
+      PGFA_TEST_ACTIVE_CONN_TARGET: "52"
+      PGFA_TEST_ACTIVE_CONN_SLEEP_SECONDS: "20"
+      PGFA_TEST_WAIT_TIMEOUT_SECONDS: "45"
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Configure AWS credentials
+        if: ${{ inputs.cloud_provider == 'aws' }}
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.aws_access_key_id }}
+          aws-secret-access-key: ${{ secrets.aws_secret_access_key }}
+          aws-region: ${{ inputs.aws_region }}
+
+      - name: Authenticate to Google Cloud
+        if: ${{ inputs.cloud_provider == 'gcp' }}
+        uses: google-github-actions/auth@v2
+        with:
+          credentials_json: ${{ secrets.gcp_sa_key }}
+
+      - name: Setup gcloud
+        if: ${{ inputs.cloud_provider == 'gcp' }}
+        uses: google-github-actions/setup-gcloud@v2
+
+      # An explicit pg_host wins; otherwise the host is looked up from the
+      # cloud provider and exported as PGHOST for the remaining steps.
+      - name: Resolve PostgreSQL host
+        shell: bash
+        env:
+          # Inputs are handed over as environment variables so their text is
+          # never spliced into the script source.
+          INPUT_CLOUD_PROVIDER: ${{ inputs.cloud_provider }}
+          INPUT_AWS_DB_IDENTIFIER: ${{ inputs.aws_db_identifier }}
+          INPUT_GCP_PROJECT_ID: ${{ inputs.gcp_project_id }}
+          INPUT_GCP_INSTANCE_NAME: ${{ inputs.gcp_instance_name }}
+        run: |
+          host="${PGHOST}"
+
+          if [ -z "$host" ] && [ "$INPUT_CLOUD_PROVIDER" = "aws" ]; then
+            if [ -z "$INPUT_AWS_DB_IDENTIFIER" ]; then
+              echo "::error::aws_db_identifier is required when cloud_provider=aws and pg_host is not set"
+              exit 1
+            fi
+
+            host="$(aws rds describe-db-instances --db-instance-identifier "$INPUT_AWS_DB_IDENTIFIER" --query 'DBInstances[0].Endpoint.Address' --output text)"
+
+            # With --output text a missing endpoint is printed as the literal
+            # string "None"; treat that as "not resolved".
+            if [ "$host" = "None" ]; then
+              host=""
+            fi
+          fi
+
+          if [ -z "$host" ] && [ "$INPUT_CLOUD_PROVIDER" = "gcp" ]; then
+            if [ -z "$INPUT_GCP_PROJECT_ID" ] || [ -z "$INPUT_GCP_INSTANCE_NAME" ]; then
+              echo "::error::gcp_project_id and gcp_instance_name are required when cloud_provider=gcp and pg_host is not set"
+              exit 1
+            fi
+
+            host="$(gcloud sql instances describe "$INPUT_GCP_INSTANCE_NAME" --project "$INPUT_GCP_PROJECT_ID" --format='value(ipAddresses[0].ipAddress)')"
+          fi
+
+          if [ -z "$host" ]; then
+            echo "::error::Could not determine PGHOST. Provide pg_host or cloud discovery inputs."
+            exit 1
+          fi
+
+          echo "PGHOST=$host" >> "$GITHUB_ENV"
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.14"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+
+      - name: Validate required PG env vars
+        run: |
+          missing=0
+          for var in PGHOST PGPORT PGUSER PGPASSWORD PGDATABASE; do
+            # ${!var} is bash indirect expansion: the value of the variable
+            # whose name is stored in var.
+            if [ -z "${!var}" ]; then
+              echo "::error::Missing required secret/env: ${var}"
+              missing=1
+            fi
+          done
+          if [ "$missing" -ne 0 ]; then
+            exit 1
+          fi
+
+      - name: Verify PostgreSQL client is installed
+        run: |
+          if ! command -v psql >/dev/null 2>&1; then
+            echo "::error::psql not found on runner. Install postgresql-client on the self-hosted VM."
+ exit 1 + fi + psql --version + + - name: Sync dependencies + run: uv sync + + - name: Install pgFirstAid function + run: psql -v ON_ERROR_STOP=1 -f ../../pgFirstAid.sql + + - name: Recreate managed view only + run: | + psql -v ON_ERROR_STOP=1 -c "DROP VIEW IF EXISTS v_pgfirstaid" + psql -v ON_ERROR_STOP=1 -f ../../view_pgFirstAid_managed.sql + + - name: Run integration tests (includes pgTAP suite) + run: uv run python -m pytest tests/integration -m integration diff --git a/pgFirstAid.sql b/pgFirstAid.sql index 05795a9..833a72e 100644 --- a/pgFirstAid.sql +++ b/pgFirstAid.sql @@ -1,3 +1,272 @@ +create or replace +function pgfirstaid_pg_stat_statements_checks() +returns table ( + severity TEXT, + category TEXT, + check_name TEXT, + object_name TEXT, + issue_description TEXT, + current_value TEXT, + recommended_action TEXT, + documentation_link TEXT, + severity_order INTEGER +) as $$ +begin + if not exists ( + select + 1 + from + pg_extension + where + extname = 'pg_stat_statements') then + return; + end if; + + return query +with pss as ( + select + queryid, + query, + calls, + total_exec_time, + mean_exec_time, + rows + from + pg_stat_statements + where + calls > 0 + order by + total_exec_time desc + limit 10) + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Queries by Total Execution Time' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Queries with the highest total execution time are usually the best optimization targets for overall workload improvement' as issue_description, + 'calls: ' || pss.calls || ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || ', rows: ' || pss.rows || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Run EXPLAIN (ANALYZE, BUFFERS) and focus on reducing total runtime for these fingerprints first' as recommended_action, + 
'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pss; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Mean Execution Time Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Queries with high average runtime and enough call volume are underperforming and likely user-visible' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Add or improve indexes and rewrite query predicates to reduce per-execution latency' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and pss.mean_exec_time > 100 + order by + pss.mean_exec_time desc + limit 10; + + return query +with pss as ( + select + queryid, + query, + calls, + temp_blks_read, + temp_blks_written, + total_exec_time + from + pg_stat_statements + where + (temp_blks_read + temp_blks_written) > 0 + order by + (temp_blks_read + temp_blks_written) desc + limit 10) + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Queries by Temp Block Spills' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Frequent temp block usage points to sort or hash operations spilling to disk and causing avoidable latency' as issue_description, + 'calls: ' || pss.calls || ', temp_blks_read: ' || pss.temp_blks_read || + ', 
temp_blks_written: ' || pss.temp_blks_written || ', total_exec_time_ms: ' || + round(pss.total_exec_time::numeric, 2) || ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Reduce row width, improve index support for sort or group patterns, and tune work_mem cautiously' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/runtime-config-resource.html#GUC-WORK-MEM \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pss; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Low Cache Hit Ratio Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Low buffer cache hit ratio indicates heavy physical reads and likely missing indexes or poor filtering' as issue_description, + 'calls: ' || pss.calls || ', cache_hit_pct: ' || round( + 100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0), + 2 + ) || ', shared_blks_read: ' || pss.shared_blks_read || ', shared_blks_hit: ' || pss.shared_blks_hit || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Prioritize index tuning and query filtering to reduce disk reads for these statements' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and (pss.shared_blks_hit + pss.shared_blks_read) > 0 + and (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) < 90 + order by + (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) asc + limit 10; + + return query + 
select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Runtime Variance Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High runtime variance can indicate plan instability, skewed data distribution, or parameter sensitivity' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', stddev_exec_time_ms: ' || round(pss.stddev_exec_time::numeric, 2) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || ', query: ' || + left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Check plan stability with EXPLAIN (ANALYZE, BUFFERS), update statistics, and review parameterized execution paths' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/routine-vacuuming.html \ + https://www.postgresql.org/docs/current/using-explain.html' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and pss.stddev_exec_time > pss.mean_exec_time + order by + pss.stddev_exec_time desc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Calls Low Value Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Very high call volume with low per-call value can create avoidable overhead and crowd out expensive work' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 3) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Batch repeated requests, cache stable lookups, and reduce N+1 query patterns in the application layer' as recommended_action, + 
'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 5000 + and pss.mean_exec_time <= 2 + and (pss.rows::numeric / NULLIF(pss.calls, 0)) <= 2 + order by + pss.calls desc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Rows Per Call Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High rows returned per execution often indicates over-fetching or missing selective filters' as issue_description, + 'calls: ' || pss.calls || ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || + ', total_rows: ' || pss.rows || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Add tighter predicates, pagination, and narrower SELECT lists to reduce unnecessary row transfer' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/queries-limit.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and (pss.rows::numeric / NULLIF(pss.calls, 0)) > 10000 + order by + (pss.rows::numeric / NULLIF(pss.calls, 0)) desc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Shared Block Reads Per Call Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High shared block reads per call usually points to heavy table or index scans and poor locality' as issue_description, + 'calls: ' || pss.calls || ', shared_blks_read_per_call: ' || round((pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)), 2) || + ', 
shared_blks_read: ' || pss.shared_blks_read || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Use EXPLAIN (ANALYZE, BUFFERS) to add selective indexes and reduce pages read per execution' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) > 1000 + order by + (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) desc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top Queries by WAL Bytes Per Call' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High WAL generation per execution can indicate heavy write amplification and expensive update patterns' as issue_description, + 'calls: ' || pss.calls || ', wal_bytes_per_call: ' || round( + ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)), + 2 + ) || ', wal_bytes_total: ' || round((to_jsonb(pss)->>'wal_bytes')::numeric, 2) || + ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Reduce row churn, batch writes where possible, and review index maintenance cost for heavy write queries' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/wal-intro.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and coalesce((to_jsonb(pss)->>'wal_bytes')::numeric, 0) > 0 + and 
((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) > 1048576 + order by + ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) desc + limit 10; +end; +$$ language plpgsql; + create or replace function pg_firstAid() returns table ( @@ -674,8 +943,159 @@ where and now() - query_start > interval '5 minutes' order by (now() - query_start) desc; +-- MEDIUM: pg_stat_statements extension missing +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'pg_stat_statements Extension Missing' as check_name, + 'pg_stat_statements' as object_name, + 'pg_stat_statements is not installed, so query fingerprint and workload-level performance checks are unavailable' as issue_description, + 'Extension not found in pg_extension' as current_value, + 'Self-hosted: add pg_stat_statements to shared_preload_libraries, restart PostgreSQL, then run CREATE EXTENSION pg_stat_statements; AWS RDS: add pg_stat_statements to the parameter group shared_preload_libraries, reboot, then CREATE EXTENSION; GCP Cloud SQL: enable cloudsql.enable_pg_stat_statements, restart if required, then CREATE EXTENSION; Azure Database for PostgreSQL: add pg_stat_statements to shared_preload_libraries, restart, then CREATE EXTENSION' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Appendix.PostgreSQL.CommonDBATasks.Extensions.html \ + https://cloud.google.com/sql/docs/postgres/flags \ + https://learn.microsoft.com/azure/postgresql/flexible-server/concepts-server-parameters' as documentation_link, + 3 as severity_order +where + not exists ( + select + 1 + from + pg_extension + where + extname = 'pg_stat_statements' + ); +-- MEDIUM: pg_stat_statements dependent checks +insert + into + health_results +select + * +from + pgfirstaid_pg_stat_statements_checks(); +-- MEDIUM: Top 10 expensive active queries by runtime +with eq as ( +select + pgs.pid, + pgs.usename, 
+ pgs.datname, + pgs.client_addr, + now() - pgs.query_start as runtime, + pgs.query +from + pg_stat_activity pgs +where + pgs.state = 'active' + and pgs.query_start is not null + and pgs.pid <> pg_backend_pid() + and now() - pgs.query_start > interval '30 seconds' +order by + runtime desc +limit 10) +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Expensive Active Queries' as check_name, + concat_ws(' | ', + 'pid: ' || eq.pid::text, + 'usename: ' || eq.usename, + 'datname: ' || eq.datname, + 'client_address: ' || coalesce(eq.client_addr::text, 'local'), + 'runtime: ' || to_char(make_interval(secs => extract(epoch from eq.runtime)), 'HH24:MI:SS') /* rebuild the interval from epoch seconds so whole days are folded into the hour field; to_char(interval, 'HH24') alone ignores the days component and a 27h runtime would print as 03:00:00 */ + ) as object_name, + 'Top 10 active queries running longer than 30 seconds, ordered by runtime. Long-running active queries can signal lock waits, missing indexes, or inefficient plans' as issue_description, + left(regexp_replace(eq.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Review these queries with EXPLAIN (ANALYZE, BUFFERS) and reduce lock waits or full scans' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + eq; +-- MEDIUM: Lock-wait-heavy active queries +with lw as ( +select + pid, + usename, + datname, + client_addr, + wait_event, + query_start, + now() - query_start as runtime, + query +from + pg_stat_activity +where + state = 'active' + and wait_event_type = 'Lock' + and query_start is not null + and now() - query_start > interval '30 seconds' + and pid <> pg_backend_pid() +order by + runtime desc +limit 10) +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Lock-Wait-Heavy Active Queries' as check_name, + concat_ws(' | ', + 'pid: ' || lw.pid::text, + 'usename: ' || lw.usename, + 
'datname: ' || lw.datname, + 'client_address: ' || coalesce(lw.client_addr::text, 'local'), + 'wait_event: ' || coalesce(lw.wait_event, 'unknown'), + 'runtime: ' || to_char(make_interval(secs => extract(epoch from lw.runtime)), 'HH24:MI:SS') /* total hours, days included: lock waits longer than a day must not wrap back to 00:xx:xx */ + ) as object_name, + 'Active queries waiting on locks for extended time can block throughput and cause cascading latency' as issue_description, + left(regexp_replace(lw.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Reduce transaction duration, enforce consistent lock ordering, and investigate blockers first' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/explicit-locking.html' as documentation_link, + 3 as severity_order +from + lw; +-- MEDIUM: Idle in transaction over 5 minutes +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Idle In Transaction Over 5 Minutes' as check_name, + concat_ws(' | ', + 'pid: ' || psa.pid::text, + 'usename: ' || psa.usename, + 'datname: ' || psa.datname, + 'client_address: ' || coalesce(psa.client_addr::text, 'local'), + 'idle_duration: ' || to_char(make_interval(secs => extract(epoch from (now() - psa.state_change))), 'HH24:MI:SS') /* total hours, days included: sessions idle for days are the worst offenders and must not be shown with the days dropped */ + ) as object_name, + 'Sessions left idle in transaction hold snapshots and locks longer than necessary, which can hurt query performance and vacuum progress' as issue_description, + left(regexp_replace(psa.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Commit or rollback promptly and move application processing outside transaction boundaries' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/routine-vacuuming.html' as documentation_link, + 3 as severity_order +from + pg_stat_activity psa +where + psa.state = 'idle in transaction' + and psa.state_change is not null + and now() - psa.state_change > interval '5 minutes' + and psa.pid <> pg_backend_pid() +order 
by + now() - psa.state_change desc; -- LOW: Missing indexes on foreign keys - insert + insert into health_results select diff --git a/testing/aws/deploy/pg15/locals.tf b/testing/aws/deploy/pg15/locals.tf index 891a331..eb96df2 100644 --- a/testing/aws/deploy/pg15/locals.tf +++ b/testing/aws/deploy/pg15/locals.tf @@ -2,7 +2,7 @@ locals { service = "pg15" database_name = "pgFirstAid" engine = "postgres" - engine_version = "" + engine_version = "15.17" engine_family = "postgres15" db_parameter_group = [ { diff --git a/testing/aws/deploy/pg15/main.tf b/testing/aws/deploy/pg15/main.tf index 6c9a1d7..801cea2 100644 --- a/testing/aws/deploy/pg15/main.tf +++ b/testing/aws/deploy/pg15/main.tf @@ -11,8 +11,17 @@ module "nonaurora" { family = local.engine_family db_parameter_group = local.db_parameter_group allowed_cidr_block = var.allowed_cidr_block + db_password = var.db_password } output "endpoint" { value = module.nonaurora.rds_instance_address } + +output "database_name" { + value = local.database_name +} + +output "db_user" { + value = "randoneering" +} diff --git a/testing/aws/deploy/pg15/variables.tf b/testing/aws/deploy/pg15/variables.tf index 478bbb6..3b35d84 100644 --- a/testing/aws/deploy/pg15/variables.tf +++ b/testing/aws/deploy/pg15/variables.tf @@ -2,3 +2,9 @@ variable "allowed_cidr_block" { description = "CIDR block allowed to access the RDS instance (e.g., 1.2.3.4/32)" type = string } + +variable "db_password" { + description = "Master DB password" + type = string + sensitive = true +} diff --git a/testing/aws/deploy/pg16/main.tf b/testing/aws/deploy/pg16/main.tf index 6c9a1d7..801cea2 100644 --- a/testing/aws/deploy/pg16/main.tf +++ b/testing/aws/deploy/pg16/main.tf @@ -11,8 +11,17 @@ module "nonaurora" { family = local.engine_family db_parameter_group = local.db_parameter_group allowed_cidr_block = var.allowed_cidr_block + db_password = var.db_password } output "endpoint" { value = module.nonaurora.rds_instance_address } + +output "database_name" { + value 
= local.database_name +} + +output "db_user" { + value = "randoneering" +} diff --git a/testing/aws/deploy/pg16/variables.tf b/testing/aws/deploy/pg16/variables.tf index 478bbb6..3b35d84 100644 --- a/testing/aws/deploy/pg16/variables.tf +++ b/testing/aws/deploy/pg16/variables.tf @@ -2,3 +2,9 @@ variable "allowed_cidr_block" { description = "CIDR block allowed to access the RDS instance (e.g., 1.2.3.4/32)" type = string } + +variable "db_password" { + description = "Master DB password" + type = string + sensitive = true +} diff --git a/testing/aws/deploy/pg17/main.tf b/testing/aws/deploy/pg17/main.tf index 6c9a1d7..801cea2 100644 --- a/testing/aws/deploy/pg17/main.tf +++ b/testing/aws/deploy/pg17/main.tf @@ -11,8 +11,17 @@ module "nonaurora" { family = local.engine_family db_parameter_group = local.db_parameter_group allowed_cidr_block = var.allowed_cidr_block + db_password = var.db_password } output "endpoint" { value = module.nonaurora.rds_instance_address } + +output "database_name" { + value = local.database_name +} + +output "db_user" { + value = "randoneering" +} diff --git a/testing/aws/deploy/pg17/variables.tf b/testing/aws/deploy/pg17/variables.tf index 478bbb6..3b35d84 100644 --- a/testing/aws/deploy/pg17/variables.tf +++ b/testing/aws/deploy/pg17/variables.tf @@ -2,3 +2,9 @@ variable "allowed_cidr_block" { description = "CIDR block allowed to access the RDS instance (e.g., 1.2.3.4/32)" type = string } + +variable "db_password" { + description = "Master DB password" + type = string + sensitive = true +} diff --git a/testing/aws/deploy/pg18/main.tf b/testing/aws/deploy/pg18/main.tf index 6c9a1d7..801cea2 100644 --- a/testing/aws/deploy/pg18/main.tf +++ b/testing/aws/deploy/pg18/main.tf @@ -11,8 +11,17 @@ module "nonaurora" { family = local.engine_family db_parameter_group = local.db_parameter_group allowed_cidr_block = var.allowed_cidr_block + db_password = var.db_password } output "endpoint" { value = module.nonaurora.rds_instance_address } + +output 
"database_name" { + value = local.database_name +} + +output "db_user" { + value = "randoneering" +} diff --git a/testing/aws/deploy/pg18/variables.tf b/testing/aws/deploy/pg18/variables.tf index 478bbb6..3b35d84 100644 --- a/testing/aws/deploy/pg18/variables.tf +++ b/testing/aws/deploy/pg18/variables.tf @@ -2,3 +2,9 @@ variable "allowed_cidr_block" { description = "CIDR block allowed to access the RDS instance (e.g., 1.2.3.4/32)" type = string } + +variable "db_password" { + description = "Master DB password" + type = string + sensitive = true +} diff --git a/testing/aws/opentofu/modules/nonaurora/main.tf b/testing/aws/opentofu/modules/nonaurora/main.tf index 490a6d4..a25824b 100644 --- a/testing/aws/opentofu/modules/nonaurora/main.tf +++ b/testing/aws/opentofu/modules/nonaurora/main.tf @@ -12,35 +12,6 @@ data "aws_vpc" "default" { default = true } -resource "aws_security_group" "rds_sg" { - name = "${var.service}-rds-sg" - description = "Security group for ${var.service} RDS instance" - vpc_id = data.aws_vpc.default.id - - ingress { - description = "PostgreSQL access from public IP" - from_port = var.port - to_port = var.port - protocol = "tcp" - cidr_blocks = [var.allowed_cidr_block] - } - - egress { - description = "Allow all outbound traffic" - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } - - tags = merge( - var.required_tags, - { - Name = "${var.service}-rds-sg" - } - ) -} - resource "random_password" "password" { length = 20 special = false @@ -52,11 +23,11 @@ resource "aws_db_instance" "rds_instance" { engine_version = var.engine_version db_name = var.database_name username = local.username - password = random_password.password.result + password = var.db_password != "" ? 
var.db_password : random_password.password.result instance_class = local.instance_class parameter_group_name = aws_db_parameter_group.param_group.name publicly_accessible = true - vpc_security_group_ids = [aws_security_group.rds_sg.id] + vpc_security_group_ids = coalescelist(var.vpc_security_group_ids, ["sg-0333981e44680b34b"]) /* caller-supplied security groups take precedence; the pre-existing shared test SG is only the fallback when the list is empty */ allocated_storage = local.allocated_storage apply_immediately = var.apply_immediately skip_final_snapshot = true diff --git a/testing/aws/opentofu/modules/nonaurora/variables.tf b/testing/aws/opentofu/modules/nonaurora/variables.tf index f39c6e8..033844f 100644 --- a/testing/aws/opentofu/modules/nonaurora/variables.tf +++ b/testing/aws/opentofu/modules/nonaurora/variables.tf @@ -23,12 +23,6 @@ variable "vpc_security_group_ids" { default = [] } -variable "allowed_cidr_block" { - description = "CIDR block allowed to access the RDS instance" - type = string - default = "" -} - variable "instance_class" { description = "Instance type to use" type = string @@ -47,6 +41,13 @@ variable "username" { default = "randoneering" } +variable "db_password" { + description = "Master DB password. 
If empty, a random password is generated" + type = string + default = "" + sensitive = true +} + variable "skip_final_snapshot" { description = "Should a final snapshot be created on instance destroy" type = bool diff --git a/testing/azure/deploy/pg15/locals.tf b/testing/azure/deploy/pg15/locals.tf index 48da82a..92307ef 100644 --- a/testing/azure/deploy/pg15/locals.tf +++ b/testing/azure/deploy/pg15/locals.tf @@ -1,5 +1,5 @@ locals { - location = "eastus" + location = "westus" server_name = "pgfirstaid-pg15" postgres_version = "15" database_name = "pgFirstAid" diff --git a/testing/azure/deploy/pg15/main.tf b/testing/azure/deploy/pg15/main.tf index c3fdcca..3dd441b 100644 --- a/testing/azure/deploy/pg15/main.tf +++ b/testing/azure/deploy/pg15/main.tf @@ -10,6 +10,7 @@ module "postgres" { location = local.location database_name = local.database_name db_user = local.db_user + db_password = var.db_password personal_ip = var.personal_ip } diff --git a/testing/azure/deploy/pg15/variables.tf b/testing/azure/deploy/pg15/variables.tf index a7f01df..c914c51 100644 --- a/testing/azure/deploy/pg15/variables.tf +++ b/testing/azure/deploy/pg15/variables.tf @@ -2,3 +2,9 @@ variable "personal_ip" { description = "Personal IP to allow access to the server (format: x.x.x.x)" type = string } + +variable "db_password" { + description = "Database admin password" + type = string + sensitive = true +} diff --git a/testing/azure/deploy/pg16/locals.tf b/testing/azure/deploy/pg16/locals.tf index 882a558..f7d69a6 100644 --- a/testing/azure/deploy/pg16/locals.tf +++ b/testing/azure/deploy/pg16/locals.tf @@ -1,5 +1,5 @@ locals { - location = "eastus" + location = "westus" server_name = "pgfirstaid-pg16" postgres_version = "16" database_name = "pgFirstAid" diff --git a/testing/azure/deploy/pg16/main.tf b/testing/azure/deploy/pg16/main.tf index c3fdcca..3dd441b 100644 --- a/testing/azure/deploy/pg16/main.tf +++ b/testing/azure/deploy/pg16/main.tf @@ -10,6 +10,7 @@ module "postgres" { location = 
local.location database_name = local.database_name db_user = local.db_user + db_password = var.db_password personal_ip = var.personal_ip } diff --git a/testing/azure/deploy/pg16/variables.tf b/testing/azure/deploy/pg16/variables.tf index a7f01df..c914c51 100644 --- a/testing/azure/deploy/pg16/variables.tf +++ b/testing/azure/deploy/pg16/variables.tf @@ -2,3 +2,9 @@ variable "personal_ip" { description = "Personal IP to allow access to the server (format: x.x.x.x)" type = string } + +variable "db_password" { + description = "Database admin password" + type = string + sensitive = true +} diff --git a/testing/azure/deploy/pg17/locals.tf b/testing/azure/deploy/pg17/locals.tf index c6eb5bd..87421d6 100644 --- a/testing/azure/deploy/pg17/locals.tf +++ b/testing/azure/deploy/pg17/locals.tf @@ -1,5 +1,5 @@ locals { - location = "eastus" + location = "westus" server_name = "pgfirstaid-pg17" postgres_version = "17" database_name = "pgFirstAid" diff --git a/testing/azure/deploy/pg17/main.tf b/testing/azure/deploy/pg17/main.tf index c3fdcca..3dd441b 100644 --- a/testing/azure/deploy/pg17/main.tf +++ b/testing/azure/deploy/pg17/main.tf @@ -10,6 +10,7 @@ module "postgres" { location = local.location database_name = local.database_name db_user = local.db_user + db_password = var.db_password personal_ip = var.personal_ip } diff --git a/testing/azure/deploy/pg17/variables.tf b/testing/azure/deploy/pg17/variables.tf index a7f01df..c914c51 100644 --- a/testing/azure/deploy/pg17/variables.tf +++ b/testing/azure/deploy/pg17/variables.tf @@ -2,3 +2,9 @@ variable "personal_ip" { description = "Personal IP to allow access to the server (format: x.x.x.x)" type = string } + +variable "db_password" { + description = "Database admin password" + type = string + sensitive = true +} diff --git a/testing/azure/deploy/pg18/locals.tf b/testing/azure/deploy/pg18/locals.tf index e8d9561..9a110a4 100644 --- a/testing/azure/deploy/pg18/locals.tf +++ b/testing/azure/deploy/pg18/locals.tf @@ -1,5 +1,5 @@ 
locals { - location = "eastus" + location = "westus" server_name = "pgfirstaid-pg18" postgres_version = "18" database_name = "pgFirstAid" diff --git a/testing/azure/deploy/pg18/main.tf b/testing/azure/deploy/pg18/main.tf index c3fdcca..3dd441b 100644 --- a/testing/azure/deploy/pg18/main.tf +++ b/testing/azure/deploy/pg18/main.tf @@ -10,6 +10,7 @@ module "postgres" { location = local.location database_name = local.database_name db_user = local.db_user + db_password = var.db_password personal_ip = var.personal_ip } diff --git a/testing/azure/deploy/pg18/variables.tf b/testing/azure/deploy/pg18/variables.tf index a7f01df..c914c51 100644 --- a/testing/azure/deploy/pg18/variables.tf +++ b/testing/azure/deploy/pg18/variables.tf @@ -2,3 +2,9 @@ variable "personal_ip" { description = "Personal IP to allow access to the server (format: x.x.x.x)" type = string } + +variable "db_password" { + description = "Database admin password" + type = string + sensitive = true +} diff --git a/testing/azure/opentofu/modules/postgres/main.tf b/testing/azure/opentofu/modules/postgres/main.tf index b5726ba..375e36d 100644 --- a/testing/azure/opentofu/modules/postgres/main.tf +++ b/testing/azure/opentofu/modules/postgres/main.tf @@ -15,7 +15,7 @@ resource "azurerm_postgresql_flexible_server" "postgres" { version = var.postgres_version administrator_login = var.db_user - administrator_password = random_password.password.result + administrator_password = var.db_password != "" ? 
var.db_password : random_password.password.result sku_name = "B_Standard_B1ms" storage_mb = 32768 @@ -23,8 +23,6 @@ resource "azurerm_postgresql_flexible_server" "postgres" { backup_retention_days = 7 geo_redundant_backup_enabled = false - zone = "1" - lifecycle { ignore_changes = [ zone, diff --git a/testing/azure/opentofu/modules/postgres/variables.tf b/testing/azure/opentofu/modules/postgres/variables.tf index a171962..3ae2171 100644 --- a/testing/azure/opentofu/modules/postgres/variables.tf +++ b/testing/azure/opentofu/modules/postgres/variables.tf @@ -31,6 +31,13 @@ variable "db_user" { default = "randoneering" } +variable "db_password" { + description = "Database admin password. If empty, a random password is generated" + type = string + default = "" + sensitive = true +} + variable "personal_ip" { description = "Personal IP to allow connections from (format: x.x.x.x)" type = string diff --git a/testing/gcp/deploy/pg15/main.tf b/testing/gcp/deploy/pg15/main.tf index 59bf02e..e765a22 100644 --- a/testing/gcp/deploy/pg15/main.tf +++ b/testing/gcp/deploy/pg15/main.tf @@ -11,7 +11,8 @@ module "postgres" { region = local.region database_name = local.database_name db_user = local.db_user - personal_ip = var.personal_ip + authorized_networks = var.authorized_networks + db_password = var.db_password } output "instance_name" { diff --git a/testing/gcp/deploy/pg15/vars.tf b/testing/gcp/deploy/pg15/vars.tf index 0ea6a42..1c4a1c4 100644 --- a/testing/gcp/deploy/pg15/vars.tf +++ b/testing/gcp/deploy/pg15/vars.tf @@ -1,4 +1,13 @@ -variable "personal_ip" { - description ="Personal IP to allow access to the instance" - type = string +variable "authorized_networks" { + description = "Authorized networks for Cloud SQL" + type = list(object({ + name = string + value = string + })) +} + +variable "db_password" { + description = "Database user password" + type = string + sensitive = true } diff --git a/testing/gcp/deploy/pg16/main.tf b/testing/gcp/deploy/pg16/main.tf index 
59bf02e..e765a22 100644 --- a/testing/gcp/deploy/pg16/main.tf +++ b/testing/gcp/deploy/pg16/main.tf @@ -11,7 +11,8 @@ module "postgres" { region = local.region database_name = local.database_name db_user = local.db_user - personal_ip = var.personal_ip + authorized_networks = var.authorized_networks + db_password = var.db_password } output "instance_name" { diff --git a/testing/gcp/deploy/pg16/vars.tf b/testing/gcp/deploy/pg16/vars.tf index 0ea6a42..1c4a1c4 100644 --- a/testing/gcp/deploy/pg16/vars.tf +++ b/testing/gcp/deploy/pg16/vars.tf @@ -1,4 +1,13 @@ -variable "personal_ip" { - description ="Personal IP to allow access to the instance" - type = string +variable "authorized_networks" { + description = "Authorized networks for Cloud SQL" + type = list(object({ + name = string + value = string + })) +} + +variable "db_password" { + description = "Database user password" + type = string + sensitive = true } diff --git a/testing/gcp/deploy/pg17/main.tf b/testing/gcp/deploy/pg17/main.tf index 59bf02e..e765a22 100644 --- a/testing/gcp/deploy/pg17/main.tf +++ b/testing/gcp/deploy/pg17/main.tf @@ -11,7 +11,8 @@ module "postgres" { region = local.region database_name = local.database_name db_user = local.db_user - personal_ip = var.personal_ip + authorized_networks = var.authorized_networks + db_password = var.db_password } output "instance_name" { diff --git a/testing/gcp/deploy/pg17/vars.tf b/testing/gcp/deploy/pg17/vars.tf index 0ea6a42..1c4a1c4 100644 --- a/testing/gcp/deploy/pg17/vars.tf +++ b/testing/gcp/deploy/pg17/vars.tf @@ -1,4 +1,13 @@ -variable "personal_ip" { - description ="Personal IP to allow access to the instance" - type = string +variable "authorized_networks" { + description = "Authorized networks for Cloud SQL" + type = list(object({ + name = string + value = string + })) +} + +variable "db_password" { + description = "Database user password" + type = string + sensitive = true } diff --git a/testing/gcp/deploy/pg18/main.tf 
b/testing/gcp/deploy/pg18/main.tf index 59bf02e..e765a22 100644 --- a/testing/gcp/deploy/pg18/main.tf +++ b/testing/gcp/deploy/pg18/main.tf @@ -11,7 +11,8 @@ module "postgres" { region = local.region database_name = local.database_name db_user = local.db_user - personal_ip = var.personal_ip + authorized_networks = var.authorized_networks + db_password = var.db_password } output "instance_name" { diff --git a/testing/gcp/deploy/pg18/vars.tf b/testing/gcp/deploy/pg18/vars.tf index ab706b4..1c4a1c4 100644 --- a/testing/gcp/deploy/pg18/vars.tf +++ b/testing/gcp/deploy/pg18/vars.tf @@ -1,4 +1,13 @@ -variable "personal_ip" { - description = "Personal IP to allow access to the instance" +variable "authorized_networks" { + description = "Authorized networks for Cloud SQL" + type = list(object({ + name = string + value = string + })) +} + +variable "db_password" { + description = "Database user password" type = string + sensitive = true } diff --git a/testing/gcp/opentofu/models/postgres/main.tf b/testing/gcp/opentofu/models/postgres/main.tf index 36e1583..9146940 100644 --- a/testing/gcp/opentofu/models/postgres/main.tf +++ b/testing/gcp/opentofu/models/postgres/main.tf @@ -3,6 +3,15 @@ resource "random_password" "password" { special = false } +locals { + resolved_authorized_networks = length(var.authorized_networks) > 0 ? 
var.authorized_networks : [ + { + name = "allow-personal" + value = var.personal_ip + } + ] +} + resource "google_sql_database_instance" "postgres" { name = var.instance_name database_version = var.postgres_version @@ -18,9 +27,13 @@ resource "google_sql_database_instance" "postgres" { ip_configuration { ipv4_enabled = true - authorized_networks { - name = "allow-personal" - value = var.personal_ip + dynamic "authorized_networks" { + for_each = local.resolved_authorized_networks + + content { + name = authorized_networks.value.name + value = authorized_networks.value.value + } } } @@ -40,5 +53,5 @@ resource "google_sql_database" "database" { resource "google_sql_user" "user" { name = var.db_user instance = google_sql_database_instance.postgres.name - password = random_password.password.result + password = var.db_password != "" ? var.db_password : random_password.password.result } diff --git a/testing/gcp/opentofu/models/postgres/variables.tf b/testing/gcp/opentofu/models/postgres/variables.tf index 794b064..45dccd1 100644 --- a/testing/gcp/opentofu/models/postgres/variables.tf +++ b/testing/gcp/opentofu/models/postgres/variables.tf @@ -31,8 +31,24 @@ variable "db_user" { default = "randoneering" } +variable "db_password" { + description = "Database user password. 
If empty, a random password is generated" + type = string + default = "" + sensitive = true +} + variable "personal_ip" { description = "Personal IP to allow connections from" type = string default = "0.0.0.0" } + +variable "authorized_networks" { + description = "List of authorized networks for Cloud SQL" + type = list(object({ + name = string + value = string + })) + default = [] +} diff --git a/testing/pgTAP/03_high_tests.sql b/testing/pgTAP/03_high_tests.sql index 1893e30..5664c42 100644 --- a/testing/pgTAP/03_high_tests.sql +++ b/testing/pgTAP/03_high_tests.sql @@ -1,5 +1,5 @@ BEGIN; -SELECT plan(16); +SELECT plan(42); SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Current Blocked/Blocking Queries'), @@ -46,6 +46,123 @@ SELECT ok( 'View executes Excessive Sequential Scans check' ); +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Top 10 Expensive Active Queries'), + 'Function executes Top 10 Expensive Active Queries check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Top 10 Expensive Active Queries'), + 'View executes Top 10 Expensive Active Queries check' +); + +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'pg_stat_statements Extension Missing'), + 'Function executes pg_stat_statements Extension Missing check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'pg_stat_statements Extension Missing'), + 'View executes pg_stat_statements Extension Missing check' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Top 10 Queries by Total Execution Time') ELSE true END), + 'Function executes Top 10 Queries by Total Execution Time check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 
0 FROM v_pgfirstaid WHERE check_name = 'Top 10 Queries by Total Execution Time') ELSE true END), + 'View executes Top 10 Queries by Total Execution Time check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Mean Execution Time Queries') ELSE true END), + 'Function executes High Mean Execution Time Queries check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Mean Execution Time Queries') ELSE true END), + 'View executes High Mean Execution Time Queries check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Top 10 Queries by Temp Block Spills') ELSE true END), + 'Function executes Top 10 Queries by Temp Block Spills check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Top 10 Queries by Temp Block Spills') ELSE true END), + 'View executes Top 10 Queries by Temp Block Spills check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Low Cache Hit Ratio Queries') ELSE true END), + 'Function executes Low Cache Hit Ratio Queries check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Low Cache 
Hit Ratio Queries') ELSE true END), + 'View executes Low Cache Hit Ratio Queries check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Runtime Variance Queries') ELSE true END), + 'Function executes High Runtime Variance Queries check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Runtime Variance Queries') ELSE true END), + 'View executes High Runtime Variance Queries check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Calls Low Value Queries') ELSE true END), + 'Function executes High Calls Low Value Queries check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Calls Low Value Queries') ELSE true END), + 'View executes High Calls Low Value Queries check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Rows Per Call Queries') ELSE true END), + 'Function executes High Rows Per Call Queries check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Rows Per Call Queries') ELSE true END), + 'View executes High Rows Per Call Queries check when pg_stat_statements is 
installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Shared Block Reads Per Call Queries') ELSE true END), + 'Function executes High Shared Block Reads Per Call Queries check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Shared Block Reads Per Call Queries') ELSE true END), + 'View executes High Shared Block Reads Per Call Queries check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Top Queries by WAL Bytes Per Call') ELSE true END), + 'Function executes Top Queries by WAL Bytes Per Call check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Top Queries by WAL Bytes Per Call') ELSE true END), + 'View executes Top Queries by WAL Bytes Per Call check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Lock-Wait-Heavy Active Queries'), + 'Function executes Lock-Wait-Heavy Active Queries check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Lock-Wait-Heavy Active Queries'), + 'View executes Lock-Wait-Heavy Active Queries check' +); + +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Idle In Transaction Over 5 Minutes'), + 'Function executes Idle In Transaction Over 5 Minutes check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Idle In Transaction Over 5 Minutes'), + 'View executes Idle 
In Transaction Over 5 Minutes check'
+);
+
 SELECT ok(
     (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Table with more than 50 columns'),
     'Function executes Table with more than 50 columns check'
diff --git a/view_pgFirstAid.sql b/view_pgFirstAid.sql
index 6f2f774..c102845 100644
--- a/view_pgFirstAid.sql
+++ b/view_pgFirstAid.sql
@@ -1,3 +1,303 @@
+-- pgfirstaid_pg_stat_statements_checks()
+--
+-- Returns the pg_stat_statements-based "Query Health" findings, one row per reported
+-- statement, in the nine columns declared below.
+--
+-- Returns no rows when the pg_stat_statements extension is not installed. It also returns
+-- no rows, and raises a WARNING, when the extension is installed but its library was not
+-- loaded via shared_preload_libraries: reading the view then fails with SQLSTATE 55000,
+-- and without the handler at the end that error would abort every query that selects
+-- from this function.
+--
+-- PL/pgSQL prepares each statement the first time it is executed, so the early RETURN
+-- keeps this function callable on servers where pg_stat_statements does not exist.
+-- wal_bytes is read through to_jsonb(pss) so an absent column yields NULL, not an error.
+-- The documentation_link literals span several lines; the continuation lines are inside
+-- the string, so their leading whitespace is part of the value and is kept as found.
+create or replace
+function pgfirstaid_pg_stat_statements_checks()
+returns table (
+    severity TEXT,
+    category TEXT,
+    check_name TEXT,
+    object_name TEXT,
+    issue_description TEXT,
+    current_value TEXT,
+    recommended_action TEXT,
+    documentation_link TEXT,
+    severity_order INTEGER
+) as $$
+begin
+    -- No extension, no findings.
+    if not exists (
+        select
+            1
+        from
+            pg_extension
+        where
+            extname = 'pg_stat_statements') then
+        return;
+    end if;
+
+    -- Up to ten statements with the largest total_exec_time (calls > 0).
+    return query
+    with pss as (
+        select
+            queryid,
+            query,
+            calls,
+            total_exec_time,
+            mean_exec_time,
+            rows
+        from
+            pg_stat_statements
+        where
+            calls > 0
+        order by
+            total_exec_time desc
+        limit 10)
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'Top 10 Queries by Total Execution Time' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'Queries with the highest total execution time are usually the best optimization targets for overall workload improvement' as issue_description,
+        'calls: ' || pss.calls || ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) ||
+        ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || ', rows: ' || pss.rows ||
+        ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Run EXPLAIN (ANALYZE, BUFFERS) and focus on reducing total runtime for these fingerprints first' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/using-explain.html \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pss;
+
+    -- Up to ten statements with calls >= 20 and mean_exec_time > 100, highest mean first.
+    return query
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'High Mean Execution Time Queries' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'Queries with high average runtime and enough call volume are underperforming and likely user-visible' as issue_description,
+        'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) ||
+        ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) ||
+        ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Add or improve indexes and rewrite query predicates to reduce per-execution latency' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/using-explain.html \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pg_stat_statements pss
+    where
+        pss.calls >= 20
+        and pss.mean_exec_time > 100
+    order by
+        pss.mean_exec_time desc
+    limit 10;
+
+    -- Up to ten statements with the most temp blocks read + written.
+    return query
+    with pss as (
+        select
+            queryid,
+            query,
+            calls,
+            temp_blks_read,
+            temp_blks_written,
+            total_exec_time
+        from
+            pg_stat_statements
+        where
+            (temp_blks_read + temp_blks_written) > 0
+        order by
+            (temp_blks_read + temp_blks_written) desc
+        limit 10)
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'Top 10 Queries by Temp Block Spills' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'Frequent temp block usage points to sort or hash operations spilling to disk and causing avoidable latency' as issue_description,
+        'calls: ' || pss.calls || ', temp_blks_read: ' || pss.temp_blks_read ||
+        ', temp_blks_written: ' || pss.temp_blks_written || ', total_exec_time_ms: ' ||
+        round(pss.total_exec_time::numeric, 2) || ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Reduce row width, improve index support for sort or group patterns, and tune work_mem cautiously' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/runtime-config-resource.html#GUC-WORK-MEM \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pss;
+
+    -- Up to ten statements with calls >= 20 and a shared-buffer hit ratio below 90, lowest first.
+    return query
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'Low Cache Hit Ratio Queries' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'Low buffer cache hit ratio indicates heavy physical reads and likely missing indexes or poor filtering' as issue_description,
+        'calls: ' || pss.calls || ', cache_hit_pct: ' || round(
+            100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0),
+            2
+        ) || ', shared_blks_read: ' || pss.shared_blks_read || ', shared_blks_hit: ' || pss.shared_blks_hit ||
+        ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Prioritize index tuning and query filtering to reduce disk reads for these statements' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/using-explain.html \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pg_stat_statements pss
+    where
+        pss.calls >= 20
+        and (pss.shared_blks_hit + pss.shared_blks_read) > 0
+        and (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) < 90
+    order by
+        (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) asc
+    limit 10;
+
+    -- Up to ten statements with calls >= 20 whose stddev_exec_time exceeds mean_exec_time.
+    return query
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'High Runtime Variance Queries' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'High runtime variance can indicate plan instability, skewed data distribution, or parameter sensitivity' as issue_description,
+        'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) ||
+        ', stddev_exec_time_ms: ' || round(pss.stddev_exec_time::numeric, 2) ||
+        ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || ', query: ' ||
+        left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Check plan stability with EXPLAIN (ANALYZE, BUFFERS), update statistics, and review parameterized execution paths' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/routine-vacuuming.html \
+ https://www.postgresql.org/docs/current/using-explain.html' as documentation_link,
+        3 as severity_order
+    from
+        pg_stat_statements pss
+    where
+        pss.calls >= 20
+        and pss.stddev_exec_time > pss.mean_exec_time
+    order by
+        pss.stddev_exec_time desc
+    limit 10;
+
+    -- Up to ten statements with calls >= 5000, mean_exec_time <= 2 and at most 2 rows per call.
+    return query
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'High Calls Low Value Queries' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'Very high call volume with low per-call value can create avoidable overhead and crowd out expensive work' as issue_description,
+        'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 3) ||
+        ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) ||
+        ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) ||
+        ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Batch repeated requests, cache stable lookups, and reduce N+1 query patterns in the application layer' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pg_stat_statements pss
+    where
+        pss.calls >= 5000
+        and pss.mean_exec_time <= 2
+        and (pss.rows::numeric / NULLIF(pss.calls, 0)) <= 2
+    order by
+        pss.calls desc
+    limit 10;
+
+    -- Up to ten statements with calls >= 20 returning more than 10000 rows per call.
+    return query
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'High Rows Per Call Queries' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'High rows returned per execution often indicates over-fetching or missing selective filters' as issue_description,
+        'calls: ' || pss.calls || ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) ||
+        ', total_rows: ' || pss.rows || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) ||
+        ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Add tighter predicates, pagination, and narrower SELECT lists to reduce unnecessary row transfer' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/queries-limit.html \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pg_stat_statements pss
+    where
+        pss.calls >= 20
+        and (pss.rows::numeric / NULLIF(pss.calls, 0)) > 10000
+    order by
+        (pss.rows::numeric / NULLIF(pss.calls, 0)) desc
+    limit 10;
+
+    -- Up to ten statements with calls >= 20 reading more than 1000 shared blocks per call.
+    return query
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'High Shared Block Reads Per Call Queries' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'High shared block reads per call usually points to heavy table or index scans and poor locality' as issue_description,
+        'calls: ' || pss.calls || ', shared_blks_read_per_call: ' || round((pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)), 2) ||
+        ', shared_blks_read: ' || pss.shared_blks_read || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) ||
+        ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Use EXPLAIN (ANALYZE, BUFFERS) to add selective indexes and reduce pages read per execution' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/using-explain.html \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pg_stat_statements pss
+    where
+        pss.calls >= 20
+        and (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) > 1000
+    order by
+        (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) desc
+    limit 10;
+
+    -- Up to ten statements with calls >= 20 writing more than 1048576 WAL bytes per call.
+    return query
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'Top Queries by WAL Bytes Per Call' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'High WAL generation per execution can indicate heavy write amplification and expensive update patterns' as issue_description,
+        'calls: ' || pss.calls || ', wal_bytes_per_call: ' || round(
+            ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)),
+            2
+        ) || ', wal_bytes_total: ' || round((to_jsonb(pss)->>'wal_bytes')::numeric, 2) ||
+        ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) ||
+        ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Reduce row churn, batch writes where possible, and review index maintenance cost for heavy write queries' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/wal-intro.html \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pg_stat_statements pss
+    where
+        pss.calls >= 20
+        and coalesce((to_jsonb(pss)->>'wal_bytes')::numeric, 0) > 0
+        and ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) > 1048576
+    order by
+        ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) desc
+    limit 10;
+exception
+    -- SQLSTATE 55000: the extension exists but its library is not in shared_preload_libraries.
+    when object_not_in_prerequisite_state then
+        raise warning 'pg_stat_statements is installed but not loaded via shared_preload_libraries; skipping pg_stat_statements checks';
+        return;
+end;
+$$ language plpgsql;
+
 -- Adding dropping of the view instead of replace because of conversion issues with new health checks.
-- This way we start with a fresh view. drop view if exists v_pgfirstAid; @@ -614,6 +883,147 @@ where state = 'active' and now() - query_start > interval '5 minutes' union all +-- MEDIUM: pg_stat_statements extension missing +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'pg_stat_statements Extension Missing' as check_name, + 'pg_stat_statements' as object_name, + 'pg_stat_statements is not installed, so query fingerprint and workload-level performance checks are unavailable' as issue_description, + 'Extension not found in pg_extension' as current_value, + 'Self-hosted: add pg_stat_statements to shared_preload_libraries, restart PostgreSQL, then run CREATE EXTENSION pg_stat_statements; AWS RDS: add pg_stat_statements to the parameter group shared_preload_libraries, reboot, then CREATE EXTENSION; GCP Cloud SQL: enable cloudsql.enable_pg_stat_statements, restart if required, then CREATE EXTENSION; Azure Database for PostgreSQL: add pg_stat_statements to shared_preload_libraries, restart, then CREATE EXTENSION' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Appendix.PostgreSQL.CommonDBATasks.Extensions.html \ + https://cloud.google.com/sql/docs/postgres/flags \ + https://learn.microsoft.com/azure/postgresql/flexible-server/concepts-server-parameters' as documentation_link, + 3 as severity_order +where + not exists ( + select + 1 + from + pg_extension + where + extname = 'pg_stat_statements' + ) +union all +-- MEDIUM: pg_stat_statements dependent checks +(select + * +from + pgfirstaid_pg_stat_statements_checks()) +union all +-- MEDIUM: Top 10 expensive active queries by runtime +(with eq as ( +select + pgs.pid, + pgs.usename, + pgs.datname, + pgs.client_addr, + now() - pgs.query_start as runtime, + pgs.query +from + pg_stat_activity pgs +where + pgs.state = 'active' + and pgs.query_start is not null + and pgs.pid <> pg_backend_pid() + and now() - 
pgs.query_start > interval '30 seconds' +order by + runtime desc +limit 10) +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Expensive Active Queries' as check_name, + concat_ws(' | ', + 'pid: ' || eq.pid::text, + 'usename: ' || eq.usename, + 'datname: ' || eq.datname, + 'client_address: ' || coalesce(eq.client_addr::text, 'local'), + 'runtime: ' || to_char(eq.runtime, 'HH24:MI:SS') + ) as object_name, + 'Top 10 active queries running longer than 30 seconds, ordered by runtime. Long-running active queries can signal lock waits, missing indexes, or inefficient plans' as issue_description, + left(regexp_replace(eq.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Review these queries with EXPLAIN (ANALYZE, BUFFERS) and reduce lock waits or full scans' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + eq) +union all +-- MEDIUM: Lock-wait-heavy active queries +(with lw as ( +select + pid, + usename, + datname, + client_addr, + wait_event, + query_start, + now() - query_start as runtime, + query +from + pg_stat_activity +where + state = 'active' + and wait_event_type = 'Lock' + and query_start is not null + and now() - query_start > interval '30 seconds' + and pid <> pg_backend_pid() +order by + runtime desc +limit 10) +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Lock-Wait-Heavy Active Queries' as check_name, + concat_ws(' | ', + 'pid: ' || lw.pid::text, + 'usename: ' || lw.usename, + 'datname: ' || lw.datname, + 'client_address: ' || coalesce(lw.client_addr::text, 'local'), + 'wait_event: ' || coalesce(lw.wait_event, 'unknown'), + 'runtime: ' || to_char(lw.runtime, 'HH24:MI:SS') + ) as object_name, + 'Active queries waiting on locks for extended time can 
block throughput and cause cascading latency' as issue_description, + left(regexp_replace(lw.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Reduce transaction duration, enforce consistent lock ordering, and investigate blockers first' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/explicit-locking.html' as documentation_link, + 3 as severity_order +from + lw) +union all +-- MEDIUM: Idle in transaction over 5 minutes +(select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Idle In Transaction Over 5 Minutes' as check_name, + concat_ws(' | ', + 'pid: ' || psa.pid::text, + 'usename: ' || psa.usename, + 'datname: ' || psa.datname, + 'client_address: ' || coalesce(psa.client_addr::text, 'local'), + 'idle_duration: ' || to_char(now() - psa.state_change, 'HH24:MI:SS') + ) as object_name, + 'Sessions left idle in transaction hold snapshots and locks longer than necessary, which can hurt query performance and vacuum progress' as issue_description, + left(regexp_replace(psa.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Commit or rollback promptly and move application processing outside transaction boundaries' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/routine-vacuuming.html' as documentation_link, + 3 as severity_order +from + pg_stat_activity psa +where + psa.state = 'idle in transaction' + and psa.state_change is not null + and now() - psa.state_change > interval '5 minutes' + and psa.pid <> pg_backend_pid() +order by + now() - psa.state_change desc) +union all -- LOW: Roles that have never logged in (with LOGIN rights) (WITH ur AS ( SELECT diff --git a/view_pgFirstAid_managed.sql b/view_pgFirstAid_managed.sql index e60202c..7807d7f 100644 --- a/view_pgFirstAid_managed.sql +++ b/view_pgFirstAid_managed.sql 
@@ -1,3 +1,303 @@
+-- pgfirstaid_pg_stat_statements_checks()
+--
+-- Returns the pg_stat_statements-based "Query Health" findings, one row per reported
+-- statement, in the nine columns declared below.
+--
+-- Returns no rows when the pg_stat_statements extension is not installed. It also returns
+-- no rows, and raises a WARNING, when the extension is installed but its library was not
+-- loaded via shared_preload_libraries: reading the view then fails with SQLSTATE 55000,
+-- and without the handler at the end that error would abort every query that selects
+-- from this function.
+--
+-- PL/pgSQL prepares each statement the first time it is executed, so the early RETURN
+-- keeps this function callable on servers where pg_stat_statements does not exist.
+-- wal_bytes is read through to_jsonb(pss) so an absent column yields NULL, not an error.
+-- The documentation_link literals span several lines; the continuation lines are inside
+-- the string, so their leading whitespace is part of the value and is kept as found.
+create or replace
+function pgfirstaid_pg_stat_statements_checks()
+returns table (
+    severity TEXT,
+    category TEXT,
+    check_name TEXT,
+    object_name TEXT,
+    issue_description TEXT,
+    current_value TEXT,
+    recommended_action TEXT,
+    documentation_link TEXT,
+    severity_order INTEGER
+) as $$
+begin
+    -- No extension, no findings.
+    if not exists (
+        select
+            1
+        from
+            pg_extension
+        where
+            extname = 'pg_stat_statements') then
+        return;
+    end if;
+
+    -- Up to ten statements with the largest total_exec_time (calls > 0).
+    return query
+    with pss as (
+        select
+            queryid,
+            query,
+            calls,
+            total_exec_time,
+            mean_exec_time,
+            rows
+        from
+            pg_stat_statements
+        where
+            calls > 0
+        order by
+            total_exec_time desc
+        limit 10)
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'Top 10 Queries by Total Execution Time' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'Queries with the highest total execution time are usually the best optimization targets for overall workload improvement' as issue_description,
+        'calls: ' || pss.calls || ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) ||
+        ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || ', rows: ' || pss.rows ||
+        ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Run EXPLAIN (ANALYZE, BUFFERS) and focus on reducing total runtime for these fingerprints first' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/using-explain.html \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pss;
+
+    -- Up to ten statements with calls >= 20 and mean_exec_time > 100, highest mean first.
+    return query
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'High Mean Execution Time Queries' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'Queries with high average runtime and enough call volume are underperforming and likely user-visible' as issue_description,
+        'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) ||
+        ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) ||
+        ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Add or improve indexes and rewrite query predicates to reduce per-execution latency' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/using-explain.html \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pg_stat_statements pss
+    where
+        pss.calls >= 20
+        and pss.mean_exec_time > 100
+    order by
+        pss.mean_exec_time desc
+    limit 10;
+
+    -- Up to ten statements with the most temp blocks read + written.
+    return query
+    with pss as (
+        select
+            queryid,
+            query,
+            calls,
+            temp_blks_read,
+            temp_blks_written,
+            total_exec_time
+        from
+            pg_stat_statements
+        where
+            (temp_blks_read + temp_blks_written) > 0
+        order by
+            (temp_blks_read + temp_blks_written) desc
+        limit 10)
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'Top 10 Queries by Temp Block Spills' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'Frequent temp block usage points to sort or hash operations spilling to disk and causing avoidable latency' as issue_description,
+        'calls: ' || pss.calls || ', temp_blks_read: ' || pss.temp_blks_read ||
+        ', temp_blks_written: ' || pss.temp_blks_written || ', total_exec_time_ms: ' ||
+        round(pss.total_exec_time::numeric, 2) || ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Reduce row width, improve index support for sort or group patterns, and tune work_mem cautiously' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/runtime-config-resource.html#GUC-WORK-MEM \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pss;
+
+    -- Up to ten statements with calls >= 20 and a shared-buffer hit ratio below 90, lowest first.
+    return query
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'Low Cache Hit Ratio Queries' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'Low buffer cache hit ratio indicates heavy physical reads and likely missing indexes or poor filtering' as issue_description,
+        'calls: ' || pss.calls || ', cache_hit_pct: ' || round(
+            100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0),
+            2
+        ) || ', shared_blks_read: ' || pss.shared_blks_read || ', shared_blks_hit: ' || pss.shared_blks_hit ||
+        ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Prioritize index tuning and query filtering to reduce disk reads for these statements' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/using-explain.html \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pg_stat_statements pss
+    where
+        pss.calls >= 20
+        and (pss.shared_blks_hit + pss.shared_blks_read) > 0
+        and (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) < 90
+    order by
+        (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) asc
+    limit 10;
+
+    -- Up to ten statements with calls >= 20 whose stddev_exec_time exceeds mean_exec_time.
+    return query
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'High Runtime Variance Queries' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'High runtime variance can indicate plan instability, skewed data distribution, or parameter sensitivity' as issue_description,
+        'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) ||
+        ', stddev_exec_time_ms: ' || round(pss.stddev_exec_time::numeric, 2) ||
+        ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || ', query: ' ||
+        left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Check plan stability with EXPLAIN (ANALYZE, BUFFERS), update statistics, and review parameterized execution paths' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/routine-vacuuming.html \
+ https://www.postgresql.org/docs/current/using-explain.html' as documentation_link,
+        3 as severity_order
+    from
+        pg_stat_statements pss
+    where
+        pss.calls >= 20
+        and pss.stddev_exec_time > pss.mean_exec_time
+    order by
+        pss.stddev_exec_time desc
+    limit 10;
+
+    -- Up to ten statements with calls >= 5000, mean_exec_time <= 2 and at most 2 rows per call.
+    return query
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'High Calls Low Value Queries' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'Very high call volume with low per-call value can create avoidable overhead and crowd out expensive work' as issue_description,
+        'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 3) ||
+        ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) ||
+        ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) ||
+        ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Batch repeated requests, cache stable lookups, and reduce N+1 query patterns in the application layer' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pg_stat_statements pss
+    where
+        pss.calls >= 5000
+        and pss.mean_exec_time <= 2
+        and (pss.rows::numeric / NULLIF(pss.calls, 0)) <= 2
+    order by
+        pss.calls desc
+    limit 10;
+
+    -- Up to ten statements with calls >= 20 returning more than 10000 rows per call.
+    return query
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'High Rows Per Call Queries' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'High rows returned per execution often indicates over-fetching or missing selective filters' as issue_description,
+        'calls: ' || pss.calls || ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) ||
+        ', total_rows: ' || pss.rows || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) ||
+        ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Add tighter predicates, pagination, and narrower SELECT lists to reduce unnecessary row transfer' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/queries-limit.html \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pg_stat_statements pss
+    where
+        pss.calls >= 20
+        and (pss.rows::numeric / NULLIF(pss.calls, 0)) > 10000
+    order by
+        (pss.rows::numeric / NULLIF(pss.calls, 0)) desc
+    limit 10;
+
+    -- Up to ten statements with calls >= 20 reading more than 1000 shared blocks per call.
+    return query
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'High Shared Block Reads Per Call Queries' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'High shared block reads per call usually points to heavy table or index scans and poor locality' as issue_description,
+        'calls: ' || pss.calls || ', shared_blks_read_per_call: ' || round((pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)), 2) ||
+        ', shared_blks_read: ' || pss.shared_blks_read || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) ||
+        ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Use EXPLAIN (ANALYZE, BUFFERS) to add selective indexes and reduce pages read per execution' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/using-explain.html \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pg_stat_statements pss
+    where
+        pss.calls >= 20
+        and (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) > 1000
+    order by
+        (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) desc
+    limit 10;
+
+    -- Up to ten statements with calls >= 20 writing more than 1048576 WAL bytes per call.
+    return query
+    select
+        'MEDIUM' as severity,
+        'Query Health' as category,
+        'Top Queries by WAL Bytes Per Call' as check_name,
+        'queryid: ' || pss.queryid::text as object_name,
+        'High WAL generation per execution can indicate heavy write amplification and expensive update patterns' as issue_description,
+        'calls: ' || pss.calls || ', wal_bytes_per_call: ' || round(
+            ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)),
+            2
+        ) || ', wal_bytes_total: ' || round((to_jsonb(pss)->>'wal_bytes')::numeric, 2) ||
+        ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) ||
+        ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value,
+        'Reduce row churn, batch writes where possible, and review index maintenance cost for heavy write queries' as recommended_action,
+        'https://www.postgresql.org/docs/current/pgstatstatements.html \
+ https://www.postgresql.org/docs/current/wal-intro.html \
+ https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link,
+        3 as severity_order
+    from
+        pg_stat_statements pss
+    where
+        pss.calls >= 20
+        and coalesce((to_jsonb(pss)->>'wal_bytes')::numeric, 0) > 0
+        and ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) > 1048576
+    order by
+        ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) desc
+    limit 10;
+exception
+    -- SQLSTATE 55000: the extension exists but its library is not in shared_preload_libraries.
+    when object_not_in_prerequisite_state then
+        raise warning 'pg_stat_statements is installed but not loaded via shared_preload_libraries; skipping pg_stat_statements checks';
+        return;
+end;
+$$ language plpgsql;
+
 -- Adding dropping of the view instead of replace because of conversion issues with new health checks.
 -- This way we start with a fresh view.
drop view if exists v_pgfirstAid; @@ -608,6 +877,147 @@ where state = 'active' and now() - query_start > interval '5 minutes' union all +-- MEDIUM: pg_stat_statements extension missing +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'pg_stat_statements Extension Missing' as check_name, + 'pg_stat_statements' as object_name, + 'pg_stat_statements is not installed, so query fingerprint and workload-level performance checks are unavailable' as issue_description, + 'Extension not found in pg_extension' as current_value, + 'Self-hosted: add pg_stat_statements to shared_preload_libraries, restart PostgreSQL, then run CREATE EXTENSION pg_stat_statements; AWS RDS: add pg_stat_statements to the parameter group shared_preload_libraries, reboot, then CREATE EXTENSION; GCP Cloud SQL: enable cloudsql.enable_pg_stat_statements, restart if required, then CREATE EXTENSION; Azure Database for PostgreSQL: add pg_stat_statements to shared_preload_libraries, restart, then CREATE EXTENSION' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Appendix.PostgreSQL.CommonDBATasks.Extensions.html \ + https://cloud.google.com/sql/docs/postgres/flags \ + https://learn.microsoft.com/azure/postgresql/flexible-server/concepts-server-parameters' as documentation_link, + 3 as severity_order +where + not exists ( + select + 1 + from + pg_extension + where + extname = 'pg_stat_statements' + ) +union all +-- MEDIUM: pg_stat_statements dependent checks +(select + * +from + pgfirstaid_pg_stat_statements_checks()) +union all +-- MEDIUM: Top 10 expensive active queries by runtime +(with eq as ( +select + pgs.pid, + pgs.usename, + pgs.datname, + pgs.client_addr, + now() - pgs.query_start as runtime, + pgs.query +from + pg_stat_activity pgs +where + pgs.state = 'active' + and pgs.query_start is not null + and pgs.pid <> pg_backend_pid() + and now() - pgs.query_start > interval '30 seconds' +order 
by + runtime desc +limit 10) +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Expensive Active Queries' as check_name, + concat_ws(' | ', + 'pid: ' || eq.pid::text, + 'usename: ' || eq.usename, + 'datname: ' || eq.datname, + 'client_address: ' || coalesce(eq.client_addr::text, 'local'), + 'runtime: ' || to_char(eq.runtime, 'HH24:MI:SS') + ) as object_name, + 'Top 10 active queries running longer than 30 seconds, ordered by runtime. Long-running active queries can signal lock waits, missing indexes, or inefficient plans' as issue_description, + left(regexp_replace(eq.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Review these queries with EXPLAIN (ANALYZE, BUFFERS) and reduce lock waits or full scans' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + eq) +union all +-- MEDIUM: Lock-wait-heavy active queries +(with lw as ( +select + pid, + usename, + datname, + client_addr, + wait_event, + query_start, + now() - query_start as runtime, + query +from + pg_stat_activity +where + state = 'active' + and wait_event_type = 'Lock' + and query_start is not null + and now() - query_start > interval '30 seconds' + and pid <> pg_backend_pid() +order by + runtime desc +limit 10) +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Lock-Wait-Heavy Active Queries' as check_name, + concat_ws(' | ', + 'pid: ' || lw.pid::text, + 'usename: ' || lw.usename, + 'datname: ' || lw.datname, + 'client_address: ' || coalesce(lw.client_addr::text, 'local'), + 'wait_event: ' || coalesce(lw.wait_event, 'unknown'), + 'runtime: ' || to_char(lw.runtime, 'HH24:MI:SS') + ) as object_name, + 'Active queries waiting on locks for extended time can block throughput and cause cascading latency' as 
issue_description, + left(regexp_replace(lw.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Reduce transaction duration, enforce consistent lock ordering, and investigate blockers first' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/explicit-locking.html' as documentation_link, + 3 as severity_order +from + lw) +union all +-- MEDIUM: Idle in transaction over 5 minutes +(select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Idle In Transaction Over 5 Minutes' as check_name, + concat_ws(' | ', + 'pid: ' || psa.pid::text, + 'usename: ' || psa.usename, + 'datname: ' || psa.datname, + 'client_address: ' || coalesce(psa.client_addr::text, 'local'), + 'idle_duration: ' || to_char(now() - psa.state_change, 'HH24:MI:SS') + ) as object_name, + 'Sessions left idle in transaction hold snapshots and locks longer than necessary, which can hurt query performance and vacuum progress' as issue_description, + left(regexp_replace(psa.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Commit or rollback promptly and move application processing outside transaction boundaries' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/routine-vacuuming.html' as documentation_link, + 3 as severity_order +from + pg_stat_activity psa +where + psa.state = 'idle in transaction' + and psa.state_change is not null + and now() - psa.state_change > interval '5 minutes' + and psa.pid <> pg_backend_pid() +order by + now() - psa.state_change desc) +union all -- LOW: Roles that have never logged in (with LOGIN rights) (WITH ur AS ( SELECT