From fad47a9806c42b837f67cab3017a87a11775fb4e Mon Sep 17 00:00:00 2001 From: Nilakanta Mallick Date: Mon, 30 Mar 2026 19:11:39 +0530 Subject: [PATCH 1/2] Expand RDS PostgreSQL and ElastiCache monitoring guides --- docs/instrument/infra/aws/elasticache.md | 398 +++++++++++++++----- docs/instrument/infra/aws/rds.md | 451 +++++++++++++++++++---- 2 files changed, 685 insertions(+), 164 deletions(-) diff --git a/docs/instrument/infra/aws/elasticache.md b/docs/instrument/infra/aws/elasticache.md index 2cb1f12..083a27c 100644 --- a/docs/instrument/infra/aws/elasticache.md +++ b/docs/instrument/infra/aws/elasticache.md @@ -4,169 +4,399 @@ id: collecting-aws-elasticache-telemetry title: AWS ElastiCache Monitoring with OpenTelemetry - Redis & Memcached Metrics sidebar_label: AWS ElastiCache description: - Stream ElastiCache Redis and Memcached metrics via CloudWatch. Monitor - cache hit rates, evictions, memory usage, and latency with - OpenTelemetry. + Monitor AWS ElastiCache Redis and Memcached with OpenTelemetry and + CloudWatch Metrics Stream. Track cache hit rates, evictions, memory, + latency, and connected clients in base14 Scout. 
keywords: - [ - aws elasticache monitoring, - elasticache redis monitoring, - cloudwatch metrics stream, - aws cache monitoring, - elasticache observability, - ] + - aws elasticache monitoring + - elasticache redis monitoring + - elasticache metrics + - elasticache redis metrics + - elasticache observability + - cloudwatch metrics stream + - aws cache monitoring + - elasticache redis observability + - monitor elasticache + - elasticache cloudwatch metrics head: - - script - type: application/ld+json - | - {"@context":"https://schema.org","@type":"FAQPage","mainEntity":[{"@type":"Question","name":"How do I monitor AWS ElastiCache with OpenTelemetry?","acceptedAnswer":{"@type":"Answer","text":"Use CloudWatch Metrics Stream to collect ElastiCache metrics with 2-3 minute latency, and optionally add the OpenTelemetry Redis receiver for detailed cache-specific metrics. Both can be sent to base14 Scout."}},{"@type":"Question","name":"Should I use CloudWatch Metrics Stream or Prometheus exporters for ElastiCache?","acceptedAnswer":{"@type":"Answer","text":"CloudWatch Metrics Stream is recommended over Prometheus exporters for ElastiCache. It provides faster delivery (2-3 min vs 5+ min), lower cost, better scalability, and automatic metric discovery."}},{"@type":"Question","name":"What ElastiCache metrics can I collect with OpenTelemetry?","acceptedAnswer":{"@type":"Answer","text":"CloudWatch Metrics Stream delivers CPU utilization, memory usage, cache hits/misses, network I/O, and evictions. 
The optional Redis receiver adds detailed metrics like command latency, connected clients, and keyspace stats."}},{"@type":"Question","name":"How do I filter ElastiCache metrics in CloudWatch Metrics Stream?","acceptedAnswer":{"@type":"Answer","text":"When configuring your CloudWatch Metrics Stream, select specific namespaces and choose only AWS/ElastiCache instead of all namespaces to reduce costs and data volume."}},{"@type":"Question","name":"Can I monitor both ElastiCache Redis and Memcached with OpenTelemetry?","acceptedAnswer":{"@type":"Answer","text":"Yes. CloudWatch Metrics Stream supports both Redis and Memcached ElastiCache engines. For Redis, you can also add the OpenTelemetry Redis receiver for deeper cache-level metrics."}}]} + {"@context":"https://schema.org","@type":"FAQPage","mainEntity":[{"@type":"Question","name":"How do I monitor AWS ElastiCache with OpenTelemetry?","acceptedAnswer":{"@type":"Answer","text":"Use CloudWatch Metrics Stream to collect ElastiCache infrastructure metrics (CPU, memory, network) with 2-3 minute latency, and add the OpenTelemetry Redis receiver for cache-specific metrics like command latency, keyspace hits, and connected clients. Both feed into base14 Scout."}},{"@type":"Question","name":"What ElastiCache metrics does CloudWatch collect?","acceptedAnswer":{"@type":"Answer","text":"CloudWatch collects CPUUtilization, EngineCPUUtilization, FreeableMemory, NetworkBytesIn/Out, CurrConnections, NewConnections, CacheHits, CacheMisses, Evictions, ReplicationLag, and BytesUsedForCache for ElastiCache Redis and Memcached."}},{"@type":"Question","name":"Should I use CloudWatch Metrics Stream or the Redis receiver for ElastiCache?","acceptedAnswer":{"@type":"Answer","text":"Use both. CloudWatch provides host-level metrics (CPU, memory, network). The OTel Redis receiver adds cache internals like per-command latency, keyspace statistics, and memory fragmentation ratio. 
Together they give complete visibility."}},{"@type":"Question","name":"How do I monitor ElastiCache Redis slow commands?","acceptedAnswer":{"@type":"Answer","text":"Enable Redis slow log in your ElastiCache parameter group by setting slowlog-log-slower-than to a threshold in microseconds (e.g., 10000 for 10ms). Forward slow logs via CloudWatch Logs to your OTel Collector for analysis."}},{"@type":"Question","name":"What is a good cache hit rate for ElastiCache Redis?","acceptedAnswer":{"@type":"Answer","text":"A healthy Redis cache hit rate is above 95%. Below 90% indicates that a significant portion of requests are missing the cache and hitting the backend database, which defeats the purpose of caching. Monitor CacheHits / (CacheHits + CacheMisses) to track this ratio."}},{"@type":"Question","name":"How do I set up alerts for ElastiCache?","acceptedAnswer":{"@type":"Answer","text":"Route ElastiCache metrics through CloudWatch Metrics Stream to base14 Scout, then alert on: cache hit rate below 90%, evictions above zero (sustained), memory usage above 80%, CPU above 70%, replication lag above 5 seconds, and current connections approaching the max."}},{"@type":"Question","name":"Can I monitor both ElastiCache Redis and Memcached with OpenTelemetry?","acceptedAnswer":{"@type":"Answer","text":"Yes. CloudWatch Metrics Stream supports both engines. For Redis, add the OTel Redis receiver for deeper cache-level metrics. For Memcached, the OTel Memcached receiver collects hit rates, evictions, and connection counts."}}]} --- ## Overview -This guide will walk you through collecting rich telemetry data from your -ElastiCache caches using CloudWatch Metrics Stream. We recommend using -CloudWatch Metrics Stream over Prometheus exporters as it provides faster metric -delivery (2-3 minute latency) and is more efficient for AWS services. +This guide covers monitoring AWS ElastiCache (Redis and Memcached) +using OpenTelemetry and CloudWatch Metrics Stream. 
You'll collect +infrastructure metrics from CloudWatch, cache-specific metrics from +the Redis receiver, and slow logs — all flowing into base14 Scout. -## Collecting ElastiCache Metrics +## What You'll Monitor -For collecting ElastiCache metrics, we recommend using **CloudWatch Metrics -Stream** instead of Prometheus exporters. CloudWatch Metrics Stream provides: +ElastiCache monitoring combines CloudWatch metrics with optional +Redis receiver metrics for complete visibility: -- **Faster delivery**: 2-3 minute latency vs 5+ minutes with polling -- **Lower cost**: No need to run dedicated exporters -- **Better scalability**: Native AWS service integration -- **Automatic metric discovery**: No need to manually configure metric lists +**CloudWatch Metrics Stream (infrastructure + cache basics):** -### Step 1: Set up CloudWatch Metrics Stream +| Metric | What it tells you | +| ------ | ----------------- | +| `CPUUtilization` | Instance CPU usage (%) | +| `EngineCPUUtilization` | Redis/Memcached engine CPU (%) — more relevant than host CPU | +| `FreeableMemory` | Available RAM (bytes) | +| `BytesUsedForCache` | Memory used by the cache engine | +| `CacheHits` / `CacheMisses` | Cache effectiveness | +| `Evictions` | Keys removed due to memory pressure | +| `CurrConnections` / `NewConnections` | Client connection counts | +| `NetworkBytesIn` / `NetworkBytesOut` | Network throughput | +| `ReplicationLag` | Replica delay (seconds, Redis only) | +| `SaveInProgress` | Whether a background save is running (Redis) | +| `CurrItems` | Number of items in the cache | -Follow our comprehensive -[CloudWatch Metrics Stream guide](cloudwatch-metrics-stream.md) to set up the -infrastructure. 
+**OTel Redis receiver (cache internals, Redis only):** -### Step 2: Configure ElastiCache metrics filtering +| Metric | What it tells you | +| ------ | ----------------- | +| `redis.memory.used` | Actual memory consumed by Redis | +| `redis.maxmemory` | Configured memory limit | +| `redis.connected_clients` | Currently connected client count | +| `redis.keyspace.hits` / `redis.keyspace.misses` | Per-keyspace hit/miss rates | +| `redis.keys.expired` | Keys expired by TTL | +| `redis.keys.evicted` | Keys evicted under memory pressure | +| `redis.uptime` | Time since last restart (seconds) | +| `redis.memory.fragmentation_ratio` | Memory fragmentation (> 1.5 is a concern) | +| `redis.commands.processed` | Total commands processed | +| `redis.connections.received` | Total connections received since start | -When configuring your CloudWatch Metrics Stream in **Step 3** of the setup -guide, make sure to: +## Prerequisites -1. **Select specific namespaces** instead of "All namespaces" -2. **Choose only AWS/ElastiCache** from the namespace list -3. 
This ensures you only collect ElastiCache metrics, reducing costs and data - volume +| Requirement | Minimum | Recommended | +| ----------- | ------- | ----------- | +| ElastiCache | Redis 6.x or Memcached 1.6 | Redis 7.x | +| OTel Collector Contrib | 0.90.0 | latest | +| base14 Scout | Any | - | +| AWS permissions | CloudWatch, Kinesis Firehose, S3 | - | -### Step 3: Create OTEL Collector config for Redis metrics (Optional) +Before starting: -If you're using Redis and need detailed cache-specific metrics, create -`elasticache-metrics-collection-config.yaml`: +- ElastiCache cluster must be accessible from the host running the + OTel Collector (same VPC) +- For the Redis receiver: AUTH token if encryption in transit is + enabled +- CloudWatch Metrics Stream infrastructure set up (see Step 1) -```yaml +## Step 1: Set up CloudWatch Metrics Stream + +Follow our comprehensive +[CloudWatch Metrics Stream guide](cloudwatch-metrics-stream.md) to +set up the streaming infrastructure (S3 bucket, Kinesis Firehose, +Metrics Stream). + +When configuring the Metrics Stream: + +1. Select **specific namespaces** instead of "All namespaces" +2. Choose **AWS/ElastiCache** from the namespace list +3. 
This ensures you only collect ElastiCache metrics, reducing costs + and data volume + +## Step 2: Configure the OTel Collector for Redis metrics + +For Redis clusters, add the Redis receiver for cache-internal metrics +that CloudWatch doesn't expose: + +```yaml showLineNumbers title="elasticache-redis-config.yaml" receivers: redis: endpoint: ${env:REDIS_ENDPOINT} collection_interval: 60s - password: ${env:REDIS_PASSWORD} - # transport: tcp - # tls: - # insecure: false - # ca_file: /etc/ssl/certs/ca-certificates.crt - # cert_file: /etc/ssl/certs/redis.crt - # key_file: /etc/ssl/certs/redis.key + password: ${env:REDIS_AUTH_TOKEN} + tls: + insecure: false + ca_file: /etc/ssl/certs/ca-certificates.crt metrics: redis.maxmemory: enabled: true - redis.cmd.latency: - enabled: true redis.connected_clients: enabled: true redis.uptime: enabled: true redis.memory.used: enabled: true + redis.memory.fragmentation_ratio: + enabled: true redis.keys.expired: enabled: true + redis.keys.evicted: + enabled: true redis.keyspace.hits: enabled: true redis.keyspace.misses: enabled: true + redis.commands.processed: + enabled: true + redis.connections.received: + enabled: true + +processors: + resource: + attributes: + - key: environment + value: ${env:ENVIRONMENT} + action: upsert + - key: service.name + value: ${env:SERVICE_NAME} + action: upsert + - key: cloud.provider + value: aws + action: upsert + + batch: + timeout: 10s + send_batch_size: 1024 exporters: - otlp: - endpoint: ":4317" + otlphttp/b14: + endpoint: ${env:OTEL_EXPORTER_OTLP_ENDPOINT} tls: - insecure: true + insecure_skip_verify: true service: pipelines: - metrics/elasticache: + metrics: receivers: [redis] - exporters: [otlp] + processors: [batch, resource] + exporters: [otlphttp/b14] ``` -> **Note**: CloudWatch Metrics Stream will automatically deliver AWS/ElastiCache -> metrics (CPU utilization, memory usage, cache hits/misses, network I/O, etc.), -> while the Redis receiver collects detailed cache-specific metrics if 
needed.

+### Environment variables

-## Collecting Elasticache Logs

+```bash showLineNumbers title=".env"
+REDIS_ENDPOINT=your-cluster.xxxxx.ng.0001.use1.cache.amazonaws.com:6379
+REDIS_AUTH_TOKEN=your_auth_token
+ENVIRONMENT=production
+SERVICE_NAME=elasticache-redis
+OTEL_EXPORTER_OTLP_ENDPOINT=https://<your-tenant>.base14.io
+```

-The log collection of Elasticache Cluster requires specifying the list of log
-group names.From the AWS CloudWatch console , please find the log group(s)
-relevant to the integration.

+> **Note**: CloudWatch Metrics Stream delivers the infrastructure
+> metrics (CPU, memory, connections, evictions) automatically. The
+> Redis receiver above adds cache internals like keyspace hit rates,
+> memory fragmentation, and connection details. For Memcached
+> clusters, use the
+> [Memcached receiver](../../component/memcached.md) instead.

-### Create the Collector config file

+## Step 3: Collect ElastiCache logs

-```yaml
+ElastiCache Redis supports two log types through CloudWatch:
+
+- **Slow log** — commands exceeding a latency threshold
+- **Engine log** — connection events, failovers, configuration changes
+
+Configure the CloudWatch Logs receiver:
+
+```yaml showLineNumbers title="elasticache-logs-config.yaml"
 receivers:
-  awscloudwatch/elasticache_logs:
-    region: us-east-1
+  awscloudwatch/elasticache:
+    region: ${env:AWS_REGION}
     logs:
       poll_interval: 1m
       groups:
         named:
-          # replace with your Elasticache's log group name
-          /aws/elasticache/:
+          # Replace with your ElastiCache cluster ID
+          /aws/elasticache/cluster/${env:CLUSTER_ID}/slow-log:
+          /aws/elasticache/cluster/${env:CLUSTER_ID}/engine-log:

 processors:
-  attributes/add_source_elasticache:
+  attributes/add_source:
     actions:
       - key: source
         value: "elasticache"
         action: insert
+      - key: cloud.provider
+        value: "aws"
+        action: insert
+
   batch:
     send_batch_size: 10000
     send_batch_max_size: 11000
     timeout: 10s

 exporters:
-  otlp:
-    endpoint: ":4317"
+  otlphttp/b14:
+    endpoint: ${env:OTEL_EXPORTER_OTLP_ENDPOINT}
     tls:
-      insecure: false
+      insecure_skip_verify: true

 service:
   pipelines:
     logs/elasticache:
-      receivers: [awscloudwatch/elasticache_logs]
-      processors: [attributes/add_source_elasticache, batch]
-      exporters: [otlp]
+      receivers: [awscloudwatch/elasticache]
+      processors: [attributes/add_source, batch]
+      exporters: [otlphttp/b14]
 ```

-After deploying these changes, generate some traffic to your elasticache cluster
-and check in Scout to see your elasticache's metrics and logs.

----

+### Enable slow log in ElastiCache
+
+In your ElastiCache parameter group, set:
+
+```text
+slowlog-log-slower-than = 10000   # Log commands over 10ms (microseconds)
+slowlog-max-len = 128             # Keep last 128 slow commands
+```
+
+Then in the ElastiCache console, enable **Log delivery** for both
+slow log and engine log, targeting CloudWatch Logs.
+
+## Step 4: Verify the setup
+
+Start the Collector and check for metrics:
+
+```bash showLineNumbers
+# Test Redis connectivity from the Collector host
+redis-cli -h ${REDIS_ENDPOINT%:*} -p 6379 \
+  --tls --cacert /etc/ssl/certs/ca-certificates.crt \
+  -a ${REDIS_AUTH_TOKEN} ping
+```
+
+Check Scout for both CloudWatch metrics (prefixed `aws.elasticache.*`)
+and Redis metrics (prefixed `redis.*`).
+ +## Key alerts to configure + +| Metric | Warning | Critical | Why | +| ------ | ------- | -------- | --- | +| Cache hit rate | < 90% | < 80% | Low hit rate means cache isn't effective — requests hit the database instead | +| `Evictions` | > 0 (sustained) | > 100/min | Evictions mean memory pressure is forcing useful data out | +| `EngineCPUUtilization` | > 65% | > 80% | Redis is single-threaded — high CPU means commands are queuing | +| `BytesUsedForCache` | > 80% of max | > 90% of max | Approaching memory limit triggers aggressive eviction | +| `CurrConnections` | > 80% of max | > 90% of max | Connection exhaustion causes application errors | +| `ReplicationLag` | > 5s | > 30s | High lag means replicas serve stale data | +| `redis.memory.fragmentation_ratio` | > 1.5 | > 2.0 | High fragmentation wastes memory — consider a restart | +| Slow log entries | > 10/min | > 50/min | Frequent slow commands indicate saturation — check slow log | + +**Cache hit rate formula:** +`CacheHits / (CacheHits + CacheMisses) * 100` + +> **Why EngineCPUUtilization, not CPUUtilization?** ElastiCache Redis +> is single-threaded. `CPUUtilization` shows total host CPU across +> all cores, which can look low even when the Redis engine core is +> saturated. `EngineCPUUtilization` shows the single-core usage that +> actually matters. + +## Troubleshooting + +### Redis receiver shows no metrics + +**Cause**: Collector can't reach the ElastiCache cluster. + +**Fix**: + +1. ElastiCache is VPC-only — the Collector must run in the same VPC + or a peered VPC +2. Check the security group allows inbound on port 6379 from the + Collector's security group +3. If encryption in transit is enabled, the Redis receiver must use + TLS (`tls.insecure: false` with a CA cert) +4. Test connectivity: + `redis-cli -h -p 6379 --tls -a ping` + +### CloudWatch metrics not appearing + +**Cause**: Metrics Stream not configured for the AWS/ElastiCache +namespace. + +**Fix**: + +1. 
In CloudWatch > Metrics > Streams, verify the stream is active +2. Check that the namespace filter includes `AWS/ElastiCache` +3. Verify Kinesis Firehose delivery is succeeding +4. Allow 5-10 minutes for initial metrics to flow + +### High evictions but low memory usage + +**Cause**: The `maxmemory-policy` is set to a volatile policy +(like `volatile-lru`) and keys without TTLs are filling memory, +while keys with TTLs get evicted. + +**Fix**: + +1. Check the eviction policy: + `redis-cli CONFIG GET maxmemory-policy` +2. If using `volatile-lru`, consider switching to `allkeys-lru` +3. Review key TTL distribution — sample keys and check their TTLs + to identify keys without expiration + +### Cache hit rate dropping + +**Cause**: Application pattern change, insufficient memory, or key +expiration settings. + +**Fix**: + +1. Check if evictions are increasing (memory pressure pushing out + useful keys) +2. Review whether application code is requesting keys that were + never cached +3. Compare `CurrItems` trend — a sudden drop suggests mass + expiration +4. Consider increasing node size or adding shards + +## FAQ + +**How do I monitor ElastiCache Redis slow commands?** + +Enable the slow log in your ElastiCache parameter group by setting +`slowlog-log-slower-than` to a threshold in microseconds (10000 = +10ms). Enable log delivery to CloudWatch Logs, then forward to +Scout via the CloudWatch Logs receiver. + +**What is a good cache hit rate?** + +Above 95% is healthy. Below 90% means a significant portion of +requests miss the cache and hit the backend database. Track the +ratio over time — a gradual decline often indicates growing data +volume without proportional cache capacity. + +**Can I monitor Memcached clusters with this setup?** + +Yes. CloudWatch Metrics Stream covers Memcached infrastructure +metrics. 
For cache-specific metrics, the OTel Collector has a +[Memcached receiver](../../component/memcached.md) that collects +hit rates, evictions, connection counts, and memory usage — the +Memcached equivalent of the Redis receiver above. + +**Should I monitor ElastiCache Serverless differently?** + +ElastiCache Serverless uses the same CloudWatch metrics namespace +(`AWS/ElastiCache`) but adds metrics like +`ElastiCacheProcessingUnits` for capacity tracking. The CloudWatch +Metrics Stream setup is identical — just include the +`AWS/ElastiCache` namespace. + +**How do I monitor multiple ElastiCache clusters?** + +Add multiple Redis receiver blocks with distinct names: + +```yaml +receivers: + redis/sessions: + endpoint: sessions-cluster.xxxxx.cache.amazonaws.com:6379 + redis/cache: + endpoint: cache-cluster.xxxxx.cache.amazonaws.com:6379 +``` -With this setup, your AWS Elasticache cluster becomes fully observable through -Scout. You'll gain real-time visibility into performance metrics and logs -without any changes to your application code. +Then include both in the pipeline: +`receivers: [redis/sessions, redis/cache]`. 
## Related Guides -- [CloudWatch Metrics Stream Setup](./cloudwatch-metrics-stream.md) - Set up AWS - metrics streaming -- [ELB Monitoring](./elb.md) - Monitor AWS Application Load Balancers -- [RDS Monitoring](./rds.md) - Monitor AWS RDS databases -- [Redis Monitoring](../../component/redis.md) - Self-hosted Redis monitoring - guide +- [CloudWatch Metrics Stream Setup](./cloudwatch-metrics-stream.md) — + Configure AWS metrics streaming +- [Redis Monitoring](../../component/redis.md) — Self-hosted Redis + monitoring with OpenTelemetry +- [Memcached Monitoring](../../component/memcached.md) — Self-hosted + Memcached monitoring +- [RDS Monitoring](./rds.md) — Monitor AWS RDS databases +- [ELB Monitoring](./elb.md) — Monitor AWS Application Load Balancers - [OTel Collector Configuration](../../collector-setup/otel-collector-config.md) - \- Advanced collector configuration + — Collector setup basics diff --git a/docs/instrument/infra/aws/rds.md b/docs/instrument/infra/aws/rds.md index 83c0979..f097efd 100644 --- a/docs/instrument/infra/aws/rds.md +++ b/docs/instrument/infra/aws/rds.md @@ -1,74 +1,137 @@ --- date: 2025-04-26 id: collecting-aws-rds-postgres-telemetry -title: AWS RDS PostgreSQL Monitoring - Metrics, Logs & Query Performance +title: AWS RDS PostgreSQL Monitoring with OpenTelemetry - Metrics, Logs & Alerts sidebar_label: AWS RDS description: - Stream AWS RDS PostgreSQL metrics via CloudWatch. Monitor connections, - replication lag, IOPS, and query performance with OpenTelemetry and - base14 Scout. + Monitor AWS RDS PostgreSQL with OpenTelemetry and CloudWatch Metrics + Stream. Collect connections, replication lag, IOPS, query performance, + and Performance Insights data in base14 Scout. 
keywords: - [ - aws rds monitoring, - rds postgresql monitoring, - cloudwatch metrics stream, - aws database monitoring, - rds observability, - ] + - aws rds monitoring + - rds postgresql monitoring + - postgresql rds metrics + - aws rds postgres monitoring + - rds postgres observability + - cloudwatch metrics stream + - aws database monitoring + - rds performance insights + - aws rds postgresql observability + - rds postgres dashboard head: - - script - type: application/ld+json - | - {"@context":"https://schema.org","@type":"FAQPage","mainEntity":[{"@type":"Question","name":"How do I monitor AWS RDS PostgreSQL with OpenTelemetry?","acceptedAnswer":{"@type":"Answer","text":"Use CloudWatch Metrics Stream for infrastructure metrics (CPU, memory, disk I/O) and the OpenTelemetry PostgreSQL receiver for database-specific metrics like connections, query performance, and locks. Both feed into base14 Scout."}},{"@type":"Question","name":"What RDS metrics does CloudWatch Metrics Stream collect?","acceptedAnswer":{"@type":"Answer","text":"CloudWatch Metrics Stream automatically delivers AWS/RDS metrics including CPU utilization, memory usage, disk I/O, read/write latency, database connections, and replication lag with 2-3 minute latency."}},{"@type":"Question","name":"Do I need both CloudWatch Metrics Stream and the PostgreSQL receiver for RDS?","acceptedAnswer":{"@type":"Answer","text":"CloudWatch provides infrastructure-level RDS metrics while the PostgreSQL receiver collects database-specific metrics like locks, deadlocks, and sequential scans. Using both gives complete visibility in base14 Scout."}},{"@type":"Question","name":"How do I collect RDS PostgreSQL logs with OpenTelemetry?","acceptedAnswer":{"@type":"Answer","text":"Use the AWS CloudWatch Logs receiver in the OpenTelemetry Collector, specifying your RDS log group names. 
The collector polls CloudWatch Logs and forwards them to base14 Scout."}},{"@type":"Question","name":"How do I filter RDS metrics in CloudWatch Metrics Stream?","acceptedAnswer":{"@type":"Answer","text":"When configuring the Metrics Stream, select specific namespaces and choose only AWS/RDS instead of all namespaces to collect only RDS metrics, reducing costs and data volume."}}]} + {"@context":"https://schema.org","@type":"FAQPage","mainEntity":[{"@type":"Question","name":"How do I monitor AWS RDS PostgreSQL with OpenTelemetry?","acceptedAnswer":{"@type":"Answer","text":"Use CloudWatch Metrics Stream for infrastructure metrics (CPU, memory, disk I/O, connections) and the OpenTelemetry PostgreSQL receiver for database-specific metrics like locks, deadlocks, and sequential scans. Both feed into a single observability platform like base14 Scout."}},{"@type":"Question","name":"What RDS metrics does CloudWatch Metrics Stream collect?","acceptedAnswer":{"@type":"Answer","text":"CloudWatch Metrics Stream delivers AWS/RDS metrics including CPUUtilization, FreeableMemory, ReadIOPS, WriteIOPS, ReadLatency, WriteLatency, DatabaseConnections, ReplicaLag, FreeStorageSpace, and DiskQueueDepth with 2-3 minute latency."}},{"@type":"Question","name":"Do I need both CloudWatch Metrics Stream and the PostgreSQL receiver?","acceptedAnswer":{"@type":"Answer","text":"Yes. CloudWatch provides infrastructure-level RDS metrics (CPU, memory, IOPS) while the PostgreSQL receiver collects database-specific metrics like locks, deadlocks, sequential scans, and tuple operations. Using both gives complete visibility."}},{"@type":"Question","name":"How do I collect RDS PostgreSQL logs with OpenTelemetry?","acceptedAnswer":{"@type":"Answer","text":"Use the AWS CloudWatch Logs receiver in the OpenTelemetry Collector, specifying your RDS log group names. 
The collector polls CloudWatch Logs and forwards them to your observability backend."}},{"@type":"Question","name":"How do I monitor RDS PostgreSQL query performance?","acceptedAnswer":{"@type":"Answer","text":"Enable Performance Insights on your RDS instance and publish the metrics to CloudWatch. Performance Insights provides per-query statistics including wait events, top SQL by load, and active session history. For deeper query monitoring, use the PostgreSQL pg_stat_statements extension with the OTel PostgreSQL receiver."}},{"@type":"Question","name":"What is the difference between CloudWatch metrics and Enhanced Monitoring for RDS?","acceptedAnswer":{"@type":"Answer","text":"CloudWatch metrics are collected at 1-minute intervals and cover instance-level stats like CPU, memory, and IOPS. Enhanced Monitoring provides OS-level metrics at up to 1-second granularity, including per-process CPU, memory usage, and file system details. Enhanced Monitoring is useful for diagnosing issues that 1-minute CloudWatch intervals miss."}},{"@type":"Question","name":"How do I set up alerts for RDS PostgreSQL?","acceptedAnswer":{"@type":"Answer","text":"Route RDS metrics through CloudWatch Metrics Stream to base14 Scout, then configure alerts in Scout on key thresholds: CPU above 80%, connections above 80% of max, replication lag exceeding your SLA, storage below 20% free, and read/write latency spikes."}}]} --- ## Overview -This guide will walk you through collecting rich telemetry data from your RDS -postgres instance using CloudWatch Metrics Stream. We recommend using CloudWatch -Metrics Stream over Prometheus exporters as it provides faster metric delivery -(2-3 minute latency) and is more efficient for AWS services. +This guide covers monitoring AWS RDS PostgreSQL instances using +OpenTelemetry and CloudWatch Metrics Stream. 
You'll collect +infrastructure metrics from CloudWatch, database-specific metrics from +the PostgreSQL receiver, and logs from CloudWatch Logs — all flowing +into base14 Scout for unified visibility. -## Collecting RDS Postgres Metrics +## What You'll Monitor -For collecting RDS metrics, we recommend using **CloudWatch Metrics Stream** -instead of Prometheus exporters. CloudWatch Metrics Stream provides: +RDS PostgreSQL monitoring combines two metric sources that together +provide complete visibility: -- **Faster delivery**: 2-3 minute latency vs 5+ minutes with polling -- **Lower cost**: No need to run dedicated exporters -- **Better scalability**: Native AWS service integration -- **Automatic metric discovery**: No need to manually configure metric lists +**CloudWatch Metrics Stream (infrastructure):** -### Step 1: Set up CloudWatch Metrics Stream +| Metric | What it tells you | +| ------ | ----------------- | +| `CPUUtilization` | Instance CPU usage (%) | +| `FreeableMemory` | Available RAM (bytes) | +| `FreeStorageSpace` | Remaining disk space (bytes) | +| `ReadIOPS` / `WriteIOPS` | Disk read/write operations per second | +| `ReadLatency` / `WriteLatency` | Average time per disk I/O operation | +| `DatabaseConnections` | Active database connections | +| `ReplicaLag` | Replication delay for read replicas (seconds) | +| `DiskQueueDepth` | Number of I/O requests waiting | +| `NetworkReceiveThroughput` / `NetworkTransmitThroughput` | Network bytes in/out | +| `SwapUsage` | Swap space used (bytes) | +| `BurstBalance` | Remaining I/O burst credits (gp2/gp3) | + +**OTel PostgreSQL receiver (database internals):** + +| Metric | What it tells you | +| ------ | ----------------- | +| `postgresql.backends` | Active connections per database | +| `postgresql.commits` / `postgresql.rollbacks` | Transaction rates | +| `postgresql.database.locks` | Active locks by type | +| `postgresql.deadlocks` | Deadlock count | +| `postgresql.sequential_scans` / `postgresql.index.scans` | 
Scan type distribution | +| `postgresql.rows` | Rows affected by operations | +| `postgresql.table.size` / `postgresql.index.size` | Storage per table/index | +| `postgresql.table.vacuum.count` | Vacuum frequency | +| `postgresql.blks_hit` / `postgresql.blks_read` | Buffer cache hit ratio | +| `postgresql.replication.data_delay` | Replication byte lag | +| `postgresql.tup_inserted` / `postgresql.tup_updated` / `postgresql.tup_deleted` | Tuple operations | + +## Prerequisites + +| Requirement | Minimum | Recommended | +| ----------- | ------- | ----------- | +| RDS PostgreSQL | 11 | 14+ | +| OTel Collector Contrib | 0.90.0 | latest | +| base14 Scout | Any | - | +| AWS permissions | CloudWatch, Kinesis Firehose, S3 | - | + +Before starting: + +- RDS instance must be accessible from the host running the OTel + Collector (same VPC or VPC peering) +- A monitoring user with `pg_monitor` role for the PostgreSQL receiver +- CloudWatch Metrics Stream infrastructure set up (see Step 1) + +## Step 1: Set up CloudWatch Metrics Stream Follow our comprehensive -[CloudWatch Metrics Stream guide](cloudwatch-metrics-stream.md) to set up the -infrastructure. +[CloudWatch Metrics Stream guide](cloudwatch-metrics-stream.md) to set +up the streaming infrastructure (S3 bucket, Kinesis Firehose, Metrics +Stream). -### Step 2: Configure RDS metrics filtering +When configuring the Metrics Stream: -When configuring your CloudWatch Metrics Stream in **Step 3** of the setup -guide, make sure to: +1. Select **specific namespaces** instead of "All namespaces" +2. Choose **AWS/RDS** from the namespace list +3. This ensures you only collect RDS metrics, reducing costs and data + volume -1. **Select specific namespaces** instead of "All namespaces" -2. **Choose only AWS/RDS** from the namespace list -3. 
This ensures you only collect RDS metrics, reducing costs and data volume

+## Step 2: Create a monitoring user on RDS

-### Step 3: Create OTEL Collector config for PostgreSQL metrics
+Connect to your RDS PostgreSQL instance and create a dedicated
+monitoring user:

-For database-specific metrics (like connection counts, query performance),
-create `postgres-metrics-collection-config.yaml`:
+```sql
+CREATE USER otel_monitor WITH PASSWORD '<strong-password>';
+GRANT pg_monitor TO otel_monitor;
+```

-```yaml
+The `pg_monitor` role provides read-only access to all statistics
+views needed for monitoring. No write permissions required.
+
+For RDS instances, ensure the security group allows connections from
+the Collector host on port 5432.
+
+## Step 3: Configure the OTel Collector for PostgreSQL metrics
+
+Create `rds-postgres-config.yaml` with both the PostgreSQL receiver
+and the CloudWatch metrics pipeline:
+
+```yaml showLineNumbers title="rds-postgres-config.yaml"
 receivers:
   postgresql:
-    endpoint: ${env:POSTGRESQL_ENDPOINT}
+    endpoint: ${env:RDS_ENDPOINT}
     collection_interval: 10s
-    username: ${env:POSTGRESQL_USERNAME}
-    password: ${env:POSTGRESQL_PASSWORD}
-    databases: ["pgtestdb"]
+    username: ${env:RDS_MONITOR_USER}
+    password: ${env:RDS_MONITOR_PASSWORD}
+    databases: ["${env:RDS_DATABASE}"]
     tls:
       insecure_skip_verify: true
+
     metrics:
       postgresql.database.locks:
         enabled: true
@@ -76,84 +139,312 @@ receivers:
         enabled: true
       postgresql.sequential_scans:
         enabled: true
+      postgresql.index.scans:
+        enabled: true
+      postgresql.backends:
+        enabled: true
+      postgresql.commits:
+        enabled: true
+      postgresql.rollbacks:
+        enabled: true
+      postgresql.db_size:
+        enabled: true
+      postgresql.table.count:
+        enabled: true
+      postgresql.table.size:
+        enabled: true
+      postgresql.index.size:
+        enabled: true
+      postgresql.table.vacuum.count:
+        enabled: true
+      postgresql.rows:
+        enabled: true
+      postgresql.blks_hit:
+        enabled: true
+      postgresql.blks_read:
+        enabled: true
+      postgresql.tup_inserted:
+        enabled: true
+ postgresql.tup_updated: + enabled: true + postgresql.tup_deleted: + enabled: true + postgresql.tup_fetched: + enabled: true + postgresql.replication.data_delay: + enabled: true + +processors: + resource: + attributes: + - key: environment + value: ${env:ENVIRONMENT} + action: upsert + - key: service.name + value: ${env:SERVICE_NAME} + action: upsert + - key: cloud.provider + value: aws + action: upsert + + batch: + timeout: 10s + send_batch_size: 1024 exporters: - otlp: - endpoint: ":4317" + otlphttp/b14: + endpoint: ${env:OTEL_EXPORTER_OTLP_ENDPOINT} tls: - insecure: true + insecure_skip_verify: true service: pipelines: - metrics/postgresql: + metrics: receivers: [postgresql] - exporters: [otlp] + processors: [batch, resource] + exporters: [otlphttp/b14] ``` -> **Note**: CloudWatch Metrics Stream will automatically deliver AWS/RDS metrics -> (CPU, memory, disk I/O, etc.), while the PostgreSQL receiver collects -> database-specific metrics. +### Environment variables -## Collecting RDS Logs +```bash showLineNumbers title=".env" +RDS_ENDPOINT=your-rds-instance.xxxxx.us-east-1.rds.amazonaws.com:5432 +RDS_MONITOR_USER=otel_monitor +RDS_MONITOR_PASSWORD=your_password +RDS_DATABASE=your_database +ENVIRONMENT=production +SERVICE_NAME=rds-postgres +OTEL_EXPORTER_OTLP_ENDPOINT=https://.base14.io +``` -The log collection of RDS instance requires specifying the list of log group -names. From the AWS CloudWatch console, please find the log group(s) relevant to -the integration. +> **Note**: CloudWatch Metrics Stream delivers the infrastructure +> metrics (CPU, memory, IOPS) automatically. The PostgreSQL receiver +> above collects the database-internal metrics. Together they give +> you the full picture. -### Create the Collector config file +## Step 4: Collect RDS PostgreSQL logs -```yaml +RDS PostgreSQL publishes logs to CloudWatch Log Groups. 
Use the +CloudWatch Logs receiver to forward them: + +```yaml showLineNumbers title="rds-postgres-logs-config.yaml" receivers: - awscloudwatch/rds_postgres_logs: - region: us-east-1 + awscloudwatchlogs/rds_postgres: + region: ${env:AWS_REGION} logs: poll_interval: 1m groups: named: - # replace with your RDS log group name - /aws/rds/: + # Replace with your RDS log group name + /aws/rds/instance/${env:RDS_INSTANCE_ID}/postgresql: processors: - attributes/add_source_postgres: + attributes/add_source: actions: - key: source value: "rds_postgres" action: insert + - key: cloud.provider + value: "aws" + action: insert + batch: send_batch_size: 10000 send_batch_max_size: 11000 timeout: 10s exporters: - otlp: - endpoint: ":4317" + otlphttp/b14: + endpoint: ${env:OTEL_EXPORTER_OTLP_ENDPOINT} tls: - insecure: false + insecure_skip_verify: true service: pipelines: - logs/postgres: - receivers: [awscloudwatch/rds_postgres_logs] - processors: [attributes/add_source_postgres, batch] - exporters: [otlp] + logs/rds: + receivers: [awscloudwatchlogs/rds_postgres] + processors: [attributes/add_source, batch] + exporters: [otlphttp/b14] ``` -After deploying these changes, generate some traffic to your database and check -the Postgres section in Scout to see your databases's metrics and logs. 
+### Enable recommended RDS log types ---- +In the RDS console under **Configuration > Log exports**, enable: + +- **PostgreSQL log** — query errors, connection events, autovacuum +- **Upgrade log** — major version upgrade details + +For query-level logging, set these RDS parameter group values: + +```text +log_statement = 'ddl' +log_min_duration_statement = 1000 # Log queries over 1 second +log_connections = on +log_disconnections = on +``` + +## Step 5: Enable Performance Insights (optional) + +RDS Performance Insights provides query-level monitoring that +CloudWatch and the PostgreSQL receiver don't cover: + +- **Top SQL by load** — which queries consume the most CPU and I/O +- **Wait events** — what queries are waiting on (CPU, I/O, lock, + network) +- **Active session history** — per-second breakdown of database load + +To enable: + +1. In the RDS console, modify your instance +2. Under **Performance Insights**, enable it +3. Choose a retention period (free tier: 7 days, paid: up to 2 years) +4. Optionally publish Performance Insights metrics to CloudWatch + +Performance Insights data flows through CloudWatch Metrics Stream +alongside your other RDS metrics. + +For deeper query-level monitoring beyond Performance Insights, see +[PostgreSQL Advanced Monitoring](../../component/postgres-advanced.md) +which covers `pg_stat_statements` and per-table I/O. + +## Verify the setup + +Start the Collector and check for metrics within 60 seconds: + +```bash showLineNumbers +# Test PostgreSQL connectivity from the Collector host +psql -h ${RDS_ENDPOINT%:*} -p 5432 -U otel_monitor \ + -d ${RDS_DATABASE} -c "SELECT version();" +``` + +```sql showLineNumbers +-- Verify monitoring permissions +SELECT * FROM pg_stat_database WHERE datname = 'your_database'; +SELECT * FROM pg_stat_user_tables LIMIT 5; +``` + +Check Scout for both CloudWatch metrics (prefixed `aws.rds.*`) and +PostgreSQL metrics (prefixed `postgresql.*`). 
+ +## Key alerts to configure + +Once metrics are flowing, set up alerts on these thresholds: + +| Metric | Warning | Critical | Why | +| ------ | ------- | -------- | --- | +| `CPUUtilization` | > 70% | > 85% | Sustained high CPU degrades query performance | +| `DatabaseConnections` | > 80% of max | > 90% of max | Connection exhaustion causes application errors | +| `FreeStorageSpace` | < 20% | < 10% | Running out of storage crashes the instance | +| `ReplicaLag` | > 10s | > 60s | High lag means read replicas serve stale data | +| `ReadLatency` / `WriteLatency` | > 10ms | > 20ms | I/O latency spikes indicate storage bottlenecks | +| `DiskQueueDepth` | > 10 | > 20 | Deep queue means I/O is saturated | +| `postgresql.deadlocks` | > 0 | > 5/min | Deadlocks indicate application-level locking issues | +| Buffer hit ratio | < 95% | < 90% | Low hit ratio means too many disk reads | + +Buffer hit ratio: calculate as +`blks_hit / (blks_hit + blks_read) * 100`. + +## Troubleshooting + +### PostgreSQL receiver shows no metrics + +**Cause**: Collector can't reach the RDS instance. + +**Fix**: + +1. Verify the RDS instance security group allows inbound on port 5432 + from the Collector's IP or security group +2. Confirm the RDS instance is not in a private subnet without a route + to the Collector +3. Test connectivity: `psql -h -U otel_monitor -d ` +4. Check the monitoring user has `pg_monitor` role: + `SELECT rolname FROM pg_roles WHERE pg_has_role('otel_monitor', oid, 'member');` + +### CloudWatch metrics not appearing + +**Cause**: Metrics Stream not configured for the AWS/RDS namespace. + +**Fix**: + +1. In CloudWatch > Metrics > Streams, verify the stream is active +2. Check that the namespace filter includes `AWS/RDS` +3. Verify Kinesis Firehose delivery is succeeding (check the S3 + error bucket) +4. 
Allow 5-10 minutes for initial metrics to flow + +### Replication lag metrics showing zero + +**Cause**: No read replicas configured, or the instance is a replica +(not the primary). + +**Fix**: + +1. `ReplicaLag` is only populated on read replica instances +2. `postgresql.replication.data_delay` requires at least one replica + connected to the primary +3. On the primary, check: `SELECT * FROM pg_stat_replication;` + +### High connection count but low CPU + +**Cause**: Idle connections consuming connection slots. + +**Fix**: + +1. Check for idle connections: + `SELECT count(*) FROM pg_stat_activity WHERE state = 'idle';` +2. Consider connection pooling (PgBouncer or RDS Proxy) +3. Set `idle_in_transaction_session_timeout` in the parameter group + +## FAQ + +**How do I monitor RDS PostgreSQL query performance?** + +Enable Performance Insights on the RDS instance for top SQL by load +and wait event analysis. For per-query statistics, enable +`pg_stat_statements` and use the +[PostgreSQL Advanced guide](../../component/postgres-advanced.md). + +**What's the difference between CloudWatch and Enhanced Monitoring?** + +CloudWatch metrics are collected at 1-minute intervals and cover +instance-level stats. Enhanced Monitoring provides OS-level metrics at +up to 1-second granularity (per-process CPU, memory, file system). +Enable Enhanced Monitoring when you need to diagnose issues that +1-minute intervals miss. + +**Can I monitor multiple RDS instances with one Collector?** + +Yes. Add multiple PostgreSQL receiver blocks with distinct names: + +```yaml +receivers: + postgresql/primary: + endpoint: primary.xxxxx.rds.amazonaws.com:5432 + postgresql/replica: + endpoint: replica.xxxxx.rds.amazonaws.com:5432 +``` + +Then include both in the pipeline: +`receivers: [postgresql/primary, postgresql/replica]`. + +**How do I filter which CloudWatch metrics are streamed?** -With this setup, your RDS instance becomes fully observable through Scout. 
-You'll gain real-time visibility into performance metrics and logs without any -changes to your application code. +When configuring the Metrics Stream, select specific namespaces and +choose only `AWS/RDS` instead of all namespaces. This reduces costs +and data volume. ## Related Guides -- [CloudWatch Metrics Stream Setup](./cloudwatch-metrics-stream.md) - Configure - AWS metrics streaming -- [ELB Monitoring](./elb.md) - Monitor AWS Application Load Balancers -- [ElastiCache Monitoring](./elasticache.md) - Monitor Redis and Memcached -- [AWS ECS/Fargate Setup](../../collector-setup/ecs-setup.md) - Deploy Scout - Collector on AWS ECS -- [Monitor PostgreSQL Component](../../component/postgres.md) - Direct - PostgreSQL monitoring +- [CloudWatch Metrics Stream Setup](./cloudwatch-metrics-stream.md) — + Configure AWS metrics streaming +- [PostgreSQL Basic Monitoring](../../component/postgres.md) — Direct + PostgreSQL monitoring with the OTel receiver +- [PostgreSQL Advanced Monitoring](../../component/postgres-advanced.md) + — Query statistics, per-table I/O, replication details +- [pgX Deep PostgreSQL Analysis](https://base14.io/scout/pgx) — + Correlate query performance with application traces +- [ELB Monitoring](./elb.md) — Monitor AWS Application Load Balancers +- [ElastiCache Monitoring](./elasticache.md) — Monitor Redis and + Memcached on AWS +- [AWS ECS/Fargate Setup](../../collector-setup/ecs-setup.md) — Deploy + the Collector on AWS ECS From db38201296472e516d6ae1dae9fce4a30bb330bf Mon Sep 17 00:00:00 2001 From: nimishgj Date: Tue, 31 Mar 2026 11:54:08 +0530 Subject: [PATCH 2/2] remove rds performance insights step --- docs/instrument/infra/aws/rds.md | 47 ++++++-------------------------- 1 file changed, 9 insertions(+), 38 deletions(-) diff --git a/docs/instrument/infra/aws/rds.md b/docs/instrument/infra/aws/rds.md index f097efd..5491f3e 100644 --- a/docs/instrument/infra/aws/rds.md +++ b/docs/instrument/infra/aws/rds.md @@ -5,8 +5,8 @@ title: AWS RDS 
PostgreSQL Monitoring with OpenTelemetry - Metrics, Logs & Alerts sidebar_label: AWS RDS description: Monitor AWS RDS PostgreSQL with OpenTelemetry and CloudWatch Metrics - Stream. Collect connections, replication lag, IOPS, query performance, - and Performance Insights data in base14 Scout. + Stream. Collect connections, replication lag, IOPS, and query + performance data in base14 Scout. keywords: - aws rds monitoring - rds postgresql monitoring @@ -15,14 +15,13 @@ keywords: - rds postgres observability - cloudwatch metrics stream - aws database monitoring - - rds performance insights - aws rds postgresql observability - rds postgres dashboard head: - - script - type: application/ld+json - | - {"@context":"https://schema.org","@type":"FAQPage","mainEntity":[{"@type":"Question","name":"How do I monitor AWS RDS PostgreSQL with OpenTelemetry?","acceptedAnswer":{"@type":"Answer","text":"Use CloudWatch Metrics Stream for infrastructure metrics (CPU, memory, disk I/O, connections) and the OpenTelemetry PostgreSQL receiver for database-specific metrics like locks, deadlocks, and sequential scans. Both feed into a single observability platform like base14 Scout."}},{"@type":"Question","name":"What RDS metrics does CloudWatch Metrics Stream collect?","acceptedAnswer":{"@type":"Answer","text":"CloudWatch Metrics Stream delivers AWS/RDS metrics including CPUUtilization, FreeableMemory, ReadIOPS, WriteIOPS, ReadLatency, WriteLatency, DatabaseConnections, ReplicaLag, FreeStorageSpace, and DiskQueueDepth with 2-3 minute latency."}},{"@type":"Question","name":"Do I need both CloudWatch Metrics Stream and the PostgreSQL receiver?","acceptedAnswer":{"@type":"Answer","text":"Yes. CloudWatch provides infrastructure-level RDS metrics (CPU, memory, IOPS) while the PostgreSQL receiver collects database-specific metrics like locks, deadlocks, sequential scans, and tuple operations. 
Using both gives complete visibility."}},{"@type":"Question","name":"How do I collect RDS PostgreSQL logs with OpenTelemetry?","acceptedAnswer":{"@type":"Answer","text":"Use the AWS CloudWatch Logs receiver in the OpenTelemetry Collector, specifying your RDS log group names. The collector polls CloudWatch Logs and forwards them to your observability backend."}},{"@type":"Question","name":"How do I monitor RDS PostgreSQL query performance?","acceptedAnswer":{"@type":"Answer","text":"Enable Performance Insights on your RDS instance and publish the metrics to CloudWatch. Performance Insights provides per-query statistics including wait events, top SQL by load, and active session history. For deeper query monitoring, use the PostgreSQL pg_stat_statements extension with the OTel PostgreSQL receiver."}},{"@type":"Question","name":"What is the difference between CloudWatch metrics and Enhanced Monitoring for RDS?","acceptedAnswer":{"@type":"Answer","text":"CloudWatch metrics are collected at 1-minute intervals and cover instance-level stats like CPU, memory, and IOPS. Enhanced Monitoring provides OS-level metrics at up to 1-second granularity, including per-process CPU, memory usage, and file system details. 
Enhanced Monitoring is useful for diagnosing issues that 1-minute CloudWatch intervals miss."}},{"@type":"Question","name":"How do I set up alerts for RDS PostgreSQL?","acceptedAnswer":{"@type":"Answer","text":"Route RDS metrics through CloudWatch Metrics Stream to base14 Scout, then configure alerts in Scout on key thresholds: CPU above 80%, connections above 80% of max, replication lag exceeding your SLA, storage below 20% free, and read/write latency spikes."}}]} + {"@context":"https://schema.org","@type":"FAQPage","mainEntity":[{"@type":"Question","name":"How do I monitor AWS RDS PostgreSQL with OpenTelemetry?","acceptedAnswer":{"@type":"Answer","text":"Use CloudWatch Metrics Stream for infrastructure metrics (CPU, memory, disk I/O, connections) and the OpenTelemetry PostgreSQL receiver for database-specific metrics like locks, deadlocks, and sequential scans. Both feed into a single observability platform like base14 Scout."}},{"@type":"Question","name":"What RDS metrics does CloudWatch Metrics Stream collect?","acceptedAnswer":{"@type":"Answer","text":"CloudWatch Metrics Stream delivers AWS/RDS metrics including CPUUtilization, FreeableMemory, ReadIOPS, WriteIOPS, ReadLatency, WriteLatency, DatabaseConnections, ReplicaLag, FreeStorageSpace, and DiskQueueDepth with 2-3 minute latency."}},{"@type":"Question","name":"Do I need both CloudWatch Metrics Stream and the PostgreSQL receiver?","acceptedAnswer":{"@type":"Answer","text":"Yes. CloudWatch provides infrastructure-level RDS metrics (CPU, memory, IOPS) while the PostgreSQL receiver collects database-specific metrics like locks, deadlocks, sequential scans, and tuple operations. Using both gives complete visibility."}},{"@type":"Question","name":"How do I collect RDS PostgreSQL logs with OpenTelemetry?","acceptedAnswer":{"@type":"Answer","text":"Use the AWS CloudWatch Logs receiver in the OpenTelemetry Collector, specifying your RDS log group names. 
The collector polls CloudWatch Logs and forwards them to your observability backend."}},{"@type":"Question","name":"How do I monitor RDS PostgreSQL query performance?","acceptedAnswer":{"@type":"Answer","text":"Enable the PostgreSQL pg_stat_statements extension and use the OTel PostgreSQL receiver to collect per-query statistics including execution counts, total time, and rows returned."}},{"@type":"Question","name":"What is the difference between CloudWatch metrics and Enhanced Monitoring for RDS?","acceptedAnswer":{"@type":"Answer","text":"CloudWatch metrics are collected at 1-minute intervals and cover instance-level stats like CPU, memory, and IOPS. Enhanced Monitoring provides OS-level metrics at up to 1-second granularity, including per-process CPU, memory usage, and file system details. Enhanced Monitoring is useful for diagnosing issues that 1-minute CloudWatch intervals miss."}},{"@type":"Question","name":"How do I set up alerts for RDS PostgreSQL?","acceptedAnswer":{"@type":"Answer","text":"Route RDS metrics through CloudWatch Metrics Stream to base14 Scout, then configure alerts in Scout on key thresholds: CPU above 80%, connections above 80% of max, replication lag exceeding your SLA, storage below 20% free, and read/write latency spikes."}}]} --- ## Overview @@ -93,12 +92,9 @@ Follow our comprehensive up the streaming infrastructure (S3 bucket, Kinesis Firehose, Metrics Stream). -When configuring the Metrics Stream: - -1. Select **specific namespaces** instead of "All namespaces" -2. Choose **AWS/RDS** from the namespace list -3. This ensures you only collect RDS metrics, reducing costs and data - volume +When configuring the Metrics Stream, select the **AWS/RDS** namespace +instead of "All namespaces" to only collect RDS metrics and reduce +costs. 
## Step 2: Create a monitoring user on RDS @@ -283,30 +279,6 @@ log_connections = on log_disconnections = on ``` -## Step 5: Enable Performance Insights (optional) - -RDS Performance Insights provides query-level monitoring that -CloudWatch and the PostgreSQL receiver don't cover: - -- **Top SQL by load** — which queries consume the most CPU and I/O -- **Wait events** — what queries are waiting on (CPU, I/O, lock, - network) -- **Active session history** — per-second breakdown of database load - -To enable: - -1. In the RDS console, modify your instance -2. Under **Performance Insights**, enable it -3. Choose a retention period (free tier: 7 days, paid: up to 2 years) -4. Optionally publish Performance Insights metrics to CloudWatch - -Performance Insights data flows through CloudWatch Metrics Stream -alongside your other RDS metrics. - -For deeper query-level monitoring beyond Performance Insights, see -[PostgreSQL Advanced Monitoring](../../component/postgres-advanced.md) -which covers `pg_stat_statements` and per-table I/O. - ## Verify the setup Start the Collector and check for metrics within 60 seconds: @@ -399,10 +371,9 @@ Buffer hit ratio: calculate as **How do I monitor RDS PostgreSQL query performance?** -Enable Performance Insights on the RDS instance for top SQL by load -and wait event analysis. For per-query statistics, enable -`pg_stat_statements` and use the -[PostgreSQL Advanced guide](../../component/postgres-advanced.md). +Enable `pg_stat_statements` for per-query statistics and use the +[PostgreSQL Advanced guide](../../component/postgres-advanced.md) +for detailed query-level monitoring. **What's the difference between CloudWatch and Enhanced Monitoring?**