diff --git a/doc/mysql.md b/doc/mysql.md index 0b651ce..eb193e1 100644 --- a/doc/mysql.md +++ b/doc/mysql.md @@ -55,7 +55,7 @@ These params apply in general to all MySQL clusters, unless specified differentl - `User`, `Password`: these can be specified as plaintext, or in a `${some_env_variable}` format, in which case `freno` will look up its environment for specified variable. (e.g. to match the above config, a `shell` script invoking `freno` can `export mysql_password_env_variable=flyingcircus`) - `MetricQuery`: - Note: returned value is expected to be `[0..)` (`0` or more), where lower values are "better" and higher values are "worse". - - if not provided, `freno` will assume you're interested in replication lag, and will issue a `SHOW SLAVE STATUS` to extract `Seconds_behind_master` + - if not provided, `freno` will assume you're interested in replication lag, and will issue `SHOW REPLICA STATUS` (MySQL 8.0.22+) or fall back to `SHOW SLAVE STATUS` on older versions, extracting `Seconds_Behind_Source` or `Seconds_Behind_Master`, respectively - We strongly recommend using a custom heartbeat mechanism such as `pt-heartbeat`, with subsecond resolution. The sample query above works well with `pt-heartbeat` subsecond timestamps. - Strictly speaking, you don't have to provide a replication-lag metric. This could be any query that reports any metric. However you're likely interested in replication lag to start with. 
- Note: the default time unit for replication lag is _seconds_ diff --git a/pkg/mysql/mysql_throttle_metric.go b/pkg/mysql/mysql_throttle_metric.go index a67031e..679d63c 100644 --- a/pkg/mysql/mysql_throttle_metric.go +++ b/pkg/mysql/mysql_throttle_metric.go @@ -6,10 +6,12 @@ package mysql import ( + "errors" "fmt" "strings" "time" + "github.com/go-sql-driver/mysql" "github.com/outbrain/golib/sqlutils" "github.com/patrickmn/go-cache" metrics "github.com/rcrowley/go-metrics" @@ -66,7 +68,7 @@ func (metric *MySQLThrottleMetric) Get() (float64, error) { } // ReadThrottleMetric returns replication lag for a given connection config; either by explicit query -// or via SHOW SLAVE STATUS +// or via SHOW REPLICA STATUS / SHOW SLAVE STATUS func ReadThrottleMetric(probe *Probe, clusterName string) (mySQLThrottleMetric *MySQLThrottleMetric) { if mySQLThrottleMetric := getCachedMySQLThrottleMetric(probe); mySQLThrottleMetric != nil { return mySQLThrottleMetric @@ -115,17 +117,46 @@ func ReadThrottleMetric(probe *Probe, clusterName string) (mySQLThrottleMetric * return mySQLThrottleMetric } - // No metric query? By default we look at replication lag as output of SHOW SLAVE STATUS + // No metric query? By default we look at replication lag. + // Try SHOW REPLICA STATUS first (MySQL 8.0.22+, required in 8.4+), fall back to + // SHOW SLAVE STATUS for older MySQL versions that don't recognise the new syntax. 
- mySQLThrottleMetric.Err = sqlutils.QueryRowsMap(db, `show slave status`, func(m sqlutils.RowMap) error { - slaveIORunning := m.GetString("Slave_IO_Running") - slaveSQLRunning := m.GetString("Slave_SQL_Running") - secondsBehindMaster := m.GetNullInt64("Seconds_Behind_Master") - if !secondsBehindMaster.Valid { - return fmt.Errorf("replication not running; Slave_IO_Running=%+v, Slave_SQL_Running=%+v", slaveIORunning, slaveSQLRunning) + mySQLThrottleMetric.Err = sqlutils.QueryRowsMap(db, `show replica status`, func(m sqlutils.RowMap) error { + replicaIORunning := m.GetString("Replica_IO_Running") + replicaSQLRunning := m.GetString("Replica_SQL_Running") + secondsBehindSource := m.GetNullInt64("Seconds_Behind_Source") + if !secondsBehindSource.Valid { + return fmt.Errorf("replication not running; Replica_IO_Running=%+v, Replica_SQL_Running=%+v", replicaIORunning, replicaSQLRunning) } - mySQLThrottleMetric.Value = float64(secondsBehindMaster.Int64) + mySQLThrottleMetric.Value = float64(secondsBehindSource.Int64) return nil }) + + // MySQL error 1064 means syntax error — the server doesn't understand SHOW REPLICA STATUS + // (MySQL < 8.0.22). Fall back to the legacy SHOW SLAVE STATUS command. 
+ if mySQLThrottleMetric.Err != nil { + var mysqlErr *mysql.MySQLError + if errors.As(mySQLThrottleMetric.Err, &mysqlErr) && mysqlErr.Number == 1064 { + originalErr := mySQLThrottleMetric.Err + fallbackErr := sqlutils.QueryRowsMap(db, `show slave status`, func(m sqlutils.RowMap) error { + slaveIORunning := m.GetString("Slave_IO_Running") + slaveSQLRunning := m.GetString("Slave_SQL_Running") + secondsBehindMaster := m.GetNullInt64("Seconds_Behind_Master") + if !secondsBehindMaster.Valid { + return fmt.Errorf("replication not running; Slave_IO_Running=%+v, Slave_SQL_Running=%+v", slaveIORunning, slaveSQLRunning) + } + mySQLThrottleMetric.Value = float64(secondsBehindMaster.Int64) + return nil + }) + if fallbackErr == nil { + mySQLThrottleMetric.Err = nil + } else { + // Both commands failed; surface the original error as it's more informative. + mySQLThrottleMetric.Err = originalErr + } + } + // Non-syntax errors (permissions, connectivity, replication issues) are kept as-is. + } + return cacheMySQLThrottleMetric(probe, mySQLThrottleMetric) }