diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c index e856cd35a6f0f..4be81489ff4fb 100644 --- a/contrib/auto_explain/auto_explain.c +++ b/contrib/auto_explain/auto_explain.c @@ -305,19 +305,9 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags) if (auto_explain_enabled()) { - /* - * Set up to track total elapsed time in ExecutorRun. Make sure the - * space is allocated in the per-query context so it will go away at - * ExecutorEnd. - */ + /* Set up to track total elapsed time in ExecutorRun. */ if (queryDesc->totaltime == NULL) - { - MemoryContext oldcxt; - - oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false); - MemoryContextSwitchTo(oldcxt); - } + queryDesc->totaltime = InstrQueryAlloc(INSTRUMENT_ALL); } } @@ -381,14 +371,8 @@ explain_ExecutorEnd(QueryDesc *queryDesc) */ oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - /* - * Make sure stats accumulation is done. (Note: it's okay if several - * levels of hook all do this.) - */ - InstrEndLoop(queryDesc->totaltime); - /* Log plan if duration is exceeded. */ - msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total); + msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->instr.total); if (msec >= auto_explain_log_min_duration) { ExplainState *es = NewExplainState(); diff --git a/contrib/meson.build b/contrib/meson.build index def13257cbead..cab1b21167895 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -50,6 +50,7 @@ subdir('pg_logicalinspect') subdir('pg_overexplain') subdir('pg_prewarm') subdir('pgrowlocks') +subdir('pg_session_buffer_usage') subdir('pg_stat_statements') subdir('pgstattuple') subdir('pg_surgery') diff --git a/contrib/pg_session_buffer_usage/Makefile b/contrib/pg_session_buffer_usage/Makefile new file mode 100644 index 0000000000000..75bd8e09b3d3d --- /dev/null +++ b/contrib/pg_session_buffer_usage/Makefile @@ -0,0 +1,23 @@ +# contrib/pg_session_buffer_usage/Makefile + +MODULE_big = pg_session_buffer_usage +OBJS = \ + $(WIN32RES) \ + pg_session_buffer_usage.o + +EXTENSION = pg_session_buffer_usage +DATA = pg_session_buffer_usage--1.0.sql +PGFILEDESC = "pg_session_buffer_usage - show buffer usage statistics for the current session" + +REGRESS = pg_session_buffer_usage + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_session_buffer_usage +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out b/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out new file mode 100644 index 0000000000000..242b400395043 --- /dev/null +++ b/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out @@ -0,0 +1,283 @@ +LOAD 'pg_session_buffer_usage'; +CREATE EXTENSION pg_session_buffer_usage; +-- Verify all columns are non-negative +SELECT count(*) = 1 AS ok FROM pg_session_buffer_usage() +WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0 + AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0 + AND local_blks_hit >= 0 AND local_blks_read >= 0 + AND local_blks_dirtied >= 0 AND local_blks_written >= 0 + AND temp_blks_read >= 0 AND temp_blks_written >= 0 + AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0 + AND local_blk_read_time >= 0 AND local_blk_write_time >= 0 + AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0; + ok +---- + t +(1 row) + +-- Verify counters increase after buffer activity +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +CREATE TEMP TABLE test_buf_activity (id int, data text); +INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i; +SELECT count(*) FROM test_buf_activity; + count +------- + 1000 +(1 row) + +SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased +FROM pg_session_buffer_usage(); + blocks_increased +------------------ + t +(1 row) + +DROP TABLE test_buf_activity; +-- Parallel query test +CREATE TABLE par_dc_tab (a int, b char(200)); +INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; +SELECT count(*) FROM par_dc_tab; + count +------- + 5000 +(1 row) + +-- Measure serial scan delta (leader does all the work) +SET max_parallel_workers_per_gather = 0; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT count(*) FROM par_dc_tab; + count +------- + 5000 +(1 row) + +CREATE TEMP TABLE dc_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); +-- Measure parallel scan delta with leader NOT participating in scanning. +-- Workers do all table scanning; leader only runs the Gather node. +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT count(*) FROM par_dc_tab; + count +------- + 5000 +(1 row) + +-- Confirm we got a similar hit counter through parallel worker accumulation +SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2 + AS leader_buffers_match +FROM pg_session_buffer_usage(), dc_serial_result s; + leader_buffers_match +---------------------- + t +(1 row) + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; +DROP TABLE par_dc_tab, dc_serial_result; +-- +-- Abort/exception tests: verify buffer usage survives various error paths. +-- +-- Rolled-back divide-by-zero under EXPLAIN ANALYZE +CREATE TEMP TABLE exc_tab (a int, b char(20)); +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 FROM ins; +ERROR: division by zero +SELECT local_blks_dirtied > 0 AS exception_buffers_visible +FROM pg_session_buffer_usage(); + exception_buffers_visible +--------------------------- + t +(1 row) + +DROP TABLE exc_tab; +-- Unique constraint violation in regular query +CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20)); +INSERT INTO unique_tab VALUES (1, 'first'); +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +INSERT INTO unique_tab VALUES (1, 'duplicate'); +ERROR: duplicate key value violates unique constraint "unique_tab_a_key" +DETAIL: Key (a)=(1) already exists. +SELECT local_blks_hit > 0 AS unique_violation_buffers_visible +FROM pg_session_buffer_usage(); + unique_violation_buffers_visible +---------------------------------- + t +(1 row) + +DROP TABLE unique_tab; +-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION) +CREATE TEMP TABLE subxact_tab (a int, b char(20)); +CREATE FUNCTION subxact_exc_func() RETURNS text AS $$ +BEGIN + BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a) + SELECT a / 0 FROM ins'; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT subxact_exc_func(); + subxact_exc_func +------------------ + caught +(1 row) + +SELECT local_blks_dirtied > 0 AS subxact_buffers_visible +FROM pg_session_buffer_usage(); + subxact_buffers_visible +------------------------- + t +(1 row) + +DROP FUNCTION subxact_exc_func; +DROP TABLE subxact_tab; +-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking +CREATE TEMP TABLE cursor_tab (a int, b char(200)); +INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i; +CREATE FUNCTION cursor_exc_func() RETURNS text AS $$ +DECLARE + rec record; + cnt int := 0; +BEGIN + BEGIN + FOR rec IN SELECT * FROM cursor_tab LOOP + cnt := cnt + 1; + IF cnt = 250 THEN + PERFORM 1 / 0; + END IF; + END LOOP; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught after ' || cnt || ' rows'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT cursor_exc_func(); + cursor_exc_func +----------------------- + caught after 250 rows +(1 row) + +SELECT local_blks_hit + local_blks_read > 0 + AS cursor_subxact_buffers_visible +FROM pg_session_buffer_usage(); + cursor_subxact_buffers_visible +-------------------------------- + t +(1 row) + +DROP FUNCTION cursor_exc_func; +DROP TABLE cursor_tab; +-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort. +-- +-- When a parallel worker aborts, InstrEndParallelQuery and +-- ExecParallelReportInstrumentation never run, so the worker's buffer +-- activity is never written to shared memory, despite the information having been +-- captured by the res owner release instrumentation handling. +CREATE TABLE par_abort_tab (a int, b char(200)); +INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; +-- Warm shared buffers so all reads become hits +SELECT count(*) FROM par_abort_tab; + count +------- + 5000 +(1 row) + +-- Measure serial scan delta as a reference (leader reads all blocks) +SET max_parallel_workers_per_gather = 0; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; +ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +CREATE TABLE par_abort_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); +-- Now force parallel with leader NOT participating in scanning +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; +SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; +ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +CONTEXT: parallel worker +RESET debug_parallel_query; +-- Workers scanned the table but aborted before reporting stats back. +-- The leader's delta should be much less than a serial scan, documenting +-- that worker buffer activity is lost on abort. +SELECT shared_blks_hit < s.serial_delta / 2 + AS worker_abort_buffers_not_propagated +FROM pg_session_buffer_usage(), par_abort_serial_result s; + worker_abort_buffers_not_propagated +------------------------------------- + t +(1 row) + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; +DROP TABLE par_abort_tab, par_abort_serial_result; +-- Cleanup +DROP EXTENSION pg_session_buffer_usage; diff --git a/contrib/pg_session_buffer_usage/meson.build b/contrib/pg_session_buffer_usage/meson.build new file mode 100644 index 0000000000000..34c7502beb463 --- /dev/null +++ b/contrib/pg_session_buffer_usage/meson.build @@ -0,0 +1,34 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +pg_session_buffer_usage_sources = files( + 'pg_session_buffer_usage.c', +) + +if host_system == 'windows' + pg_session_buffer_usage_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_session_buffer_usage', + '--FILEDESC', 'pg_session_buffer_usage - show buffer usage statistics for the current session',]) +endif + +pg_session_buffer_usage = shared_module('pg_session_buffer_usage', + pg_session_buffer_usage_sources, + kwargs: contrib_mod_args, +) +contrib_targets += pg_session_buffer_usage + +install_data( + 'pg_session_buffer_usage--1.0.sql', + 'pg_session_buffer_usage.control', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'pg_session_buffer_usage', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'pg_session_buffer_usage', + ], + }, +} diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql b/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql new file mode 100644 index 0000000000000..b300fdbc64397 --- /dev/null +++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql @@ -0,0 +1,31 @@ +/* contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_session_buffer_usage" to load this file. \quit + +CREATE FUNCTION pg_session_buffer_usage( + OUT shared_blks_hit bigint, + OUT shared_blks_read bigint, + OUT shared_blks_dirtied bigint, + OUT shared_blks_written bigint, + OUT local_blks_hit bigint, + OUT local_blks_read bigint, + OUT local_blks_dirtied bigint, + OUT local_blks_written bigint, + OUT temp_blks_read bigint, + OUT temp_blks_written bigint, + OUT shared_blk_read_time double precision, + OUT shared_blk_write_time double precision, + OUT local_blk_read_time double precision, + OUT local_blk_write_time double precision, + OUT temp_blk_read_time double precision, + OUT temp_blk_write_time double precision +) +RETURNS record +AS 'MODULE_PATHNAME', 'pg_session_buffer_usage' +LANGUAGE C PARALLEL RESTRICTED; + +CREATE FUNCTION pg_session_buffer_usage_reset() +RETURNS void +AS 'MODULE_PATHNAME', 'pg_session_buffer_usage_reset' +LANGUAGE C PARALLEL RESTRICTED; diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c new file mode 100644 index 0000000000000..f869956b3a9b0 --- /dev/null +++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c @@ -0,0 +1,95 @@ +/*------------------------------------------------------------------------- + * + * pg_session_buffer_usage.c + * show buffer usage statistics for the current session + * + * Copyright (c) 2026, PostgreSQL Global Development Group + * + * contrib/pg_session_buffer_usage/pg_session_buffer_usage.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "executor/instrument.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "utils/memutils.h" + +PG_MODULE_MAGIC_EXT( + .name = "pg_session_buffer_usage", + .version = PG_VERSION +); + +#define NUM_BUFFER_USAGE_COLUMNS 16 + +PG_FUNCTION_INFO_V1(pg_session_buffer_usage); +PG_FUNCTION_INFO_V1(pg_session_buffer_usage_reset); + +#define HAVE_INSTR_STACK 1 /* Change to 0 when testing before stack + * change */ + +/* + * SQL function: pg_session_buffer_usage() + * + * Returns a single row with all BufferUsage counters accumulated since the + * start of the session. Excludes any usage not yet added to the top of the + * stack (e.g. if this gets called inside a statement that also had buffer + * activity). + */ +Datum +pg_session_buffer_usage(PG_FUNCTION_ARGS) +{ + TupleDesc tupdesc; + Datum values[NUM_BUFFER_USAGE_COLUMNS]; + bool nulls[NUM_BUFFER_USAGE_COLUMNS]; + BufferUsage *usage; + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + memset(nulls, 0, sizeof(nulls)); + +#if HAVE_INSTR_STACK + usage = &instr_top.bufusage; +#else + usage = &pgBufferUsage; +#endif + + values[0] = Int64GetDatum(usage->shared_blks_hit); + values[1] = Int64GetDatum(usage->shared_blks_read); + values[2] = Int64GetDatum(usage->shared_blks_dirtied); + values[3] = Int64GetDatum(usage->shared_blks_written); + values[4] = Int64GetDatum(usage->local_blks_hit); + values[5] = Int64GetDatum(usage->local_blks_read); + values[6] = Int64GetDatum(usage->local_blks_dirtied); + values[7] = Int64GetDatum(usage->local_blks_written); + values[8] = Int64GetDatum(usage->temp_blks_read); + values[9] = Int64GetDatum(usage->temp_blks_written); + values[10] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time)); + values[11] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time)); + values[12] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time)); + values[13] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time)); + values[14] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time)); + values[15] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time)); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + +/* + * SQL function: pg_session_buffer_usage_reset() + * + * Resets all BufferUsage counters on the top instrumentation stack to zero. + * Useful in tests to avoid the baseline/delta pattern. + */ +Datum +pg_session_buffer_usage_reset(PG_FUNCTION_ARGS) +{ +#if HAVE_INSTR_STACK + memset(&instr_top.bufusage, 0, sizeof(BufferUsage)); +#else + memset(&pgBufferUsage, 0, sizeof(BufferUsage)); +#endif + + PG_RETURN_VOID(); +} diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control new file mode 100644 index 0000000000000..fabd05ee02458 --- /dev/null +++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control @@ -0,0 +1,5 @@ +# pg_session_buffer_usage extension +comment = 'show buffer usage statistics for the current session' +default_version = '1.0' +module_pathname = '$libdir/pg_session_buffer_usage' +relocatable = true diff --git a/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql b/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql new file mode 100644 index 0000000000000..8f5810fadd337 --- /dev/null +++ b/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql @@ -0,0 +1,204 @@ +LOAD 'pg_session_buffer_usage'; +CREATE EXTENSION pg_session_buffer_usage; + +-- Verify all columns are non-negative +SELECT count(*) = 1 AS ok FROM pg_session_buffer_usage() +WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0 + AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0 + AND local_blks_hit >= 0 AND local_blks_read >= 0 + AND local_blks_dirtied >= 0 AND local_blks_written >= 0 + AND temp_blks_read >= 0 AND temp_blks_written >= 0 + AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0 + AND local_blk_read_time >= 0 AND local_blk_write_time >= 0 + AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0; + +-- Verify counters increase after buffer activity +SELECT pg_session_buffer_usage_reset(); + +CREATE TEMP TABLE test_buf_activity (id int, data text); +INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i; +SELECT count(*) FROM test_buf_activity; + +SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased +FROM pg_session_buffer_usage(); + +DROP TABLE test_buf_activity; + +-- Parallel query test +CREATE TABLE par_dc_tab (a int, b char(200)); +INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; + +SELECT count(*) FROM par_dc_tab; + +-- Measure serial scan delta (leader does all the work) +SET max_parallel_workers_per_gather = 0; + +SELECT pg_session_buffer_usage_reset(); +SELECT count(*) FROM par_dc_tab; + +CREATE TEMP TABLE dc_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); + +-- Measure parallel scan delta with leader NOT participating in scanning. +-- Workers do all table scanning; leader only runs the Gather node. +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; + +SELECT pg_session_buffer_usage_reset(); +SELECT count(*) FROM par_dc_tab; + +-- Confirm we got a similar hit counter through parallel worker accumulation +SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2 + AS leader_buffers_match +FROM pg_session_buffer_usage(), dc_serial_result s; + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; + +DROP TABLE par_dc_tab, dc_serial_result; + +-- +-- Abort/exception tests: verify buffer usage survives various error paths. +-- + +-- Rolled-back divide-by-zero under EXPLAIN ANALYZE +CREATE TEMP TABLE exc_tab (a int, b char(20)); + +SELECT pg_session_buffer_usage_reset(); + +EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 FROM ins; + +SELECT local_blks_dirtied > 0 AS exception_buffers_visible +FROM pg_session_buffer_usage(); + +DROP TABLE exc_tab; + +-- Unique constraint violation in regular query +CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20)); +INSERT INTO unique_tab VALUES (1, 'first'); + +SELECT pg_session_buffer_usage_reset(); +INSERT INTO unique_tab VALUES (1, 'duplicate'); + +SELECT local_blks_hit > 0 AS unique_violation_buffers_visible +FROM pg_session_buffer_usage(); + +DROP TABLE unique_tab; + +-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION) +CREATE TEMP TABLE subxact_tab (a int, b char(20)); + +CREATE FUNCTION subxact_exc_func() RETURNS text AS $$ +BEGIN + BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a) + SELECT a / 0 FROM ins'; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_session_buffer_usage_reset(); +SELECT subxact_exc_func(); + +SELECT local_blks_dirtied > 0 AS subxact_buffers_visible +FROM pg_session_buffer_usage(); + +DROP FUNCTION subxact_exc_func; +DROP TABLE subxact_tab; + +-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking +CREATE TEMP TABLE cursor_tab (a int, b char(200)); +INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i; + +CREATE FUNCTION cursor_exc_func() RETURNS text AS $$ +DECLARE + rec record; + cnt int := 0; +BEGIN + BEGIN + FOR rec IN SELECT * FROM cursor_tab LOOP + cnt := cnt + 1; + IF cnt = 250 THEN + PERFORM 1 / 0; + END IF; + END LOOP; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught after ' || cnt || ' rows'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_session_buffer_usage_reset(); +SELECT cursor_exc_func(); + +SELECT local_blks_hit + local_blks_read > 0 + AS cursor_subxact_buffers_visible +FROM pg_session_buffer_usage(); + +DROP FUNCTION cursor_exc_func; +DROP TABLE cursor_tab; + +-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort. +-- +-- When a parallel worker aborts, InstrEndParallelQuery and +-- ExecParallelReportInstrumentation never run, so the worker's buffer +-- activity is never written to shared memory, despite the information having been +-- captured by the res owner release instrumentation handling. +CREATE TABLE par_abort_tab (a int, b char(200)); +INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; + +-- Warm shared buffers so all reads become hits +SELECT count(*) FROM par_abort_tab; + +-- Measure serial scan delta as a reference (leader reads all blocks) +SET max_parallel_workers_per_gather = 0; + +SELECT pg_session_buffer_usage_reset(); +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; + +CREATE TABLE par_abort_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); + +-- Now force parallel with leader NOT participating in scanning +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; +SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently + +SELECT pg_session_buffer_usage_reset(); +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; + +RESET debug_parallel_query; + +-- Workers scanned the table but aborted before reporting stats back. +-- The leader's delta should be much less than a serial scan, documenting +-- that worker buffer activity is lost on abort. +SELECT shared_blks_hit < s.serial_delta / 2 + AS worker_abort_buffers_not_propagated +FROM pg_session_buffer_usage(), par_abort_serial_result s; + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; + +DROP TABLE par_abort_tab, par_abort_serial_result; + +-- Cleanup +DROP EXTENSION pg_session_buffer_usage; diff --git a/contrib/pg_stat_statements/expected/utility.out b/contrib/pg_stat_statements/expected/utility.out index e4d6564ea5b5a..cba487f6be582 100644 --- a/contrib/pg_stat_statements/expected/utility.out +++ b/contrib/pg_stat_statements/expected/utility.out @@ -289,6 +289,76 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; 1 | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (3 rows) +-- Buffer stats should flow through EXPLAIN ANALYZE +CREATE TEMP TABLE flow_through_test (a int, b char(200)); +INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; +CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$ +DECLARE +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test'; +END; +$$ LANGUAGE plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT run_explain_buffers_test(); + run_explain_buffers_test +-------------------------- + +(1 row) + +-- EXPLAIN entries should have non-zero buffer stats +SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats +FROM pg_stat_statements +WHERE query LIKE 'SELECT run_explain_buffers_test%' +ORDER BY query COLLATE "C"; + query | has_buffer_stats +-----------------------------------+------------------ + SELECT run_explain_buffers_test() | t +(1 row) + +DROP FUNCTION run_explain_buffers_test; +DROP TABLE flow_through_test; +-- Validate buffer/WAL counting during abort +SET pg_stat_statements.track = 'all'; +CREATE TEMP TABLE pgss_call_tab (a int, b char(20)); +CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20)); +INSERT INTO pgss_call_tab VALUES (0, 'zzz'); +CREATE PROCEDURE pgss_call_rollback_proc() AS $$ +DECLARE + v int; +BEGIN + EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN +END; +$$ LANGUAGE plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +CALL pgss_call_rollback_proc(); +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_call_rollback_proc%' +ORDER BY query COLLATE "C"; + query | calls | local_hitread | wal_bytes_generated | wal_records_generated +--------------------------------+-------+---------------+---------------------+----------------------- + CALL pgss_call_rollback_proc() | 1 | t | t | t +(1 row) + +DROP TABLE pgss_call_tab2; +DROP TABLE pgss_call_tab; +DROP PROCEDURE pgss_call_rollback_proc; +SET pg_stat_statements.track = 'top'; -- CALL CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ DECLARE diff --git a/contrib/pg_stat_statements/expected/wal.out b/contrib/pg_stat_statements/expected/wal.out index 977e382d84894..611213daef6c2 100644 --- a/contrib/pg_stat_statements/expected/wal.out +++ b/contrib/pg_stat_statements/expected/wal.out @@ -28,3 +28,51 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t; t (1 row) +-- +-- Validate buffer/WAL counting with caught exception in PL/pgSQL +-- +CREATE TEMP TABLE pgss_error_tab (a int, b char(20)); +INSERT INTO pgss_error_tab VALUES (0, 'zzz'); +CREATE FUNCTION pgss_error_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT pgss_error_func(); + pgss_error_func +----------------- + +(1 row) + +-- Buffer/WAL usage from the wCTE INSERT should survive the exception +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_error_func%' +ORDER BY query COLLATE "C"; + query | calls | local_hitread | wal_bytes_generated | wal_records_generated +--------------------------+-------+---------------+---------------------+----------------------- + SELECT pgss_error_func() | 1 | t | t | t +(1 row) + +DROP TABLE pgss_error_tab; +DROP FUNCTION pgss_error_func; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 4a427533bd88d..8448f9c13faaa 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -909,22 +909,11 @@ pgss_planner(Query *parse, && pgss_track_planning && query_string && parse->queryId != INT64CONST(0)) { - instr_time start; - instr_time duration; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + Instrumentation instr = {0}; - /* We need to track buffer usage as the planner can access them. */ - bufusage_start = pgBufferUsage; - - /* - * Similarly the planner could write some WAL records in some cases - * (e.g. setting a hint bit with those being WAL-logged) - */ - walusage_start = pgWalUsage; - INSTR_TIME_SET_CURRENT(start); + /* Track time and buffer/WAL usage as the planner can access them. */ + InstrInitOptions(&instr, INSTRUMENT_ALL); + InstrStart(&instr); nesting_level++; PG_TRY(); @@ -938,30 +927,20 @@ pgss_planner(Query *parse, } PG_FINALLY(); { + InstrStopFinalize(&instr); nesting_level--; } PG_END_TRY(); - INSTR_TIME_SET_CURRENT(duration); - INSTR_TIME_SUBTRACT(duration, start); - - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(query_string, parse->queryId, parse->stmt_location, parse->stmt_len, PGSS_PLAN, - INSTR_TIME_GET_MILLISEC(duration), + INSTR_TIME_GET_MILLISEC(instr.total), 0, - &bufusage, - &walusage, + &instr.bufusage, + &instr.walusage, NULL, NULL, 0, @@ -1013,19 +992,9 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags) */ if (pgss_enabled(nesting_level) && queryDesc->plannedstmt->queryId != INT64CONST(0)) { - /* - * Set up to track total elapsed time in ExecutorRun. Make sure the - * space is allocated in the per-query context so it will go away at - * ExecutorEnd. - */ + /* Set up to track total elapsed time in ExecutorRun. */ if (queryDesc->totaltime == NULL) - { - MemoryContext oldcxt; - - oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false); - MemoryContextSwitchTo(oldcxt); - } + queryDesc->totaltime = InstrQueryAlloc(INSTRUMENT_ALL); } } @@ -1082,21 +1051,15 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) if (queryId != INT64CONST(0) && queryDesc->totaltime && pgss_enabled(nesting_level)) { - /* - * Make sure stats accumulation is done. (Note: it's okay if several - * levels of hook all do this.) - */ - InstrEndLoop(queryDesc->totaltime); - pgss_store(queryDesc->sourceText, queryId, queryDesc->plannedstmt->stmt_location, queryDesc->plannedstmt->stmt_len, PGSS_EXEC, - INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total), + INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->instr.total), queryDesc->estate->es_total_processed, - &queryDesc->totaltime->bufusage, - &queryDesc->totaltime->walusage, + &queryDesc->totaltime->instr.bufusage, + &queryDesc->totaltime->instr.walusage, queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL, NULL, queryDesc->estate->es_parallel_workers_to_launch, @@ -1160,17 +1123,11 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, !IsA(parsetree, ExecuteStmt) && !IsA(parsetree, PrepareStmt)) { - instr_time start; - instr_time duration; uint64 rows; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + Instrumentation instr = {0}; - bufusage_start = pgBufferUsage; - walusage_start = pgWalUsage; - INSTR_TIME_SET_CURRENT(start); + InstrInitOptions(&instr, INSTRUMENT_ALL); + InstrStart(&instr); nesting_level++; PG_TRY(); @@ -1186,6 +1143,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, } PG_FINALLY(); { + InstrStopFinalize(&instr); nesting_level--; } PG_END_TRY(); @@ -1200,9 +1158,6 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, * former value, which'd otherwise be a good idea. */ - INSTR_TIME_SET_CURRENT(duration); - INSTR_TIME_SUBTRACT(duration, start); - /* * Track the total number of rows retrieved or affected by the utility * statements of COPY, FETCH, CREATE TABLE AS, CREATE MATERIALIZED @@ -1214,23 +1169,15 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ? qc->nprocessed : 0; - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(queryString, saved_queryId, saved_stmt_location, saved_stmt_len, PGSS_EXEC, - INSTR_TIME_GET_MILLISEC(duration), + INSTR_TIME_GET_MILLISEC(instr.total), rows, - &bufusage, - &walusage, + &instr.bufusage, + &instr.walusage, NULL, NULL, 0, diff --git a/contrib/pg_stat_statements/sql/utility.sql b/contrib/pg_stat_statements/sql/utility.sql index dd97203c21025..7540e49c73caf 100644 --- a/contrib/pg_stat_statements/sql/utility.sql +++ b/contrib/pg_stat_statements/sql/utility.sql @@ -152,6 +152,62 @@ EXPLAIN (costs off) SELECT a FROM generate_series(1,10) AS tab(a) WHERE a = 7; SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; +-- Buffer stats should flow through EXPLAIN ANALYZE +CREATE TEMP TABLE flow_through_test (a int, b char(200)); +INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; + +CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$ +DECLARE +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test'; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + +SELECT run_explain_buffers_test(); + +-- EXPLAIN entries should have non-zero buffer stats +SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats +FROM pg_stat_statements +WHERE query LIKE 'SELECT run_explain_buffers_test%' +ORDER BY query COLLATE "C"; + +DROP FUNCTION run_explain_buffers_test; +DROP TABLE flow_through_test; + +-- Validate buffer/WAL counting during abort +SET pg_stat_statements.track = 'all'; +CREATE TEMP TABLE pgss_call_tab (a int, b char(20)); +CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20)); +INSERT INTO pgss_call_tab VALUES (0, 'zzz'); + +CREATE PROCEDURE pgss_call_rollback_proc() AS $$ +DECLARE + v int; +BEGIN + EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN +END; +$$ LANGUAGE plpgsql; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +CALL pgss_call_rollback_proc(); + +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_call_rollback_proc%' +ORDER BY query COLLATE "C"; + +DROP TABLE pgss_call_tab2; +DROP TABLE pgss_call_tab; +DROP PROCEDURE pgss_call_rollback_proc; +SET pg_stat_statements.track = 'top'; + -- CALL CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ DECLARE diff --git a/contrib/pg_stat_statements/sql/wal.sql b/contrib/pg_stat_statements/sql/wal.sql index 1dc1552a81ebc..467e321b2062e 100644 --- a/contrib/pg_stat_statements/sql/wal.sql +++ b/contrib/pg_stat_statements/sql/wal.sql @@ -18,3 +18,36 @@ wal_records > 0 as wal_records_generated, wal_records >= rows as wal_records_ge_rows FROM pg_stat_statements ORDER BY query COLLATE "C"; SELECT pg_stat_statements_reset() IS NOT NULL AS t; + +-- +-- Validate buffer/WAL counting with caught exception in PL/pgSQL +-- +CREATE TEMP TABLE pgss_error_tab (a int, b char(20)); +INSERT INTO pgss_error_tab VALUES (0, 'zzz'); + +CREATE FUNCTION pgss_error_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +SELECT pgss_error_func(); + +-- Buffer/WAL usage from the wCTE INSERT should survive the exception +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_error_func%' +ORDER BY query COLLATE "C"; + +DROP TABLE pgss_error_tab; +DROP FUNCTION pgss_error_func; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c index 60d90329a651d..6f0cb2a285b38 100644 --- a/contrib/postgres_fdw/postgres_fdw.c +++ b/contrib/postgres_fdw/postgres_fdw.c @@ -2778,7 +2778,7 @@ postgresIterateDirectModify(ForeignScanState *node) if (!resultRelInfo->ri_projectReturning) { TupleTableSlot *slot = node->ss.ss_ScanTupleSlot; - Instrumentation *instr = node->ss.ps.instrument; + NodeInstrumentation *instr = node->ss.ps.instrument; Assert(!dmstate->has_returning); diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 5f6f1db0467e9..9219625faf6e3 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -734,6 +734,7 @@ WHERE t1.unique1 < 10 AND t1.unique2 = t2.unique2; -> Index Scan using tenk2_unique2 on tenk2 t2 (cost=0.29..7.90 rows=1 width=244) (actual time=0.003..0.003 rows=1.00 loops=10) Index Cond: (unique2 = t1.unique2) Index Searches: 10 + Table Buffers: shared hit=10 Buffers: shared hit=24 read=6 Planning: Buffers: shared hit=15 dirtied=9 @@ -949,7 +950,8 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; Index Cond: (f1 @> '((0.5,2))'::polygon) Rows Removed by Index Recheck: 1 Index Searches: 1 - Buffers: shared hit=1 + Table Buffers: shared hit=1 + Buffers: shared hit=2 Planning Time: 0.039 ms Execution Time: 0.098 ms @@ -958,7 +960,9 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; then rejected by a recheck of the index condition. This happens because a GiST index is lossy for polygon containment tests: it actually returns the rows with polygons that overlap the target, and then we have - to do the exact containment test on those rows. + to do the exact containment test on those rows. The Table Buffers + counts indicate how many operations were performed on the table instead of + the index. This number is included in the Buffers counts. @@ -1147,13 +1151,14 @@ EXPLAIN ANALYZE SELECT * FROM tenk1 WHERE unique1 < 100 AND unique2 > 9000 QUERY PLAN -------------------------------------------------------------------&zwsp;------------------------------------------------------------ Limit (cost=0.29..14.33 rows=2 width=244) (actual time=0.051..0.071 rows=2.00 loops=1) - Buffers: shared hit=16 + Buffers: shared hit=14 -> Index Scan using tenk1_unique2 on tenk1 (cost=0.29..70.50 rows=10 width=244) (actual time=0.051..0.070 rows=2.00 loops=1) Index Cond: (unique2 > 9000) Filter: (unique1 < 100) Rows Removed by Filter: 287 Index Searches: 1 - Buffers: shared hit=16 + Table Buffers: shared hit=11 + Buffers: shared hit=14 Planning Time: 0.077 ms Execution Time: 0.086 ms diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml index 7dee77fd366b0..912c96f2ff51a 100644 --- a/doc/src/sgml/ref/explain.sgml +++ b/doc/src/sgml/ref/explain.sgml @@ -506,6 +506,7 @@ EXPLAIN ANALYZE EXECUTE query(100, 200); -> Index Scan using test_pkey on test (cost=0.29..10.27 rows=99 width=8) (actual time=0.009..0.025 rows=99.00 loops=1) Index Cond: ((id > 100) AND (id < 200)) Index Searches: 1 + Table Buffers: shared hit=1 Buffers: shared hit=4 Planning Time: 0.244 ms Execution Time: 0.073 ms diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 146ee97a47dc4..5ab571b29fa92 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -2435,8 +2435,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * and PARALLEL_KEY_BUFFER_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgWalUsage or - * pgBufferUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(WalUsage), pcxt->nworkers)); @@ -2887,6 +2887,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + QueryInstrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2936,7 +2937,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2951,7 +2952,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index c7e38dbe19342..bb91cc600eb3b 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -984,8 +984,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * and PARALLEL_KEY_BUFFER_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgWalUsage or - * pgBufferUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(WalUsage), pcxt->nworkers)); @@ -2110,6 +2110,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + QueryInstrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2178,7 +2179,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2193,7 +2194,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 82c5b28e0ad55..b4cbd0e682c74 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -641,8 +641,7 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, TimestampTz starttime = 0; PgStat_Counter startreadtime = 0, startwritetime = 0; - WalUsage startwalusage = pgWalUsage; - BufferUsage startbufferusage = pgBufferUsage; + QueryInstrumentation *instr = NULL; ErrorContextCallback errcallback; char **indnames = NULL; Size dead_items_max_bytes = 0; @@ -658,6 +657,8 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, startreadtime = pgStatBlockReadTime; startwritetime = pgStatBlockWriteTime; } + instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrQueryStart(instr); } /* Used for instrumentation and stats report */ @@ -983,14 +984,14 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, { TimestampTz endtime = GetCurrentTimestamp(); + instr = InstrQueryStopFinalize(instr); + if (verbose || params.log_vacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, params.log_vacuum_min_duration)) { long secs_dur; int usecs_dur; - WalUsage walusage; - BufferUsage bufferusage; StringInfoData buf; char *msgfmt; int32 diff; @@ -999,12 +1000,10 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; + BufferUsage bufferusage = instr->instr.bufusage; + WalUsage walusage = instr->instr.walusage; TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur); - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); - memset(&bufferusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); total_blks_hit = bufferusage.shared_blks_hit + bufferusage.local_blks_hit; diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 69ef1527e0609..dfe4fd9459c96 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1465,8 +1465,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * and PARALLEL_KEY_BUFFER_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgWalUsage or - * pgBufferUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(WalUsage), pcxt->nworkers)); @@ -1752,6 +1752,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + QueryInstrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -1827,7 +1828,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) } /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Perform sorting of spool, and possibly a spool2 */ sortmem = maintenance_work_mem / btshared->scantuplesortstates; @@ -1837,7 +1838,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); #ifdef BTREE_BUILD_STATS diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index b9b678f372290..f6ac3c530b909 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -1081,10 +1081,10 @@ XLogInsertRecord(XLogRecData *rdata, /* Report WAL traffic to the instrumentation. */ if (inserted) { - pgWalUsage.wal_bytes += rechdr->xl_tot_len; - pgWalUsage.wal_records++; - pgWalUsage.wal_fpi += num_fpi; - pgWalUsage.wal_fpi_bytes += fpi_bytes; + INSTR_WALUSAGE_ADD(wal_bytes, rechdr->xl_tot_len); + INSTR_WALUSAGE_INCR(wal_records); + INSTR_WALUSAGE_ADD(wal_fpi, num_fpi); + INSTR_WALUSAGE_ADD(wal_fpi_bytes, fpi_bytes); /* Required for the flush of pending stats WAL data */ pgstat_report_fixed = true; @@ -2063,7 +2063,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) WriteRqst.Flush = InvalidXLogRecPtr; XLogWrite(WriteRqst, tli, false); LWLockRelease(WALWriteLock); - pgWalUsage.wal_buffers_full++; + INSTR_WALUSAGE_INCR(wal_buffers_full); TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); /* diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 53adac9139bd4..38f8b379fa413 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -308,9 +308,7 @@ do_analyze_rel(Relation onerel, const VacuumParams params, Oid save_userid; int save_sec_context; int save_nestlevel; - WalUsage startwalusage = pgWalUsage; - BufferUsage startbufferusage = pgBufferUsage; - BufferUsage bufferusage; + QueryInstrumentation *instr = NULL; PgStat_Counter startreadtime = 0; PgStat_Counter startwritetime = 0; @@ -361,6 +359,9 @@ do_analyze_rel(Relation onerel, const VacuumParams params, } pg_rusage_init(&ru0); + + instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrQueryStart(instr); } /* Used for instrumentation and stats report */ @@ -741,12 +742,13 @@ do_analyze_rel(Relation onerel, const VacuumParams params, { TimestampTz endtime = GetCurrentTimestamp(); + instr = InstrQueryStopFinalize(instr); + if (verbose || params.log_analyze_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, params.log_analyze_min_duration)) { long delay_in_ms; - WalUsage walusage; double read_rate = 0; double write_rate = 0; char *msgfmt; @@ -754,18 +756,15 @@ do_analyze_rel(Relation onerel, const VacuumParams params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; - - memset(&bufferusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); - - total_blks_hit = bufferusage.shared_blks_hit + - bufferusage.local_blks_hit; - total_blks_read = bufferusage.shared_blks_read + - bufferusage.local_blks_read; - total_blks_dirtied = bufferusage.shared_blks_dirtied + - bufferusage.local_blks_dirtied; + BufferUsage bufusage = instr->instr.bufusage; + WalUsage walusage = instr->instr.walusage; + + total_blks_hit = bufusage.shared_blks_hit + + bufusage.local_blks_hit; + total_blks_read = bufusage.shared_blks_read + + bufusage.local_blks_read; + total_blks_dirtied = bufusage.shared_blks_dirtied + + bufusage.local_blks_dirtied; /* * We do not expect an analyze to take > 25 days and it simplifies diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 93918a223b8d7..437246d1aa4e7 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -143,7 +143,7 @@ static void show_instrumentation_count(const char *qlabel, int which, static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage); -static void show_buffer_usage(ExplainState *es, const BufferUsage *usage); +static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void show_memory_counters(ExplainState *es, const MemoryContextCounters *mem_counters); @@ -322,14 +322,16 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, QueryEnvironment *queryEnv) { PlannedStmt *plan; - instr_time planstart, - planduration; - BufferUsage bufusage_start, - bufusage; + QueryInstrumentation *instr = NULL; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; + if (es->buffers) + instr = InstrQueryAlloc(INSTRUMENT_TIMER | INSTRUMENT_BUFFERS); + else + instr = InstrQueryAlloc(INSTRUMENT_TIMER); + if (es->memory) { /* @@ -346,15 +348,12 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - if (es->buffers) - bufusage_start = pgBufferUsage; - INSTR_TIME_SET_CURRENT(planstart); + InstrQueryStart(instr); /* plan the query */ plan = pg_plan_query(query, queryString, cursorOptions, params, es); - INSTR_TIME_SET_CURRENT(planduration); - INSTR_TIME_SUBTRACT(planduration, planstart); + instr = InstrQueryStopFinalize(instr); if (es->memory) { @@ -362,16 +361,9 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, MemoryContextMemConsumed(planner_ctx, &mem_counters); } - /* calc differences of buffer counters. */ - if (es->buffers) - { - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - } - /* run it (if needed) and produce output */ ExplainOnePlan(plan, into, es, queryString, params, queryEnv, - &planduration, (es->buffers ? &bufusage : NULL), + &instr->instr.total, (es->buffers ? &instr->instr.bufusage : NULL), es->memory ? &mem_counters : NULL); } @@ -611,7 +603,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, } if (bufusage) - show_buffer_usage(es, bufusage); + show_buffer_usage(es, bufusage, NULL); if (mem_counters) show_memory_counters(es, mem_counters); @@ -1028,7 +1020,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage)) { es->indent++; - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); es->indent--; } } @@ -1042,7 +1034,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) BYTES_TO_KILOBYTES(metrics->bytesSent), es); ExplainPropertyText("Format", format, es); if (es->buffers) - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); } ExplainCloseGroup("Serialization", "Serialization", true, es); @@ -1099,18 +1091,15 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) for (nt = 0; nt < rInfo->ri_TrigDesc->numtriggers; nt++) { Trigger *trig = rInfo->ri_TrigDesc->triggers + nt; - Instrumentation *instr = rInfo->ri_TrigInstrument + nt; + TriggerInstrumentation *tginstr = rInfo->ri_TrigInstrument + nt; char *relname; char *conname = NULL; - /* Must clean up instrumentation state */ - InstrEndLoop(instr); - /* * We ignore triggers that were never invoked; they likely aren't * relevant to the current query type. */ - if (instr->ntuples == 0) + if (tginstr->firings == 0) continue; ExplainOpenGroup("Trigger", NULL, true, es); @@ -1135,11 +1124,11 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) if (show_relname) appendStringInfo(es->str, " on %s", relname); if (es->timing) - appendStringInfo(es->str, ": time=%.3f calls=%.0f\n", - INSTR_TIME_GET_MILLISEC(instr->total), - instr->ntuples); + appendStringInfo(es->str, ": time=%.3f calls=%d\n", + INSTR_TIME_GET_MILLISEC(tginstr->instr.total), + tginstr->firings); else - appendStringInfo(es->str, ": calls=%.0f\n", instr->ntuples); + appendStringInfo(es->str, ": calls=%d\n", tginstr->firings); } else { @@ -1149,9 +1138,9 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) ExplainPropertyText("Relation", relname, es); if (es->timing) ExplainPropertyFloat("Time", "ms", - INSTR_TIME_GET_MILLISEC(instr->total), 3, + INSTR_TIME_GET_MILLISEC(tginstr->instr.total), 3, es); - ExplainPropertyFloat("Calls", NULL, instr->ntuples, 0, es); + ExplainPropertyInteger("Calls", NULL, tginstr->firings, es); } if (conname) @@ -1838,7 +1827,7 @@ ExplainNode(PlanState *planstate, List *ancestors, { double nloops = planstate->instrument->nloops; double startup_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->startup) / nloops; - double total_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->total) / nloops; + double total_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->instr.total) / nloops; double rows = planstate->instrument->ntuples / nloops; if (es->format == EXPLAIN_FORMAT_TEXT) @@ -1891,11 +1880,11 @@ ExplainNode(PlanState *planstate, List *ancestors, /* prepare per-worker general execution details */ if (es->workers_state && es->verbose) { - WorkerInstrumentation *w = planstate->worker_instrument; + WorkerNodeInstrumentation *w = planstate->worker_instrument; for (int n = 0; n < w->num_workers; n++) { - Instrumentation *instrument = &w->instrument[n]; + NodeInstrumentation *instrument = &w->instrument[n]; double nloops = instrument->nloops; double startup_ms; double total_ms; @@ -1904,7 +1893,7 @@ ExplainNode(PlanState *planstate, List *ancestors, if (nloops <= 0) continue; startup_ms = INSTR_TIME_GET_MILLISEC(instrument->startup) / nloops; - total_ms = INSTR_TIME_GET_MILLISEC(instrument->total) / nloops; + total_ms = INSTR_TIME_GET_MILLISEC(instrument->instr.total) / nloops; rows = instrument->ntuples / nloops; ExplainOpenWorker(n, es); @@ -1973,6 +1962,9 @@ ExplainNode(PlanState *planstate, List *ancestors, show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); show_indexsearches_info(planstate, es); + + if (es->buffers && planstate->instrument) + show_buffer_usage(es, &((IndexScanState *) planstate)->iss_InstrumentTable->instr.bufusage, "Table"); break; case T_IndexOnlyScan: show_scan_qual(((IndexOnlyScan *) plan)->indexqual, @@ -2291,18 +2283,18 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->bufusage); + show_buffer_usage(es, &planstate->instrument->instr.bufusage, NULL); if (es->wal && planstate->instrument) - show_wal_usage(es, &planstate->instrument->walusage); + show_wal_usage(es, &planstate->instrument->instr.walusage); /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) { - WorkerInstrumentation *w = planstate->worker_instrument; + WorkerNodeInstrumentation *w = planstate->worker_instrument; for (int n = 0; n < w->num_workers; n++) { - Instrumentation *instrument = &w->instrument[n]; + NodeInstrumentation *instrument = &w->instrument[n]; double nloops = instrument->nloops; if (nloops <= 0) @@ -2310,9 +2302,9 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->bufusage); + show_buffer_usage(es, &instrument->instr.bufusage, NULL); if (es->wal) - show_wal_usage(es, &instrument->walusage); + show_wal_usage(es, &instrument->instr.walusage); ExplainCloseWorker(n, es); } } @@ -4110,7 +4102,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage) * Show buffer usage details. This better be sync with peek_buffer_usage. */ static void -show_buffer_usage(ExplainState *es, const BufferUsage *usage) +show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title) { if (es->format == EXPLAIN_FORMAT_TEXT) { @@ -4135,6 +4127,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared || has_local || has_temp) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "Buffers:"); if (has_shared) @@ -4190,6 +4184,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared_timing || has_local_timing || has_temp_timing) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "I/O Timings:"); if (has_shared_timing) @@ -4231,44 +4227,46 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) } else { - ExplainPropertyInteger("Shared Hit Blocks", NULL, + char *prefix = title ? psprintf("%s ", title) : pstrdup(""); + + ExplainPropertyInteger(psprintf("%sShared Hit Blocks", prefix), NULL, usage->shared_blks_hit, es); - ExplainPropertyInteger("Shared Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Read Blocks", prefix), NULL, usage->shared_blks_read, es); - ExplainPropertyInteger("Shared Dirtied Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Dirtied Blocks", prefix), NULL, usage->shared_blks_dirtied, es); - ExplainPropertyInteger("Shared Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Written Blocks", prefix), NULL, usage->shared_blks_written, es); - ExplainPropertyInteger("Local Hit Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Hit Blocks", prefix), NULL, usage->local_blks_hit, es); - ExplainPropertyInteger("Local Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Read Blocks", prefix), NULL, usage->local_blks_read, es); - ExplainPropertyInteger("Local Dirtied Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Dirtied Blocks", prefix), NULL, usage->local_blks_dirtied, es); - ExplainPropertyInteger("Local Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Written Blocks", prefix), NULL, usage->local_blks_written, es); - ExplainPropertyInteger("Temp Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sTemp Read Blocks", prefix), NULL, usage->temp_blks_read, es); - ExplainPropertyInteger("Temp Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sTemp Written Blocks", prefix), NULL, usage->temp_blks_written, es); if (track_io_timing) { - ExplainPropertyFloat("Shared I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sShared I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time), 3, es); - ExplainPropertyFloat("Shared I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sShared I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time), 3, es); - ExplainPropertyFloat("Local I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sLocal I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time), 3, es); - ExplainPropertyFloat("Local I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sLocal I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time), 3, es); - ExplainPropertyFloat("Temp I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sTemp I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time), 3, es); - ExplainPropertyFloat("Temp I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sTemp I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time), 3, es); } diff --git a/src/backend/commands/explain_dr.c b/src/backend/commands/explain_dr.c index 3c96061cf32ab..6868d8972acbb 100644 --- a/src/backend/commands/explain_dr.c +++ b/src/backend/commands/explain_dr.c @@ -110,15 +110,20 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) MemoryContext oldcontext; StringInfo buf = &myState->buf; int natts = typeinfo->natts; - instr_time start, - end; - BufferUsage instr_start; + QueryInstrumentation *instr = NULL; /* only measure time, buffers if requested */ - if (myState->es->timing) - INSTR_TIME_SET_CURRENT(start); - if (myState->es->buffers) - instr_start = pgBufferUsage; + if (myState->es->timing || myState->es->buffers) + { + InstrumentOption instrument_options = 0; + + if (myState->es->timing) + instrument_options |= INSTRUMENT_TIMER; + if (myState->es->buffers) + instrument_options |= INSTRUMENT_BUFFERS; + instr = InstrQueryAlloc(instrument_options); + InstrQueryStart(instr); + } /* Set or update my derived attribute info, if needed */ if (myState->attrinfo != typeinfo || myState->nattrs != natts) @@ -186,18 +191,16 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) MemoryContextSwitchTo(oldcontext); MemoryContextReset(myState->tmpcontext); + if (myState->es->timing || myState->es->buffers) + instr = InstrQueryStopFinalize(instr); + /* Update timing data */ if (myState->es->timing) - { - INSTR_TIME_SET_CURRENT(end); - INSTR_TIME_ACCUM_DIFF(myState->metrics.timeSpent, end, start); - } + INSTR_TIME_ADD(myState->metrics.timeSpent, instr->instr.total); /* Update buffer metrics */ if (myState->es->buffers) - BufferUsageAccumDiff(&myState->metrics.bufferUsage, - &pgBufferUsage, - &instr_start); + BufferUsageAdd(&myState->metrics.bufferUsage, &instr->instr.bufusage); return true; } diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 5b86a727587b2..d81f6b30e9cd3 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -578,13 +578,16 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, ListCell *p; ParamListInfo paramLI = NULL; EState *estate = NULL; - instr_time planstart; - instr_time planduration; - BufferUsage bufusage_start, - bufusage; + QueryInstrumentation *instr = NULL; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; + int instrument_options = INSTRUMENT_TIMER; + + if (es->buffers) + instrument_options |= INSTRUMENT_BUFFERS; + + instr = InstrQueryAlloc(instrument_options); if (es->memory) { @@ -596,9 +599,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - if (es->buffers) - bufusage_start = pgBufferUsage; - INSTR_TIME_SET_CURRENT(planstart); + InstrQueryStart(instr); /* Look it up in the hash table */ entry = FetchPreparedStatement(execstmt->name, true); @@ -633,8 +634,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, cplan = GetCachedPlan(entry->plansource, paramLI, CurrentResourceOwner, pstate->p_queryEnv); - INSTR_TIME_SET_CURRENT(planduration); - INSTR_TIME_SUBTRACT(planduration, planstart); + instr = InstrQueryStopFinalize(instr); if (es->memory) { @@ -642,13 +642,6 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, MemoryContextMemConsumed(planner_ctx, &mem_counters); } - /* calc differences of buffer counters. */ - if (es->buffers) - { - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - } - plan_list = cplan->stmt_list; /* Explain each query */ @@ -658,7 +651,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, if (pstmt->commandType != CMD_UTILITY) ExplainOnePlan(pstmt, into, es, query_string, paramLI, pstate->p_queryEnv, - &planduration, (es->buffers ? &bufusage : NULL), + &instr->instr.total, (es->buffers ? &instr->instr.bufusage : NULL), es->memory ? &mem_counters : NULL); else ExplainOneUtility(pstmt->utilityStmt, into, es, pstate, paramLI); diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 98d402c0a3be7..c336007314121 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -90,7 +90,7 @@ static bool TriggerEnabled(EState *estate, ResultRelInfo *relinfo, static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata, int tgindx, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation *instr, MemoryContext per_tuple_context); static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, @@ -2309,7 +2309,7 @@ static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata, int tgindx, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation *instr, MemoryContext per_tuple_context) { LOCAL_FCINFO(fcinfo, 0); @@ -2344,7 +2344,7 @@ ExecCallTriggerFunc(TriggerData *trigdata, * If doing EXPLAIN ANALYZE, start charging time to this trigger. */ if (instr) - InstrStartNode(instr + tgindx); + InstrStartTrigger(instr + tgindx); /* * Do the function evaluation in the per-tuple memory context, so that @@ -2389,10 +2389,10 @@ ExecCallTriggerFunc(TriggerData *trigdata, /* * If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count - * one "tuple returned" (really the number of firings). + * the firing of the trigger. */ if (instr) - InstrStopNode(instr + tgindx, 1); + InstrStopTrigger(instr + tgindx, 1); return (HeapTuple) DatumGetPointer(result); } @@ -3936,7 +3936,7 @@ static void AfterTriggerExecute(EState *estate, ResultRelInfo *dst_relInfo, TriggerDesc *trigdesc, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation *instr, MemoryContext per_tuple_context, TupleTableSlot *trig_tuple_slot1, TupleTableSlot *trig_tuple_slot2); @@ -4330,7 +4330,7 @@ AfterTriggerExecute(EState *estate, ResultRelInfo *src_relInfo, ResultRelInfo *dst_relInfo, TriggerDesc *trigdesc, - FmgrInfo *finfo, Instrumentation *instr, + FmgrInfo *finfo, TriggerInstrumentation *instr, MemoryContext per_tuple_context, TupleTableSlot *trig_tuple_slot1, TupleTableSlot *trig_tuple_slot2) @@ -4371,7 +4371,7 @@ AfterTriggerExecute(EState *estate, * to include time spent re-fetching tuples in the trigger cost. */ if (instr) - InstrStartNode(instr + tgindx); + InstrStartTrigger(instr + tgindx); /* * Fetch the required tuple(s). @@ -4588,10 +4588,10 @@ AfterTriggerExecute(EState *estate, /* * If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count - * one "tuple returned" (really the number of firings). + * the firing of the trigger. */ if (instr) - InstrStopNode(instr + tgindx, 1); + InstrStopTrigger(instr + tgindx, 1); } @@ -4707,7 +4707,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events, Relation rel = NULL; TriggerDesc *trigdesc = NULL; FmgrInfo *finfo = NULL; - Instrumentation *instr = NULL; + TriggerInstrumentation *instr = NULL; TupleTableSlot *slot1 = NULL, *slot2 = NULL; diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index 279108ca89f0b..75074fe4efa60 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -308,8 +308,8 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgBufferUsage or - * pgWalUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(BufferUsage), pcxt->nworkers)); @@ -995,6 +995,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) PVIndStats *indstats; PVShared *shared; TidStore *dead_items; + QueryInstrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; int nindexes; @@ -1084,7 +1085,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) error_context_stack = &errcallback; /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Process indexes to perform vacuum/cleanup */ parallel_vacuum_process_safe_indexes(&pvs); @@ -1092,7 +1093,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution */ buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report any remaining cost-based vacuum delay time */ diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index bfd3ebc601ec6..82253317e96f5 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -76,6 +76,7 @@ ExecutorCheckPerms_hook_type ExecutorCheckPerms_hook = NULL; /* decls for local routines only used within this module */ static void InitPlan(QueryDesc *queryDesc, int eflags); static void CheckValidRowMarkRel(Relation rel, RowMarkType markType); +static void ExecFinalizeTriggerInstrumentation(EState *estate); static void ExecPostprocessPlan(EState *estate); static void ExecEndPlan(PlanState *planstate, EState *estate); static void ExecutePlan(QueryDesc *queryDesc, @@ -329,9 +330,28 @@ standard_ExecutorRun(QueryDesc *queryDesc, */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - /* Allow instrumentation of Executor overall runtime */ + /* + * Start up required top-level instrumentation stack for WAL/buffer + * tracking + */ + if (!queryDesc->totaltime && (estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL))) + queryDesc->totaltime = InstrQueryAlloc(estate->es_instrument); + if (queryDesc->totaltime) - InstrStartNode(queryDesc->totaltime); + { + /* Allow instrumentation of Executor overall runtime */ + InstrQueryStart(queryDesc->totaltime); + + /* + * Remember all node entries for abort recovery. We do this once here + * after the first call to InstrQueryStart has pushed the parent + * entry. + */ + if ((estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL)) && + !queryDesc->already_executed) + ExecRememberNodeInstrumentation(queryDesc->planstate, + queryDesc->totaltime); + } /* * extract information from the query descriptor and the query feature. @@ -383,7 +403,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, dest->rShutdown(dest); if (queryDesc->totaltime) - InstrStopNode(queryDesc->totaltime, estate->es_processed); + InstrQueryStop(queryDesc->totaltime); MemoryContextSwitchTo(oldcontext); } @@ -433,7 +453,7 @@ standard_ExecutorFinish(QueryDesc *queryDesc) /* Allow instrumentation of Executor overall runtime */ if (queryDesc->totaltime) - InstrStartNode(queryDesc->totaltime); + InstrQueryStart(queryDesc->totaltime); /* Run ModifyTable nodes to completion */ ExecPostprocessPlan(estate); @@ -442,8 +462,26 @@ standard_ExecutorFinish(QueryDesc *queryDesc) if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS)) AfterTriggerEndQuery(estate); + /* + * Accumulate per-node and trigger statistics to their respective parent + * instrumentation stacks. + * + * We skip this in parallel workers because their per-node stats are + * reported individually via ExecParallelReportInstrumentation, and the + * leader's own ExecFinalizeNodeInstrumentation handles propagation. If + * we accumulated here, the leader would double-count: worker parent nodes + * would already include their children's stats, and then the leader's + * accumulation would add the children again. + */ + if (queryDesc->totaltime && estate->es_instrument && !IsParallelWorker()) + { + ExecFinalizeNodeInstrumentation(queryDesc->planstate); + + ExecFinalizeTriggerInstrumentation(estate); + } + if (queryDesc->totaltime) - InstrStopNode(queryDesc->totaltime, 0); + queryDesc->totaltime = InstrQueryStopFinalize(queryDesc->totaltime); MemoryContextSwitchTo(oldcontext); @@ -1270,7 +1308,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigWhenExprs = (ExprState **) palloc0_array(ExprState *, n); if (instrument_options) - resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options, false); + resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(n, instrument_options); } else { @@ -1484,6 +1522,24 @@ ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo) return resultRelInfo->ri_ancestorResultRels; } +static void +ExecFinalizeTriggerInstrumentation(EState *estate) +{ + List *rels = NIL; + + rels = list_concat(rels, estate->es_tuple_routing_result_relations); + rels = list_concat(rels, estate->es_opened_result_relations); + rels = list_concat(rels, estate->es_trig_target_relations); + + foreach_node(ResultRelInfo, rInfo, rels) + { + TriggerInstrumentation *ti = rInfo->ri_TrigInstrument; + + if (ti && (ti->instr.need_bufusage || ti->instr.need_walusage)) + InstrAccum(instr_stack.current, &ti->instr); + } +} + /* ---------------------------------------------------------------- * ExecPostprocessPlan * diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index ac84af294c9f6..e6ad86cb88779 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -87,7 +87,7 @@ typedef struct FixedParallelExecutorState * instrument_options: Same meaning here as in instrument.c. * * instrument_offset: Offset, relative to the start of this structure, - * of the first Instrumentation object. This will depend on the length of + * of the first NodeInstrumentation object. This will depend on the length of * the plan_node_id array. * * num_workers: Number of workers. @@ -104,11 +104,15 @@ struct SharedExecutorInstrumentation int num_workers; int num_plan_nodes; int plan_node_id[FLEXIBLE_ARRAY_MEMBER]; - /* array of num_plan_nodes * num_workers Instrumentation objects follows */ + + /* + * array of num_plan_nodes * num_workers NodeInstrumentation objects + * follows + */ }; #define GetInstrumentationArray(sei) \ (StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \ - (Instrumentation *) (((char *) sei) + sei->instrument_offset)) + (NodeInstrumentation *) (((char *) sei) + sei->instrument_offset)) /* Context object for ExecParallelEstimate. */ typedef struct ExecParallelEstimateContext @@ -690,7 +694,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, * * If EXPLAIN is not in use and there are no extensions loaded that care, * we could skip this. But we have no way of knowing whether anyone's - * looking at pgBufferUsage, so do it unconditionally. + * looking at instrumentation, so do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(BufferUsage), pcxt->nworkers)); @@ -725,7 +729,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, instrumentation_len = MAXALIGN(instrumentation_len); instrument_offset = instrumentation_len; instrumentation_len += - mul_size(sizeof(Instrumentation), + mul_size(sizeof(NodeInstrumentation), mul_size(e.nnodes, nworkers)); shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len); shm_toc_estimate_keys(&pcxt->estimator, 1); @@ -811,7 +815,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, */ if (estate->es_instrument) { - Instrumentation *instrument; + NodeInstrumentation *instrument; int i; instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len); @@ -821,7 +825,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, instrumentation->num_plan_nodes = e.nnodes; instrument = GetInstrumentationArray(instrumentation); for (i = 0; i < nworkers * e.nnodes; ++i) - InstrInit(&instrument[i], estate->es_instrument); + InstrInitNode(&instrument[i], estate->es_instrument); shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instrumentation); pei->instrumentation = instrumentation; @@ -1053,7 +1057,7 @@ static bool ExecParallelRetrieveInstrumentation(PlanState *planstate, SharedExecutorInstrumentation *instrumentation) { - Instrumentation *instrument; + NodeInstrumentation *instrument; int i; int n; int ibytes; @@ -1081,9 +1085,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, * Switch into per-query memory context. */ oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt); - ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation)); + ibytes = mul_size(instrumentation->num_workers, sizeof(NodeInstrumentation)); planstate->worker_instrument = - palloc(ibytes + offsetof(WorkerInstrumentation, instrument)); + palloc(ibytes + offsetof(WorkerNodeInstrumentation, instrument)); MemoryContextSwitchTo(oldcontext); planstate->worker_instrument->num_workers = instrumentation->num_workers; @@ -1313,7 +1317,7 @@ ExecParallelReportInstrumentation(PlanState *planstate, { int i; int plan_node_id = planstate->plan->plan_node_id; - Instrumentation *instrument; + NodeInstrumentation *instrument; InstrEndLoop(planstate->instrument); @@ -1452,6 +1456,7 @@ void ParallelQueryMain(dsm_segment *seg, shm_toc *toc) { FixedParallelExecutorState *fpes; + QueryInstrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; DestReceiver *receiver; @@ -1512,7 +1517,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) * leader, which also doesn't count buffer accesses and WAL activity that * occur during executor startup. */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Run the plan. If we specified a tuple bound, be careful not to demand @@ -1528,7 +1533,8 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution. */ buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report instrumentation data if any instrumentation options are set. */ diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 7e40b85251718..5e476939edfb5 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -120,8 +120,9 @@ #include "nodes/nodeFuncs.h" static TupleTableSlot *ExecProcNodeFirst(PlanState *node); -static TupleTableSlot *ExecProcNodeInstr(PlanState *node); static bool ExecShutdownNode_walker(PlanState *node, void *context); +static bool ExecRememberNodeInstrumentation_walker(PlanState *node, void *context); +static bool ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context); /* ------------------------------------------------------------------------ @@ -413,8 +414,23 @@ ExecInitNode(Plan *node, EState *estate, int eflags) /* Set up instrumentation for this node if requested */ if (estate->es_instrument) - result->instrument = InstrAlloc(1, estate->es_instrument, - result->async_capable); + { + result->instrument = InstrAllocNode(estate->es_instrument, + result->async_capable); + + /* IndexScan tracks table access separately from index access. */ + if (IsA(result, IndexScanState) && (estate->es_instrument & INSTRUMENT_BUFFERS) != 0) + { + IndexScanState *iss = castNode(IndexScanState, result); + + /* + * We intentionally don't collect timing here (even if enabled), + * since we don't need it, and IndexNext calls InstrPushStack / + * InstrPopStack (instead of InstrNode*) to reduce overhead. + */ + iss->iss_InstrumentTable = InstrAllocNode(INSTRUMENT_BUFFERS, false); + } + } return result; } @@ -462,7 +478,7 @@ ExecProcNodeFirst(PlanState *node) * have ExecProcNode() directly call the relevant function from now on. */ if (node->instrument) - node->ExecProcNode = ExecProcNodeInstr; + node->ExecProcNode = InstrNodeSetupExecProcNode(node->instrument); else node->ExecProcNode = node->ExecProcNodeReal; @@ -470,25 +486,6 @@ ExecProcNodeFirst(PlanState *node) } -/* - * ExecProcNode wrapper that performs instrumentation calls. By keeping - * this a separate function, we avoid overhead in the normal case where - * no instrumentation is wanted. - */ -static TupleTableSlot * -ExecProcNodeInstr(PlanState *node) -{ - TupleTableSlot *result; - - InstrStartNode(node->instrument); - - result = node->ExecProcNodeReal(node); - - InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0); - - return result; -} - /* ---------------------------------------------------------------- * MultiExecProcNode @@ -787,10 +784,10 @@ ExecShutdownNode_walker(PlanState *node, void *context) * at least once already. We don't expect much CPU consumption during * node shutdown, but in the case of Gather or Gather Merge, we may shut * down workers at this stage. If so, their buffer usage will get - * propagated into pgBufferUsage at this point, and we want to make sure - * that it gets associated with the Gather node. We skip this if the node - * has never been executed, so as to avoid incorrectly making it appear - * that it has. + * propagated into the current instrumentation stack entry at this point, + * and we want to make sure that it gets associated with the Gather node. + * We skip this if the node has never been executed, so as to avoid + * incorrectly making it appear that it has. */ if (node->instrument && node->instrument->running) InstrStartNode(node->instrument); @@ -828,6 +825,100 @@ ExecShutdownNode_walker(PlanState *node, void *context) return false; } +/* + * ExecRememberNodeInstrumentation + * + * Register all per-node instrumentation entries as unfinalized children of + * the executor's instrumentation. This is needed for abort recovery: if the + * executor aborts, we need to walk each per-node entry to recover buffer/WAL + * data from nodes that never got finalized, that someone might be interested + * in as an aggregate. + */ +void +ExecRememberNodeInstrumentation(PlanState *node, QueryInstrumentation *parent) +{ + (void) ExecRememberNodeInstrumentation_walker(node, parent); +} + +static bool +ExecRememberNodeInstrumentation_walker(PlanState *node, void *context) +{ + QueryInstrumentation *parent = (QueryInstrumentation *) context; + + Assert(parent != NULL); + + if (node == NULL) + return false; + + if (node->instrument) + { + InstrQueryRememberNode(parent, node->instrument); + + /* IndexScan has a separate entry to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + if (iss->iss_InstrumentTable) + InstrQueryRememberNode(parent, iss->iss_InstrumentTable); + } + } + + return planstate_tree_walker(node, ExecRememberNodeInstrumentation_walker, context); +} + +/* + * ExecFinalizeNodeInstrumentation + * + * Accumulate instrumentation stats from all execution nodes to their respective + * parents (or the original parent instrumentation). + * + * This must run after the cleanup done by ExecShutdownNode, and not rely on any + * resources cleaned up by it. We also expect shutdown actions to have occurred, + * e.g. parallel worker instrumentation to have been added to the leader. + */ +void +ExecFinalizeNodeInstrumentation(PlanState *node) +{ + (void) ExecFinalizeNodeInstrumentation_walker(node, instr_stack.current); +} + +static bool +ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context) +{ + Instrumentation *parent = (Instrumentation *) context; + + Assert(parent != NULL); + + if (node == NULL) + return false; + + /* + * Recurse into children first (bottom-up accumulation), passing our + * instrumentation as the parent context. This ensures children can + * accumulate to us even if they were never executed by the leader (e.g. + * nodes beneath Gather that only workers ran). + */ + planstate_tree_walker(node, ExecFinalizeNodeInstrumentation_walker, + node->instrument ? &node->instrument->instr : parent); + + if (!node->instrument) + return false; + + /* IndexScan has a separate entry to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + if (iss->iss_InstrumentTable) + iss->iss_InstrumentTable = InstrFinalizeNode(iss->iss_InstrumentTable, &node->instrument->instr); + } + + node->instrument = InstrFinalizeNode(node->instrument, parent); + + return false; +} + /* * ExecSetTupleBound * diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index a40610bc2522f..602facec401ce 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -16,124 +16,396 @@ #include #include "executor/instrument.h" +#include "utils/memutils.h" +#include "utils/resowner.h" -BufferUsage pgBufferUsage; -static BufferUsage save_pgBufferUsage; WalUsage pgWalUsage; -static WalUsage save_pgWalUsage; +Instrumentation instr_top; +InstrStackState instr_stack = {0, 0, NULL, &instr_top}; -static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); -static void WalUsageAdd(WalUsage *dst, WalUsage *add); - - -/* Allocate new instrumentation structure(s) */ -Instrumentation * -InstrAlloc(int n, int instrument_options, bool async_mode) +void +InstrStackGrow(void) { - Instrumentation *instr; - - /* initialize all fields to zeroes, then modify as needed */ - instr = palloc0(n * sizeof(Instrumentation)); - if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL)) + if (instr_stack.entries == NULL) { - bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; - bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; - bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; - int i; - - for (i = 0; i < n; i++) - { - instr[i].need_bufusage = need_buffers; - instr[i].need_walusage = need_wal; - instr[i].need_timer = need_timer; - instr[i].async_mode = async_mode; - } + instr_stack.stack_space = 10; /* Allocate sufficient initial space + * for typical activity */ + instr_stack.entries = MemoryContextAlloc(TopMemoryContext, + sizeof(Instrumentation *) * instr_stack.stack_space); + } + else + { + instr_stack.stack_space *= 2; + instr_stack.entries = repalloc_array(instr_stack.entries, Instrumentation *, instr_stack.stack_space); } - - return instr; } -/* Initialize a pre-allocated instrumentation structure. */ +/* General purpose instrumentation handling */ void -InstrInit(Instrumentation *instr, int instrument_options) +InstrInitOptions(Instrumentation *instr, int instrument_options) { - memset(instr, 0, sizeof(Instrumentation)); instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0; instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0; instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; } -/* Entry to a plan node */ +static inline void +InstrStartTimer(Instrumentation *instr) +{ + Assert(INSTR_TIME_IS_ZERO(instr->starttime)); + + INSTR_TIME_SET_CURRENT(instr->starttime); +} + +static inline void +InstrStopTimer(Instrumentation *instr) +{ + instr_time endtime; + + Assert(!INSTR_TIME_IS_ZERO(instr->starttime)); + + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime); + + INSTR_TIME_SET_ZERO(instr->starttime); +} + void -InstrStartNode(Instrumentation *instr) +InstrStart(Instrumentation *instr) { if (instr->need_timer) - { - if (!INSTR_TIME_IS_ZERO(instr->starttime)) - elog(ERROR, "InstrStartNode called twice in a row"); - else - INSTR_TIME_SET_CURRENT(instr->starttime); - } + InstrStartTimer(instr); - /* save buffer usage totals at node entry, if needed */ - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; + if (instr->need_bufusage || instr->need_walusage) + InstrPushStack(instr); +} - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; +void +InstrStop(Instrumentation *instr) +{ + if (instr->need_timer) + InstrStopTimer(instr); + + if (instr->need_bufusage || instr->need_walusage) + InstrPopStack(instr); } -/* Exit from a plan node */ +/* + * Stops instrumentation, finalizes the stack entry and accumulates to its parent. + * + * Note that this intentionally allows passing a stack that is not the current + * top, as can happen with PG_FINALLY, or resource owners, which don't have a + * guaranteed cleanup order. + * + * We are careful here to achieve two goals: + * + * 1) Reset the stack to the parent of whichever of the released stack entries + * has the lowest index + * 2) Accumulate all instrumentation to the currently active instrumentation, + * so that callers get a complete picture of activity, even after an abort + */ void -InstrStopNode(Instrumentation *instr, double nTuples) +InstrStopFinalize(Instrumentation *instr) { - double save_tuplecount = instr->tuplecount; - instr_time endtime; + int idx = -1; - /* count the returned tuples */ - instr->tuplecount += nTuples; + for (int i = instr_stack.stack_size - 1; i >= 0; i--) + { + if (instr_stack.entries[i] == instr) + { + idx = i; + break; + } + } + + if (idx >= 0) + { + while (instr_stack.stack_size > idx + 1) + instr_stack.stack_size--; + + InstrPopStack(instr); + } - /* let's update the time only if the timer was requested */ if (instr->need_timer) + InstrStopTimer(instr); + + InstrAccum(instr_stack.current, instr); +} + + +/* Query instrumentation handling */ + +/* + * Use ResourceOwner mechanism to correctly reset instr_stack on abort. + */ +static void ResOwnerReleaseInstrumentation(Datum res); +static const ResourceOwnerDesc instrumentation_resowner_desc = +{ + .name = "instrumentation", + .release_phase = RESOURCE_RELEASE_AFTER_LOCKS, + .release_priority = RELEASE_PRIO_INSTRUMENTATION, + .ReleaseResource = ResOwnerReleaseInstrumentation, + .DebugPrint = NULL, /* default message is fine */ +}; + +static inline void +ResourceOwnerRememberInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr) +{ + ResourceOwnerRemember(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc); +} + +static inline void +ResourceOwnerForgetInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr) +{ + ResourceOwnerForget(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc); +} + +static void +ResOwnerReleaseInstrumentation(Datum res) +{ + QueryInstrumentation *qinstr = (QueryInstrumentation *) DatumGetPointer(res); + slist_mutable_iter iter; + + /* Accumulate data from all unfinalized child node entries. */ + slist_foreach_modify(iter, &qinstr->unfinalized_children) { - if (INSTR_TIME_IS_ZERO(instr->starttime)) - elog(ERROR, "InstrStopNode called without start"); + NodeInstrumentation *child = slist_container(NodeInstrumentation, unfinalized_node, iter.cur); - INSTR_TIME_SET_CURRENT(endtime); - INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime); + InstrAccum(&qinstr->instr, &child->instr); - INSTR_TIME_SET_ZERO(instr->starttime); + /* + * Free NodeInstrumentation now, since InstrFinalizeNode won't be + * called + */ + pfree(child); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); + /* Ensure the stack is reset as expected, and we accumulate to the parent */ + InstrStopFinalize(&qinstr->instr); - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + /* Free QueryInstrumentation now, since InstrStop won't be called */ + pfree(qinstr); +} - /* Is this the first tuple of this cycle? */ - if (!instr->running) +/* + * Allocate in TopMemoryContext so that the Instrumentation survives + * transaction abort — ResourceOwner release needs to access it. + */ +QueryInstrumentation * +InstrQueryAlloc(int instrument_options) +{ + QueryInstrumentation *instr = MemoryContextAllocZero(TopMemoryContext, sizeof(QueryInstrumentation)); + + InstrInitOptions(&instr->instr, instrument_options); + return instr; +} + +void +InstrQueryStart(QueryInstrumentation *qinstr) +{ + InstrStart(&qinstr->instr); + + if (qinstr->instr.need_bufusage || qinstr->instr.need_walusage) { - instr->running = true; - instr->firsttuple = instr->counter; + Assert(CurrentResourceOwner != NULL); + qinstr->owner = CurrentResourceOwner; + + ResourceOwnerEnlarge(qinstr->owner); + ResourceOwnerRememberInstrumentation(qinstr->owner, qinstr); } - else +} + +void +InstrQueryStop(QueryInstrumentation *qinstr) +{ + InstrStop(&qinstr->instr); + + if (qinstr->instr.need_bufusage || qinstr->instr.need_walusage) { - /* - * In async mode, if the plan node hadn't emitted any tuples before, - * this might be the first tuple - */ - if (instr->async_mode && save_tuplecount < 1.0) - instr->firsttuple = instr->counter; + Assert(qinstr->owner != NULL); + ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr); + qinstr->owner = NULL; + } +} + +QueryInstrumentation * +InstrQueryStopFinalize(QueryInstrumentation *qinstr) +{ + QueryInstrumentation *copy; + + InstrStopFinalize(&qinstr->instr); + + if (qinstr->instr.need_bufusage || qinstr->instr.need_walusage) + { + Assert(qinstr->owner != NULL); + ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr); + qinstr->owner = NULL; } + + /* + * Copy to the current memory context so the caller doesn't need to + * explicitly free the TopMemoryContext allocation. + */ + copy = palloc(sizeof(QueryInstrumentation)); + memcpy(copy, qinstr, sizeof(QueryInstrumentation)); + pfree(qinstr); + return copy; +} + +/* + * Register a child NodeInstrumentation entry for abort processing. + * + * On abort, ResOwnerReleaseInstrumentation will walk the parent's list to + * recover buffer/WAL data from entries that were never finalized, in order for + * aggregate totals to be accurate despite the query erroring out. + */ +void +InstrQueryRememberNode(QueryInstrumentation *parent, NodeInstrumentation *child) +{ + if (child->instr.need_bufusage || child->instr.need_walusage) + slist_push_head(&parent->unfinalized_children, &child->unfinalized_node); +} + +/* start instrumentation during parallel executor startup */ +QueryInstrumentation * +InstrStartParallelQuery(void) +{ + QueryInstrumentation *qinstr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + + InstrQueryStart(qinstr); + return qinstr; +} + +/* report usage after parallel executor shutdown */ +void +InstrEndParallelQuery(QueryInstrumentation *qinstr, BufferUsage *bufusage, WalUsage *walusage) +{ + qinstr = InstrQueryStopFinalize(qinstr); + memcpy(bufusage, &qinstr->instr.bufusage, sizeof(BufferUsage)); + memcpy(walusage, &qinstr->instr.walusage, sizeof(WalUsage)); +} + +/* + * Accumulate work done by parallel workers in the leader's stats. + * + * Note that what gets added here effectively depends on whether per-node + * instrumentation is active. If it's active the parallel worker intentionally + * skips ExecFinalizeNodeInstrumentation on executor shutdown, because it would + * cause double counting. Instead, this only accumulates any extra activity + * outside of nodes. + * + * Otherwise this is responsible for making sure that the complete query + * activity is accumulated. + */ +void +InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +{ + BufferUsageAdd(&instr_stack.current->bufusage, bufusage); + WalUsageAdd(&instr_stack.current->walusage, walusage); + + WalUsageAdd(&pgWalUsage, walusage); +} + +/* Node instrumentation handling */ + +/* Allocate new node instrumentation structure */ +NodeInstrumentation * +InstrAllocNode(int instrument_options, bool async_mode) +{ + /* + * We can utilize TopTransactionContext instead of TopMemoryContext here + * because nodes don't get used for utility commands that restart + * transactions, which would require a context that survives longer + * (EXPLAIN ANALYZE is fine). + */ + NodeInstrumentation *instr = MemoryContextAlloc(TopTransactionContext, sizeof(NodeInstrumentation)); + + InstrInitNode(instr, instrument_options); + instr->async_mode = async_mode; + + return instr; +} + +/* Initialize a pre-allocated instrumentation structure. */ +void +InstrInitNode(NodeInstrumentation *instr, int instrument_options) +{ + memset(instr, 0, sizeof(NodeInstrumentation)); + InstrInitOptions(&instr->instr, instrument_options); +} + +/* Entry to a plan node. If you modify this, check InstrNodeSetupExecProcNode. */ +void +InstrStartNode(NodeInstrumentation *instr) +{ + InstrStart(&instr->instr); +} + +/* + * Updates the node instrumentation time counter. + * + * Note this is different from InstrStop because total is only updated in + * InstrEndLoop. We need the separate counter variable because we need to + * calculate start-up time for the first tuple in each cycle, and then + * accumulate it together. + */ +static inline void +InstrStopNodeTimer(NodeInstrumentation *instr) +{ + instr_time endtime; + + Assert(!INSTR_TIME_IS_ZERO(instr->instr.starttime)); + + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->instr.starttime); + INSTR_TIME_SET_ZERO(instr->instr.starttime); + + /* + * Is this the first tuple of this cycle? + * + * In async mode, if the plan node hadn't emitted any tuples before, this + * might be the first tuple + */ + if (!instr->running || (instr->async_mode && instr->tuplecount < 1.0)) + instr->firsttuple = instr->counter; +} + +/* Exit from a plan node. If you modify this, check InstrNodeSetupExecProcNode. */ +void +InstrStopNode(NodeInstrumentation *instr, double nTuples) +{ + if (instr->instr.need_timer) + InstrStopNodeTimer(instr); + + /* Only pop the stack, accumulation runs in InstrFinalizeNode */ + if (instr->instr.need_bufusage || instr->instr.need_walusage) + InstrPopStack(&instr->instr); + + instr->running = true; + + /* count the returned tuples */ + instr->tuplecount += nTuples; +} + +/* Add per-node instrumentation to the parent and move into per-query memory context */ +NodeInstrumentation * +InstrFinalizeNode(NodeInstrumentation *instr, Instrumentation *parent) +{ + NodeInstrumentation *dst = palloc(sizeof(NodeInstrumentation)); + + memcpy(dst, instr, sizeof(NodeInstrumentation)); + pfree(instr); + + /* Accumulate node's buffer/WAL usage to the parent */ + if (dst->instr.need_bufusage || dst->instr.need_walusage) + InstrAccum(parent, &dst->instr); + + return dst; } /* Update tuple count */ void -InstrUpdateTupleCount(Instrumentation *instr, double nTuples) +InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples) { /* count the returned tuples */ instr->tuplecount += nTuples; @@ -141,24 +413,24 @@ InstrUpdateTupleCount(Instrumentation *instr, double nTuples) /* Finish a run cycle for a plan node */ void -InstrEndLoop(Instrumentation *instr) +InstrEndLoop(NodeInstrumentation *instr) { /* Skip if nothing has happened, or already shut down */ if (!instr->running) return; - if (!INSTR_TIME_IS_ZERO(instr->starttime)) - elog(ERROR, "InstrEndLoop called on running node"); + /* Ensure InstrNodeStop was called */ + Assert(INSTR_TIME_IS_ZERO(instr->instr.starttime)); /* Accumulate per-cycle statistics into totals */ INSTR_TIME_ADD(instr->startup, instr->firsttuple); - INSTR_TIME_ADD(instr->total, instr->counter); + INSTR_TIME_ADD(instr->instr.total, instr->counter); instr->ntuples += instr->tuplecount; instr->nloops += 1; /* Reset for next cycle (if any) */ instr->running = false; - INSTR_TIME_SET_ZERO(instr->starttime); + INSTR_TIME_SET_ZERO(instr->instr.starttime); INSTR_TIME_SET_ZERO(instr->counter); INSTR_TIME_SET_ZERO(instr->firsttuple); instr->tuplecount = 0; @@ -166,7 +438,7 @@ InstrEndLoop(Instrumentation *instr) /* aggregate instrumentation information */ void -InstrAggNode(Instrumentation *dst, Instrumentation *add) +InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add) { if (!dst->running && add->running) { @@ -181,7 +453,7 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add) dst->tuplecount += add->tuplecount; INSTR_TIME_ADD(dst->startup, add->startup); - INSTR_TIME_ADD(dst->total, add->total); + INSTR_TIME_ADD(dst->instr.total, add->instr.total); dst->ntuples += add->ntuples; dst->ntuples2 += add->ntuples2; dst->nloops += add->nloops; @@ -189,41 +461,175 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add) dst->nfiltered2 += add->nfiltered2; /* Add delta of buffer usage since entry to node's totals */ - if (dst->need_bufusage) - BufferUsageAdd(&dst->bufusage, &add->bufusage); + if (dst->instr.need_bufusage) + BufferUsageAdd(&dst->instr.bufusage, &add->instr.bufusage); - if (dst->need_walusage) - WalUsageAdd(&dst->walusage, &add->walusage); + if (dst->instr.need_walusage) + WalUsageAdd(&dst->instr.walusage, &add->instr.walusage); +} + +/* + * Specialized handling of instrumented ExecProcNode + * + * These functions are equivalent to running ExecProcNodeReal wrapped in + * InstrStartNode and InstrStopNode, but avoid the conditionals in the hot path + * by checking the instrumentation options when the ExecProcNode pointer gets + * first set, and then using a special-purpose function for each. This results + * in a more optimized set of compiled instructions. + */ + +#include "executor/tuptable.h" +#include "nodes/execnodes.h" + +/* Simplified pop: restore saved state instead of re-deriving from array */ +static inline void +InstrPopStackTo(Instrumentation *prev) +{ + Assert(instr_stack.stack_size > 0); + instr_stack.stack_size--; + instr_stack.current = prev; +} + +static TupleTableSlot * +ExecProcNodeInstrFull(PlanState *node) +{ + NodeInstrumentation *instr = node->instrument; + Instrumentation *prev = instr_stack.current; + TupleTableSlot *result; + + InstrPushStack(&instr->instr); + InstrStartTimer(&instr->instr); + + result = node->ExecProcNodeReal(node); + + InstrStopNodeTimer(instr); + InstrPopStackTo(prev); + + instr->running = true; + if (!TupIsNull(result)) + instr->tuplecount += 1.0; + + return result; +} + +static TupleTableSlot * +ExecProcNodeInstrRowsBuffersWalOnly(PlanState *node) +{ + NodeInstrumentation *instr = node->instrument; + Instrumentation *prev = instr_stack.current; + TupleTableSlot *result; + + InstrPushStack(&instr->instr); + + result = node->ExecProcNodeReal(node); + + InstrPopStackTo(prev); + + instr->running = true; + if (!TupIsNull(result)) + instr->tuplecount += 1.0; + + return result; +} + +static TupleTableSlot * +ExecProcNodeInstrRowsTimerOnly(PlanState *node) +{ + NodeInstrumentation *instr = node->instrument; + TupleTableSlot *result; + + InstrStartTimer(&instr->instr); + + result = node->ExecProcNodeReal(node); + + InstrStopNodeTimer(instr); + + instr->running = true; + if (!TupIsNull(result)) + instr->tuplecount += 1.0; + + return result; +} + +static TupleTableSlot * +ExecProcNodeInstrRowsOnly(PlanState *node) +{ + NodeInstrumentation *instr = node->instrument; + TupleTableSlot *result; + + result = node->ExecProcNodeReal(node); + + instr->running = true; + if (!TupIsNull(result)) + instr->tuplecount += 1.0; + + return result; +} + +/* + * Returns an ExecProcNode wrapper that performs instrumentation calls, + * tailored to the instrumentation options enabled for the node. + */ +ExecProcNodeMtd +InstrNodeSetupExecProcNode(NodeInstrumentation *instr) +{ + bool need_timer = instr->instr.need_timer; + bool need_buf = (instr->instr.need_bufusage || + instr->instr.need_walusage); + + if (need_timer && need_buf) + return ExecProcNodeInstrFull; + else if (need_buf) + return ExecProcNodeInstrRowsBuffersWalOnly; + else if (need_timer) + return ExecProcNodeInstrRowsTimerOnly; + else + return ExecProcNodeInstrRowsOnly; +} + +/* Trigger instrumentation handling */ +TriggerInstrumentation * +InstrAllocTrigger(int n, int instrument_options) +{ + TriggerInstrumentation *tginstr = palloc0(n * sizeof(TriggerInstrumentation)); + int i; + + for (i = 0; i < n; i++) + InstrInitOptions(&tginstr[i].instr, instrument_options); + + return tginstr; } -/* note current values during parallel executor startup */ void -InstrStartParallelQuery(void) +InstrStartTrigger(TriggerInstrumentation *tginstr) { - save_pgBufferUsage = pgBufferUsage; - save_pgWalUsage = pgWalUsage; + InstrStart(&tginstr->instr); } -/* report usage after parallel executor shutdown */ void -InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +InstrStopTrigger(TriggerInstrumentation *tginstr, int firings) { - memset(bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage); - memset(walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage); + /* + * This trigger may be called again, so we don't finalize instrumentation + * here. Accumulation to the parent happens at ExecutorFinish through + * ExecFinalizeTriggerInstrumentation. + */ + InstrStop(&tginstr->instr); + tginstr->firings += firings; } -/* accumulate work done by workers in leader's stats */ void -InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +InstrAccum(Instrumentation *dst, Instrumentation *add) { - BufferUsageAdd(&pgBufferUsage, bufusage); - WalUsageAdd(&pgWalUsage, walusage); + Assert(dst != NULL); + Assert(add != NULL); + + BufferUsageAdd(&dst->bufusage, &add->bufusage); + WalUsageAdd(&dst->walusage, &add->walusage); } /* dst += add */ -static void +void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) { dst->shared_blks_hit += add->shared_blks_hit; @@ -244,39 +650,9 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time); } -/* dst += add - sub */ +/* dst += add */ void -BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, - const BufferUsage *sub) -{ - dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit; - dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read; - dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied; - dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written; - dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit; - dst->local_blks_read += add->local_blks_read - sub->local_blks_read; - dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied; - dst->local_blks_written += add->local_blks_written - sub->local_blks_written; - dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; - dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time, - add->shared_blk_read_time, sub->shared_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time, - add->shared_blk_write_time, sub->shared_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_read_time, - add->local_blk_read_time, sub->local_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_write_time, - add->local_blk_write_time, sub->local_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time, - add->temp_blk_read_time, sub->temp_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time, - add->temp_blk_write_time, sub->temp_blk_write_time); -} - -/* helper functions for WAL usage accumulation */ -static void -WalUsageAdd(WalUsage *dst, WalUsage *add) +WalUsageAdd(WalUsage *dst, const WalUsage *add) { dst->wal_bytes += add->wal_bytes; dst->wal_records += add->wal_records; diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index a616abff04cf0..4794095092e09 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -83,7 +83,9 @@ IndexNext(IndexScanState *node) ExprContext *econtext; ScanDirection direction; IndexScanDesc scandesc; + ItemPointer tid; TupleTableSlot *slot; + bool found; /* * extract necessary information from index scan node @@ -128,8 +130,22 @@ IndexNext(IndexScanState *node) /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { + if (node->iss_InstrumentTable) + InstrPushStack(&node->iss_InstrumentTable->instr); + + found = index_fetch_heap(scandesc, slot); + + if (node->iss_InstrumentTable) + InstrPopStack(&node->iss_InstrumentTable->instr); + + if (unlikely(!found)) + continue; + + if (scandesc->xs_heap_continue) + elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); + CHECK_FOR_INTERRUPTS(); /* @@ -812,6 +828,11 @@ ExecEndIndexScan(IndexScanState *node) * which will have a new IndexOnlyScanState and zeroed stats. */ winstrument->nsearches += node->iss_Instrument.nsearches; + if (node->iss_InstrumentTable) + { + BufferUsageAdd(&winstrument->worker_table_bufusage, &node->iss_InstrumentTable->instr.bufusage); + WalUsageAdd(&winstrument->worker_table_walusage, &node->iss_InstrumentTable->instr.walusage); + } } /* @@ -1819,4 +1840,14 @@ ExecIndexScanRetrieveInstrumentation(IndexScanState *node) SharedInfo->num_workers * sizeof(IndexScanInstrumentation); node->iss_SharedInfo = palloc(size); memcpy(node->iss_SharedInfo, SharedInfo, size); + + /* Aggregate workers' table buffer/WAL usage into leader's entry */ + if (node->iss_InstrumentTable) + for (int i = 0; i < node->iss_SharedInfo->num_workers; i++) + { + BufferUsageAdd(&node->iss_InstrumentTable->instr.bufusage, + &node->iss_SharedInfo->winstrument[i].worker_table_bufusage); + WalUsageAdd(&node->iss_InstrumentTable->instr.walusage, + &node->iss_SharedInfo->winstrument[i].worker_table_walusage); + } } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 5f3d083e93886..1ff50aecc10e1 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -827,7 +827,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN { PinLocalBuffer(bufHdr, true); - pgBufferUsage.local_blks_hit++; + INSTR_BUFUSAGE_INCR(local_blks_hit); return true; } @@ -848,7 +848,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN { if (BufferTagsEqual(&tag, &bufHdr->tag)) { - pgBufferUsage.shared_blks_hit++; + INSTR_BUFUSAGE_INCR(shared_blks_hit); return true; } UnpinBuffer(bufHdr); @@ -1249,21 +1249,21 @@ PinBufferForBlock(Relation rel, { bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr); if (*foundPtr) - pgBufferUsage.local_blks_hit++; + INSTR_BUFUSAGE_INCR(local_blks_hit); } else { bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum, strategy, foundPtr, io_context); if (*foundPtr) - pgBufferUsage.shared_blks_hit++; + INSTR_BUFUSAGE_INCR(shared_blks_hit); } if (rel) { /* - * While pgBufferUsage's "read" counter isn't bumped unless we reach - * WaitReadBuffers() (so, not for hits, and not for buffers that are - * zeroed instead), the per-relation stats always count them. + * While the current buffer usage "read" counter isn't bumped unless + * we reach WaitReadBuffers() (so, not for hits, and not for buffers + * that are zeroed instead), the per-relation stats always count them. */ pgstat_count_buffer_read(rel); if (*foundPtr) @@ -1990,9 +1990,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) true); if (persistence == RELPERSISTENCE_TEMP) - pgBufferUsage.local_blks_hit += 1; + INSTR_BUFUSAGE_INCR(local_blks_hit); else - pgBufferUsage.shared_blks_hit += 1; + INSTR_BUFUSAGE_INCR(shared_blks_hit); if (operation->rel) pgstat_count_buffer_hit(operation->rel); @@ -2060,9 +2060,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) io_start, 1, io_buffers_len * BLCKSZ); if (persistence == RELPERSISTENCE_TEMP) - pgBufferUsage.local_blks_read += io_buffers_len; + INSTR_BUFUSAGE_ADD(local_blks_read, io_buffers_len); else - pgBufferUsage.shared_blks_read += io_buffers_len; + INSTR_BUFUSAGE_ADD(shared_blks_read, io_buffers_len); /* * Track vacuum cost when issuing IO, not after waiting for it. @@ -2955,7 +2955,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, TerminateBufferIO(buf_hdr, false, BM_VALID, true, false); } - pgBufferUsage.shared_blks_written += extend_by; + INSTR_BUFUSAGE_ADD(shared_blks_written, extend_by); *extended_by = extend_by; @@ -3100,7 +3100,7 @@ MarkBufferDirty(Buffer buffer) */ if (!(old_buf_state & BM_DIRTY)) { - pgBufferUsage.shared_blks_dirtied++; + INSTR_BUFUSAGE_INCR(shared_blks_dirtied); if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } @@ -4529,7 +4529,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_WRITE, io_start, 1, BLCKSZ); - pgBufferUsage.shared_blks_written++; + INSTR_BUFUSAGE_INCR(shared_blks_written); /* * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and @@ -5690,7 +5690,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) if (dirtied) { - pgBufferUsage.shared_blks_dirtied++; + INSTR_BUFUSAGE_INCR(shared_blks_dirtied); if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 04a540379a22f..e6054e745e819 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -217,7 +217,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln) /* Mark not-dirty */ TerminateLocalBufferIO(bufHdr, true, 0, false); - pgBufferUsage.local_blks_written++; + INSTR_BUFUSAGE_INCR(local_blks_written); } static Buffer @@ -478,7 +478,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr, *extended_by = extend_by; - pgBufferUsage.local_blks_written += extend_by; + INSTR_BUFUSAGE_ADD(local_blks_written, extend_by); return first_block; } @@ -509,7 +509,7 @@ MarkLocalBufferDirty(Buffer buffer) buf_state = pg_atomic_read_u64(&bufHdr->state); if (!(buf_state & BM_DIRTY)) - pgBufferUsage.local_blks_dirtied++; + INSTR_BUFUSAGE_INCR(local_blks_dirtied); buf_state |= BM_DIRTY; diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index c4afe4d368a34..8b501dfcadd02 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -475,13 +475,13 @@ BufFileLoadBuffer(BufFile *file) if (track_io_timing) { INSTR_TIME_SET_CURRENT(io_time); - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_read_time, io_time, io_start); + INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_read_time, io_time, io_start); } /* we choose not to advance curOffset here */ if (file->nbytes > 0) - pgBufferUsage.temp_blks_read++; + INSTR_BUFUSAGE_INCR(temp_blks_read); } /* @@ -549,13 +549,13 @@ BufFileDumpBuffer(BufFile *file) if (track_io_timing) { INSTR_TIME_SET_CURRENT(io_time); - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_write_time, io_time, io_start); + INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_write_time, io_time, io_start); } file->curOffset += bytestowrite; wpos += bytestowrite; - pgBufferUsage.temp_blks_written++; + INSTR_BUFUSAGE_INCR(temp_blks_written); } file->dirty = false; diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index 28de24538dced..60400f0c81f42 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -114,9 +114,9 @@ pgstat_prepare_io_time(bool track_io_guc) * pg_stat_database only counts block read and write times, these are done for * IOOP_READ, IOOP_WRITE and IOOP_EXTEND. * - * pgBufferUsage is used for EXPLAIN. pgBufferUsage has write and read stats - * for shared, local and temporary blocks. pg_stat_io does not track the - * activity of temporary blocks, so these are ignored here. + * Executor instrumentation is used for EXPLAIN. Buffer usage tracked there has + * write and read stats for shared, local and temporary blocks. pg_stat_io + * does not track the activity of temporary blocks, so these are ignored here. */ void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, @@ -135,17 +135,17 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, { pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time)); if (io_object == IOOBJECT_RELATION) - INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(shared_blk_write_time, io_time); else if (io_object == IOOBJECT_TEMP_RELATION) - INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(local_blk_write_time, io_time); } else if (io_op == IOOP_READ) { pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time)); if (io_object == IOOBJECT_RELATION) - INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(shared_blk_read_time, io_time); else if (io_object == IOOBJECT_TEMP_RELATION) - INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(local_blk_read_time, io_time); } } diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h index d3a572428449d..340029a203422 100644 --- a/src/include/executor/execdesc.h +++ b/src/include/executor/execdesc.h @@ -51,8 +51,8 @@ typedef struct QueryDesc /* This field is set by ExecutePlan */ bool already_executed; /* true if previously executed */ - /* This is always set NULL by the core system, but plugins can change it */ - struct Instrumentation *totaltime; /* total time spent in ExecutorRun */ + /* This field is set by ExecutorRun, or plugins */ + struct QueryInstrumentation *totaltime; /* total time spent in ExecutorRun */ } QueryDesc; /* in pquery.c */ diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index d46ba59895d62..c22199c686934 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -300,6 +300,8 @@ extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function); extern Node *MultiExecProcNode(PlanState *node); extern void ExecEndNode(PlanState *node); extern void ExecShutdownNode(PlanState *node); +extern void ExecRememberNodeInstrumentation(PlanState *node, QueryInstrumentation *parent); +extern void ExecFinalizeNodeInstrumentation(PlanState *node); extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node); diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 9759f3ea5d8d9..ac7f0d21c374e 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -13,6 +13,7 @@ #ifndef INSTRUMENT_H #define INSTRUMENT_H +#include "lib/ilist.h" #include "portability/instr_time.h" @@ -67,56 +68,260 @@ typedef enum InstrumentOption INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; +/* + * Instrumentation base class for capturing time and WAL/buffer usage + * + * If used directly: + * - Allocate on the stack and zero initialize the struct + * - Call InstrInitOptions to set instrumentation options + * - Call InstrStart before the activity you want to measure + * - Call InstrStop / InstrStopFinalize after the activity to capture totals + * + * InstrStart/InstrStop may be called multiple times. The last stop call must + * be to InstrStopFinalize to ensure parent stack entries get the accumulated + * totals. If there is risk of transaction aborts you must call + * InstrStopFinalize in a PG_TRY/PG_FINALLY block to avoid corrupting the + * instrumentation stack. + * + * In a query context use QueryInstrumentation instead, which handles aborts + * using the resource owner logic. + */ typedef struct Instrumentation { - /* Parameters set at node creation: */ + /* Parameters set at creation: */ bool need_timer; /* true if we need timer data */ bool need_bufusage; /* true if we need buffer usage data */ bool need_walusage; /* true if we need WAL usage data */ + /* Internal state keeping: */ + instr_time starttime; /* start time of last InstrStart */ + /* Accumulated statistics: */ + instr_time total; /* total runtime */ + BufferUsage bufusage; /* total buffer usage */ + WalUsage walusage; /* total WAL usage */ +} Instrumentation; + +/* + * Query-related instrumentation tracking. + * + * Usage: + * - Allocate on the heap using InstrQueryAlloc (required for abort handling) + * - Call InstrQueryStart before the activity you want to measure + * - Call InstrQueryStop / InstrQueryStopFinalize afterwards to capture totals + * + * InstrQueryStart/InstrQueryStop may be called multiple times. The last stop + * call must be to InstrQueryStopFinalize to ensure parent stack entries get + * the accumulated totals. + * + * Uses resource owner mechanism for handling aborts, as such, the caller + * *must* not exit out of the top level transaction after having called + * InstrQueryStart, without first calling InstrQueryStop. In the case of a + * transaction abort, logic equivalent to InstrQueryStop will be called + * automatically. + */ +struct ResourceOwnerData; +typedef struct QueryInstrumentation +{ + Instrumentation instr; + + /* Resource owner used for cleanup for aborts between InstrStart/InstrStop */ + struct ResourceOwnerData *owner; + + /* + * NodeInstrumentation child entries that need to be cleaned up on abort, + * since they are not registered as a resource owner themselves. + */ + slist_head unfinalized_children; /* head of unfinalized children list */ +} QueryInstrumentation; + +/* + * Specialized instrumentation for per-node execution statistics + * + * Relies on an outer QueryInstrumentation having been set up to handle the + * stack used for WAL/buffer usage statistics, and relies on it for managing + * aborts. Solely intended for the executor and anyone reporting about its + * activities (e.g. EXPLAIN ANALYZE). + */ +typedef struct NodeInstrumentation +{ + Instrumentation instr; + /* Parameters set at node creation: */ bool async_mode; /* true if node is in async mode */ /* Info about current plan cycle: */ bool running; /* true if we've completed first tuple */ - instr_time starttime; /* start time of current iteration of node */ instr_time counter; /* accumulated runtime for this node */ instr_time firsttuple; /* time for first tuple of this cycle */ double tuplecount; /* # of tuples emitted so far this cycle */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics across all completed cycles: */ instr_time startup; /* total startup time */ - instr_time total; /* total time */ double ntuples; /* total tuples produced */ double ntuples2; /* secondary node-specific tuple counter */ double nloops; /* # of run cycles for this node */ double nfiltered1; /* # of tuples removed by scanqual or joinqual */ double nfiltered2; /* # of tuples removed by "other" quals */ - BufferUsage bufusage; /* total buffer usage */ - WalUsage walusage; /* total WAL usage */ -} Instrumentation; -typedef struct WorkerInstrumentation + /* Abort handling */ + slist_node unfinalized_node; /* node in parent's unfinalized list */ +} NodeInstrumentation; + +/* + * Care must be taken with any pointers contained within this struct, as this + * gets copied across processes during parallel query execution. + */ +typedef struct WorkerNodeInstrumentation { int num_workers; /* # of structures that follow */ - Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; -} WorkerInstrumentation; + NodeInstrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; +} WorkerNodeInstrumentation; + +typedef struct TriggerInstrumentation +{ + Instrumentation instr; + int firings; /* number of times the instrumented trigger + * was fired */ +} TriggerInstrumentation; + +/* + * Dynamic array-based stack for tracking current WAL/buffer usage context. + * + * When the stack is empty, 'current' points to instr_top which accumulates + * session-level totals. + */ +typedef struct InstrStackState +{ + int stack_space; /* allocated capacity of entries array */ + int stack_size; /* current number of entries */ + + Instrumentation **entries; /* dynamic array of pointers */ + Instrumentation *current; /* top of stack, or &instr_top when empty */ +} InstrStackState; -extern PGDLLIMPORT BufferUsage pgBufferUsage; extern PGDLLIMPORT WalUsage pgWalUsage; -extern Instrumentation *InstrAlloc(int n, int instrument_options, - bool async_mode); -extern void InstrInit(Instrumentation *instr, int instrument_options); -extern void InstrStartNode(Instrumentation *instr); -extern void InstrStopNode(Instrumentation *instr, double nTuples); -extern void InstrUpdateTupleCount(Instrumentation *instr, double nTuples); -extern void InstrEndLoop(Instrumentation *instr); -extern void InstrAggNode(Instrumentation *dst, Instrumentation *add); -extern void InstrStartParallelQuery(void); -extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage); +/* + * The top instrumentation represents a running total of the current backend + * WAL/buffer usage information. This will not be updated immediately, but + * rather when the current stack entry gets accumulated which typically happens + * at query end. + * + * Care must be taken when utilizing this in the parallel worker context: + * Parallel workers will report back their instrumentation to the caller, + * and this gets added to the caller's stack. If this were to be used in the + * shared memory stats infrastructure it would need to be skipped on parallel + * workers to avoid double counting. + */ +extern PGDLLIMPORT Instrumentation instr_top; + +/* + * The instrumentation stack state. The 'current' field points to the + * currently active stack entry that is getting updated as activity happens, + * and will be accumulated to parent stacks when it gets finalized by + * InstrStop (for non-executor use cases), ExecFinalizeNodeInstrumentation + * (executor finish) or ResOwnerReleaseInstrumentation on abort. + */ +extern PGDLLIMPORT InstrStackState instr_stack; + +extern void InstrStackGrow(void); + +/* + * Pushes the stack so that all WAL/buffer usage updates go to the passed in + * instrumentation entry. + * + * Any caller using this directly must manage the passed in entry and call + * InstrPopStack on its own again, typically by using a PG_FINALLY block to + * ensure the stack gets reset via InstrPopStack on abort. Use InstrStart + * instead when you want automatic handling of abort cases using the resource + * owner infrastructure. + */ +static inline void +InstrPushStack(Instrumentation *instr) +{ + if (unlikely(instr_stack.stack_size == instr_stack.stack_space)) + InstrStackGrow(); + + instr_stack.entries[instr_stack.stack_size++] = instr; + instr_stack.current = instr; +} + +/* + * Pops the stack entry back to the previous one that was effective at + * InstrPushStack. + * + * Callers must ensure that no intermediate stack entries are skipped, to + * handle aborts correctly. If you're thinking of calling this in a PG_FINALLY + * block, instead call InstrPopAndFinalizeStack which can skip intermediate + * stack entries, or instead use InstrStart/InstrStop. + */ +static inline void +InstrPopStack(Instrumentation *instr) +{ + Assert(instr_stack.stack_size > 0); + Assert(instr_stack.entries[instr_stack.stack_size - 1] == instr); + instr_stack.stack_size--; + instr_stack.current = instr_stack.stack_size > 0 + ? instr_stack.entries[instr_stack.stack_size - 1] + : &instr_top; +} + +extern void InstrInitOptions(Instrumentation *instr, int instrument_options); +extern void InstrStart(Instrumentation *instr); +extern void InstrStop(Instrumentation *instr); +extern void InstrStopFinalize(Instrumentation *instr); +extern void InstrAccum(Instrumentation *dst, Instrumentation *add); + +extern QueryInstrumentation *InstrQueryAlloc(int instrument_options); +extern void InstrQueryStart(QueryInstrumentation *instr); +extern void InstrQueryStop(QueryInstrumentation *instr); +extern QueryInstrumentation *InstrQueryStopFinalize(QueryInstrumentation *instr); +extern void InstrQueryRememberNode(QueryInstrumentation *parent, NodeInstrumentation *instr); + +pg_nodiscard extern QueryInstrumentation *InstrStartParallelQuery(void); +extern void InstrEndParallelQuery(QueryInstrumentation *qinstr, BufferUsage *bufusage, WalUsage *walusage); extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); -extern void BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, const BufferUsage *sub); + +extern NodeInstrumentation *InstrAllocNode(int instrument_options, + bool async_mode); +extern void InstrInitNode(NodeInstrumentation *instr, int instrument_options); +extern void InstrStartNode(NodeInstrumentation *instr); +extern void InstrStopNode(NodeInstrumentation *instr, double nTuples); +extern NodeInstrumentation *InstrFinalizeNode(NodeInstrumentation *instr, Instrumentation *parent); +extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples); +extern void InstrEndLoop(NodeInstrumentation *instr); +extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add); + +typedef struct TupleTableSlot TupleTableSlot; +typedef struct PlanState PlanState; +typedef TupleTableSlot *(*ExecProcNodeMtd) (PlanState *pstate); +extern ExecProcNodeMtd InstrNodeSetupExecProcNode(NodeInstrumentation *instr); + +extern TriggerInstrumentation *InstrAllocTrigger(int n, int instrument_options); +extern void InstrStartTrigger(TriggerInstrumentation *tginstr); +extern void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings); + +extern void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); +extern void WalUsageAdd(WalUsage *dst, const WalUsage *add); extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub); +#define INSTR_BUFUSAGE_INCR(fld) do { \ + instr_stack.current->bufusage.fld++; \ + } while(0) +#define INSTR_BUFUSAGE_ADD(fld,val) do { \ + instr_stack.current->bufusage.fld += val; \ + } while(0) +#define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ + INSTR_TIME_ADD(instr_stack.current->bufusage.fld, val); \ + } while (0) +#define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ + INSTR_TIME_ACCUM_DIFF(instr_stack.current->bufusage.fld, endval, startval); \ + } while (0) + +#define INSTR_WALUSAGE_INCR(fld) do { \ + pgWalUsage.fld++; \ + instr_stack.current->walusage.fld++; \ + } while(0) +#define INSTR_WALUSAGE_ADD(fld,val) do { \ + pgWalUsage.fld += val; \ + instr_stack.current->walusage.fld += val; \ + } while(0) + #endif /* INSTRUMENT_H */ diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 8847d7f94fa57..170b6143ef602 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -18,6 +18,8 @@ #ifndef INSTRUMENT_NODE_H #define INSTRUMENT_NODE_H +#include "executor/instrument.h" + /* --------------------- * Instrumentation information for aggregate function execution @@ -48,6 +50,10 @@ typedef struct IndexScanInstrumentation { /* Index search count (incremented with pgstat_count_index_scan call) */ uint64 nsearches; + + /* Used for passing iss_InstrumentTableStack data from parallel workers */ + BufferUsage worker_table_bufusage; + WalUsage worker_table_walusage; } IndexScanInstrumentation; /* diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 63c067d5aae61..c778641c13dec 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -524,7 +524,7 @@ typedef struct ResultRelInfo ExprState **ri_TrigWhenExprs; /* optional runtime measurements for triggers */ - Instrumentation *ri_TrigInstrument; + TriggerInstrumentation *ri_TrigInstrument; /* On-demand created slots for triggers / returning processing */ TupleTableSlot *ri_ReturningSlot; /* for trigger output tuples */ @@ -1175,8 +1175,10 @@ typedef struct PlanState ExecProcNodeMtd ExecProcNodeReal; /* actual function, if above is a * wrapper */ - Instrumentation *instrument; /* Optional runtime stats for this node */ - WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */ + NodeInstrumentation *instrument; /* Optional runtime stats for this + * node */ + WorkerNodeInstrumentation *worker_instrument; /* per-worker + * instrumentation */ /* Per-worker JIT instrumentation */ struct SharedJitInstrumentation *worker_jit_instrument; @@ -1726,6 +1728,13 @@ typedef struct IndexScanState IndexScanInstrumentation iss_Instrument; SharedIndexScanInstrumentation *iss_SharedInfo; + /* + * Instrumentation utilized for tracking table access. This is separate + * from iss_Instrument since it needs to be allocated in the right context + * and IndexScanInstrumentation shouldn't contain pointers. + */ + NodeInstrumentation *iss_InstrumentTable; + /* These are needed for re-checking ORDER BY expr ordering */ pairingheap *iss_ReorderQueue; bool iss_ReachedEnd; diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h index eb6033b4fdb65..5463bc921f06e 100644 --- a/src/include/utils/resowner.h +++ b/src/include/utils/resowner.h @@ -75,6 +75,7 @@ typedef uint32 ResourceReleasePriority; #define RELEASE_PRIO_SNAPSHOT_REFS 500 #define RELEASE_PRIO_FILES 600 #define RELEASE_PRIO_WAITEVENTSETS 700 +#define RELEASE_PRIO_INSTRUMENTATION 800 /* 0 is considered invalid */ #define RELEASE_PRIO_FIRST 1 diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out index 7c1f26b182cb0..e28e754369314 100644 --- a/src/test/regress/expected/explain.out +++ b/src/test/regress/expected/explain.out @@ -822,3 +822,200 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove (9 rows) reset work_mem; +-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL +-- EXCEPTION blocks, even after subtransaction rollback. +CREATE TEMP TABLE explain_exc_tab (a int, b char(20)); +INSERT INTO explain_exc_tab VALUES (0, 'zzz'); +CREATE FUNCTION explain_exc_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; +CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT explain_exc_func()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; +SELECT check_explain_exception_buffers() AS exception_buffers_visible; + exception_buffers_visible +--------------------------- + t +(1 row) + +-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation) +CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT check_explain_exception_buffers()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; +SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible; + exception_buffers_nested_visible +---------------------------------- + t +(1 row) + +DROP FUNCTION check_explain_exception_buffers_nested; +DROP FUNCTION check_explain_exception_buffers; +DROP FUNCTION explain_exc_func; +DROP TABLE explain_exc_tab; +-- Cursor instrumentation test. +-- Verify that buffer usage is correctly tracked through cursor execution paths. +-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun +-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down +-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor +-- scan should be correctly accumulated. +CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1; +CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$ +DECLARE + cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; +CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$ +DECLARE + cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; +CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$ +DECLARE + plan_json json; + node json; + direct_buf int; + noscroll_buf int; + scroll_buf int; +BEGIN + -- Direct scan: get leaf Seq Scan node buffers as baseline + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT * FROM cursor_buf_test' INTO plan_json; + node := plan_json->0->'Plan'; + WHILE node->'Plans' IS NOT NULL LOOP + node := node->'Plans'->0; + END LOOP; + direct_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_noscroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + noscroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Scrollable cursor path: ExecShutdownNode is skipped + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_scroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + scroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Both cursor paths should report buffer counts about as high as + -- the direct scan (same data plus minor catalog overhead), and not + -- double-counted (< 2x the direct scan) + RETURN QUERY SELECT + (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2), + (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2); +END; +$$ LANGUAGE plpgsql; +SELECT * FROM check_cursor_explain_buffers(); + noscroll_ok | scroll_ok +-------------+----------- + t | t +(1 row) + +DROP FUNCTION check_cursor_explain_buffers; +DROP FUNCTION cursor_noscroll_scan; +DROP FUNCTION cursor_scroll_scan; +DROP TABLE cursor_buf_test; +-- Parallel query buffer double-counting test. +-- +-- Compares serial Seq Scan buffers vs parallel Seq Scan buffers. +-- They scan the same table so the buffer count should be similar. +-- Double-counting would make the parallel count ~2x larger. +CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$ +DECLARE + plan_json json; + serial_buffers int; + parallel_buffers int; + node json; +BEGIN + -- Serial -- + SET LOCAL max_parallel_workers_per_gather = 0; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + serial_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + -- Parallel -- + SET LOCAL parallel_setup_cost = 0; + SET LOCAL parallel_tuple_cost = 0; + SET LOCAL min_parallel_table_scan_size = 0; + SET LOCAL max_parallel_workers_per_gather = 2; + SET LOCAL parallel_leader_participation = off; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + parallel_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1)); +END; +$$ LANGUAGE plpgsql; +SELECT * FROM check_parallel_explain_buffers(); + ratio +------- + 1 +(1 row) + +DROP FUNCTION check_parallel_explain_buffers; diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql index ebdab42604beb..cf5c6335a1922 100644 --- a/src/test/regress/sql/explain.sql +++ b/src/test/regress/sql/explain.sql @@ -188,3 +188,197 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove -- Test tuplestore storage usage in Window aggregate (memory and disk case, final result is disk) select explain_filter('explain (analyze,buffers off,costs off) select sum(n) over(partition by m) from (SELECT n < 3 as m, n from generate_series(1,2500) a(n))'); reset work_mem; + +-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL +-- EXCEPTION blocks, even after subtransaction rollback. +CREATE TEMP TABLE explain_exc_tab (a int, b char(20)); +INSERT INTO explain_exc_tab VALUES (0, 'zzz'); + +CREATE FUNCTION explain_exc_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; + +CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT explain_exc_func()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; + +SELECT check_explain_exception_buffers() AS exception_buffers_visible; + +-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation) +CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT check_explain_exception_buffers()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; + +SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible; + +DROP FUNCTION check_explain_exception_buffers_nested; +DROP FUNCTION check_explain_exception_buffers; +DROP FUNCTION explain_exc_func; +DROP TABLE explain_exc_tab; + +-- Cursor instrumentation test. +-- Verify that buffer usage is correctly tracked through cursor execution paths. +-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun +-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down +-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor +-- scan should be correctly accumulated. + +CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1; + +CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$ +DECLARE + cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; + +CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$ +DECLARE + cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; + +CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$ +DECLARE + plan_json json; + node json; + direct_buf int; + noscroll_buf int; + scroll_buf int; +BEGIN + -- Direct scan: get leaf Seq Scan node buffers as baseline + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT * FROM cursor_buf_test' INTO plan_json; + node := plan_json->0->'Plan'; + WHILE node->'Plans' IS NOT NULL LOOP + node := node->'Plans'->0; + END LOOP; + direct_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_noscroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + noscroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Scrollable cursor path: ExecShutdownNode is skipped + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_scroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + scroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Both cursor paths should report buffer counts about as high as + -- the direct scan (same data plus minor catalog overhead), and not + -- double-counted (< 2x the direct scan) + RETURN QUERY SELECT + (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2), + (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2); +END; +$$ LANGUAGE plpgsql; + +SELECT * FROM check_cursor_explain_buffers(); + +DROP FUNCTION check_cursor_explain_buffers; +DROP FUNCTION cursor_noscroll_scan; +DROP FUNCTION cursor_scroll_scan; +DROP TABLE cursor_buf_test; + +-- Parallel query buffer double-counting test. +-- +-- Compares serial Seq Scan buffers vs parallel Seq Scan buffers. +-- They scan the same table so the buffer count should be similar. +-- Double-counting would make the parallel count ~2x larger. +CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$ +DECLARE + plan_json json; + serial_buffers int; + parallel_buffers int; + node json; +BEGIN + -- Serial -- + SET LOCAL max_parallel_workers_per_gather = 0; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + serial_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + -- Parallel -- + SET LOCAL parallel_setup_cost = 0; + SET LOCAL parallel_tuple_cost = 0; + SET LOCAL min_parallel_table_scan_size = 0; + SET LOCAL max_parallel_workers_per_gather = 2; + SET LOCAL parallel_leader_participation = off; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + parallel_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1)); +END; +$$ LANGUAGE plpgsql; + +SELECT * FROM check_parallel_explain_buffers(); + +DROP FUNCTION check_parallel_explain_buffers; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 77e3c04144e8a..5580a080210af 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1320,6 +1320,7 @@ InjectionPointSharedState InjectionPointsCtl InlineCodeBlock InsertStmt +InstrStackState Instrumentation Int128AggState Int8TransTypeData @@ -1785,6 +1786,7 @@ NextSampleBlock_function NextSampleTuple_function NextValueExpr Node +NodeInstrumentation NodeTag NonEmptyRange NoneCompressorState @@ -2427,6 +2429,7 @@ QueryCompletion QueryDesc QueryEnvironment QueryInfo +QueryInstrumentation QueryItem QueryItemType QueryMode @@ -3156,6 +3159,7 @@ TriggerDesc TriggerEvent TriggerFlags TriggerInfo +TriggerInstrumentation TriggerTransition TruncateStmt TsmRoutine @@ -3376,9 +3380,9 @@ WorkTableScan WorkTableScanState WorkerInfo WorkerInfoData -WorkerInstrumentation WorkerJobDumpPtrType WorkerJobRestorePtrType +WorkerNodeInstrumentation Working_State WriteBufPtrType WriteBytePtrType