diff --git a/contrib/Makefile b/contrib/Makefile index dd04c20acd25b..ac04f9eb99799 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -36,6 +36,7 @@ SUBDIRS = \ pg_overexplain \ pg_plan_advice \ pg_prewarm \ + pg_session_buffer_usage \ pg_stat_statements \ pg_surgery \ pg_trgm \ diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c index e856cd35a6f0f..4be81489ff4fb 100644 --- a/contrib/auto_explain/auto_explain.c +++ b/contrib/auto_explain/auto_explain.c @@ -305,19 +305,9 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags) if (auto_explain_enabled()) { - /* - * Set up to track total elapsed time in ExecutorRun. Make sure the - * space is allocated in the per-query context so it will go away at - * ExecutorEnd. - */ + /* Set up to track total elapsed time in ExecutorRun. */ if (queryDesc->totaltime == NULL) - { - MemoryContext oldcxt; - - oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false); - MemoryContextSwitchTo(oldcxt); - } + queryDesc->totaltime = InstrQueryAlloc(INSTRUMENT_ALL); } } @@ -381,14 +371,8 @@ explain_ExecutorEnd(QueryDesc *queryDesc) */ oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - /* - * Make sure stats accumulation is done. (Note: it's okay if several - * levels of hook all do this.) - */ - InstrEndLoop(queryDesc->totaltime); - /* Log plan if duration is exceeded. */ - msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total); + msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->instr.total); if (msec >= auto_explain_log_min_duration) { ExplainState *es = NewExplainState(); diff --git a/contrib/meson.build b/contrib/meson.build index 5a752eac34711..2b1399e56f320 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -51,6 +51,7 @@ subdir('pg_overexplain') subdir('pg_plan_advice') subdir('pg_prewarm') subdir('pgrowlocks') +subdir('pg_session_buffer_usage') subdir('pg_stat_statements') subdir('pgstattuple') subdir('pg_surgery') diff --git a/contrib/pg_session_buffer_usage/Makefile b/contrib/pg_session_buffer_usage/Makefile new file mode 100644 index 0000000000000..75bd8e09b3d3d --- /dev/null +++ b/contrib/pg_session_buffer_usage/Makefile @@ -0,0 +1,23 @@ +# contrib/pg_session_buffer_usage/Makefile + +MODULE_big = pg_session_buffer_usage +OBJS = \ + $(WIN32RES) \ + pg_session_buffer_usage.o + +EXTENSION = pg_session_buffer_usage +DATA = pg_session_buffer_usage--1.0.sql +PGFILEDESC = "pg_session_buffer_usage - show buffer usage statistics for the current session" + +REGRESS = pg_session_buffer_usage + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_session_buffer_usage +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out b/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out new file mode 100644 index 0000000000000..5e3f90a7b6941 --- /dev/null +++ b/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out @@ -0,0 +1,342 @@ +LOAD 'pg_session_buffer_usage'; +CREATE EXTENSION pg_session_buffer_usage; +-- Verify all columns are non-negative +SELECT count(*) = 1 AS ok FROM pg_session_buffer_usage() +WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0 + AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0 + AND local_blks_hit >= 0 AND local_blks_read >= 0 + AND local_blks_dirtied >= 0 AND local_blks_written >= 0 + AND temp_blks_read >= 0 AND temp_blks_written >= 0 + AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0 + AND local_blk_read_time >= 0 AND local_blk_write_time >= 0 + AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0; + ok +---- + t +(1 row) + +-- Verify counters increase after buffer activity +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +CREATE TEMP TABLE test_buf_activity (id int, data text); +INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i; +SELECT count(*) FROM test_buf_activity; + count +------- + 1000 +(1 row) + +SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased +FROM pg_session_buffer_usage(); + blocks_increased +------------------ + t +(1 row) + +DROP TABLE test_buf_activity; +-- Parallel query test +CREATE TABLE par_dc_tab (a int, b char(200)); +INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; +SELECT count(*) FROM par_dc_tab; + count +------- + 5000 +(1 row) + +-- Measure serial scan delta (leader does all the work) +SET max_parallel_workers_per_gather = 0; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT count(*) FROM par_dc_tab; + count +------- + 5000 +(1 row) + +CREATE TEMP TABLE dc_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); +-- Measure parallel scan delta with leader NOT participating in scanning. +-- Workers do all table scanning; leader only runs the Gather node. +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT count(*) FROM par_dc_tab; + count +------- + 5000 +(1 row) + +-- Confirm we got a similar hit counter through parallel worker accumulation +SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2 + AS leader_buffers_match +FROM pg_session_buffer_usage(), dc_serial_result s; + leader_buffers_match +---------------------- + t +(1 row) + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; +DROP TABLE par_dc_tab, dc_serial_result; +-- +-- Abort/exception tests: verify buffer usage survives various error paths. +-- +-- Rolled-back divide-by-zero under EXPLAIN ANALYZE +CREATE TEMP TABLE exc_tab (a int, b char(20)); +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 FROM ins; +ERROR: division by zero +SELECT local_blks_dirtied > 0 AS exception_buffers_visible +FROM pg_session_buffer_usage(); + exception_buffers_visible +--------------------------- + t +(1 row) + +DROP TABLE exc_tab; +-- Unique constraint violation in regular query +CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20)); +INSERT INTO unique_tab VALUES (1, 'first'); +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +INSERT INTO unique_tab VALUES (1, 'duplicate'); +ERROR: duplicate key value violates unique constraint "unique_tab_a_key" +DETAIL: Key (a)=(1) already exists. +SELECT local_blks_hit > 0 AS unique_violation_buffers_visible +FROM pg_session_buffer_usage(); + unique_violation_buffers_visible +---------------------------------- + t +(1 row) + +DROP TABLE unique_tab; +-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION) +CREATE TEMP TABLE subxact_tab (a int, b char(20)); +CREATE FUNCTION subxact_exc_func() RETURNS text AS $$ +BEGIN + BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a) + SELECT a / 0 FROM ins'; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT subxact_exc_func(); + subxact_exc_func +------------------ + caught +(1 row) + +SELECT local_blks_dirtied > 0 AS subxact_buffers_visible +FROM pg_session_buffer_usage(); + subxact_buffers_visible +------------------------- + t +(1 row) + +DROP FUNCTION subxact_exc_func; +DROP TABLE subxact_tab; +-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking +CREATE TEMP TABLE cursor_tab (a int, b char(200)); +INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i; +CREATE FUNCTION cursor_exc_func() RETURNS text AS $$ +DECLARE + rec record; + cnt int := 0; +BEGIN + BEGIN + FOR rec IN SELECT * FROM cursor_tab LOOP + cnt := cnt + 1; + IF cnt = 250 THEN + PERFORM 1 / 0; + END IF; + END LOOP; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught after ' || cnt || ' rows'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT cursor_exc_func(); + cursor_exc_func +----------------------- + caught after 250 rows +(1 row) + +SELECT local_blks_hit + local_blks_read > 0 + AS cursor_subxact_buffers_visible +FROM pg_session_buffer_usage(); + cursor_subxact_buffers_visible +-------------------------------- + t +(1 row) + +DROP FUNCTION cursor_exc_func; +DROP TABLE cursor_tab; +-- Trigger abort under EXPLAIN ANALYZE: verify that buffer activity from a +-- trigger that throws an error is still properly propagated. +CREATE TEMP TABLE trig_err_tab (a int); +CREATE TEMP TABLE trig_work_tab (a int, b char(200)); +INSERT INTO trig_work_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i; +-- Warm local buffers so trig_work_tab reads become hits +SELECT count(*) FROM trig_work_tab; + count +------- + 500 +(1 row) + +CREATE FUNCTION trig_err_func() RETURNS trigger AS $$ +BEGIN + PERFORM count(*) FROM trig_work_tab; + RAISE EXCEPTION 'trigger error'; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; +CREATE TRIGGER trig_err BEFORE INSERT ON trig_err_tab + FOR EACH ROW EXECUTE FUNCTION trig_err_func(); +-- Measure how many local buffer hits a scan of trig_work_tab produces +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT count(*) FROM trig_work_tab; + count +------- + 500 +(1 row) + +CREATE TEMP TABLE trig_serial_result AS +SELECT local_blks_hit AS serial_hits FROM pg_session_buffer_usage(); +-- Now trigger the same scan via a trigger that errors +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + INSERT INTO trig_err_tab VALUES (1); +ERROR: trigger error +CONTEXT: PL/pgSQL function trig_err_func() line 4 at RAISE +-- The trigger scanned trig_work_tab but errored before InstrStopTrigger ran. +-- InstrStopFinalize in the PG_CATCH ensures buffer data is still propagated. +SELECT local_blks_hit >= s.serial_hits / 2 + AS trigger_abort_buffers_propagated +FROM pg_session_buffer_usage(), trig_serial_result s; + trigger_abort_buffers_propagated +---------------------------------- + t +(1 row) + +DROP TABLE trig_err_tab, trig_work_tab, trig_serial_result; +DROP FUNCTION trig_err_func; +-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort. +-- +-- When a parallel worker aborts, InstrEndParallelQuery and +-- ExecParallelReportInstrumentation never run, so the worker's buffer +-- activity is never written to shared memory, despite the information having been +-- captured by the res owner release instrumentation handling. +CREATE TABLE par_abort_tab (a int, b char(200)); +INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; +-- Warm shared buffers so all reads become hits +SELECT count(*) FROM par_abort_tab; + count +------- + 5000 +(1 row) + +-- Measure serial scan delta as a reference (leader reads all blocks) +SET max_parallel_workers_per_gather = 0; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; +ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +CREATE TABLE par_abort_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); +-- Now force parallel with leader NOT participating in scanning +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; +SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; +ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +CONTEXT: parallel worker +RESET debug_parallel_query; +-- Workers scanned the table but aborted before reporting stats back. +-- The leader's delta should be much less than a serial scan, documenting +-- that worker buffer activity is lost on abort. +SELECT shared_blks_hit < s.serial_delta / 2 + AS worker_abort_buffers_not_propagated +FROM pg_session_buffer_usage(), par_abort_serial_result s; + worker_abort_buffers_not_propagated +------------------------------------- + t +(1 row) + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; +DROP TABLE par_abort_tab, par_abort_serial_result; +-- Cleanup +DROP EXTENSION pg_session_buffer_usage; diff --git a/contrib/pg_session_buffer_usage/meson.build b/contrib/pg_session_buffer_usage/meson.build new file mode 100644 index 0000000000000..34c7502beb463 --- /dev/null +++ b/contrib/pg_session_buffer_usage/meson.build @@ -0,0 +1,34 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +pg_session_buffer_usage_sources = files( + 'pg_session_buffer_usage.c', +) + +if host_system == 'windows' + pg_session_buffer_usage_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_session_buffer_usage', + '--FILEDESC', 'pg_session_buffer_usage - show buffer usage statistics for the current session',]) +endif + +pg_session_buffer_usage = shared_module('pg_session_buffer_usage', + pg_session_buffer_usage_sources, + kwargs: contrib_mod_args, +) +contrib_targets += pg_session_buffer_usage + +install_data( + 'pg_session_buffer_usage--1.0.sql', + 'pg_session_buffer_usage.control', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'pg_session_buffer_usage', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'pg_session_buffer_usage', + ], + }, +} diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql b/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql new file mode 100644 index 0000000000000..b300fdbc64397 --- /dev/null +++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql @@ -0,0 +1,31 @@ +/* contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_session_buffer_usage" to load this file. \quit + +CREATE FUNCTION pg_session_buffer_usage( + OUT shared_blks_hit bigint, + OUT shared_blks_read bigint, + OUT shared_blks_dirtied bigint, + OUT shared_blks_written bigint, + OUT local_blks_hit bigint, + OUT local_blks_read bigint, + OUT local_blks_dirtied bigint, + OUT local_blks_written bigint, + OUT temp_blks_read bigint, + OUT temp_blks_written bigint, + OUT shared_blk_read_time double precision, + OUT shared_blk_write_time double precision, + OUT local_blk_read_time double precision, + OUT local_blk_write_time double precision, + OUT temp_blk_read_time double precision, + OUT temp_blk_write_time double precision +) +RETURNS record +AS 'MODULE_PATHNAME', 'pg_session_buffer_usage' +LANGUAGE C PARALLEL RESTRICTED; + +CREATE FUNCTION pg_session_buffer_usage_reset() +RETURNS void +AS 'MODULE_PATHNAME', 'pg_session_buffer_usage_reset' +LANGUAGE C PARALLEL RESTRICTED; diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c new file mode 100644 index 0000000000000..f869956b3a9b0 --- /dev/null +++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c @@ -0,0 +1,95 @@ +/*------------------------------------------------------------------------- + * + * pg_session_buffer_usage.c + * show buffer usage statistics for the current session + * + * Copyright (c) 2026, PostgreSQL Global Development Group + * + * contrib/pg_session_buffer_usage/pg_session_buffer_usage.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "executor/instrument.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "utils/memutils.h" + +PG_MODULE_MAGIC_EXT( + .name = "pg_session_buffer_usage", + .version = PG_VERSION +); + +#define NUM_BUFFER_USAGE_COLUMNS 16 + +PG_FUNCTION_INFO_V1(pg_session_buffer_usage); +PG_FUNCTION_INFO_V1(pg_session_buffer_usage_reset); + +#define HAVE_INSTR_STACK 1 /* Change to 0 when testing before stack + * change */ + +/* + * SQL function: pg_session_buffer_usage() + * + * Returns a single row with all BufferUsage counters accumulated since the + * start of the session. Excludes any usage not yet added to the top of the + * stack (e.g. if this gets called inside a statement that also had buffer + * activity). + */ +Datum +pg_session_buffer_usage(PG_FUNCTION_ARGS) +{ + TupleDesc tupdesc; + Datum values[NUM_BUFFER_USAGE_COLUMNS]; + bool nulls[NUM_BUFFER_USAGE_COLUMNS]; + BufferUsage *usage; + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + memset(nulls, 0, sizeof(nulls)); + +#if HAVE_INSTR_STACK + usage = &instr_top.bufusage; +#else + usage = &pgBufferUsage; +#endif + + values[0] = Int64GetDatum(usage->shared_blks_hit); + values[1] = Int64GetDatum(usage->shared_blks_read); + values[2] = Int64GetDatum(usage->shared_blks_dirtied); + values[3] = Int64GetDatum(usage->shared_blks_written); + values[4] = Int64GetDatum(usage->local_blks_hit); + values[5] = Int64GetDatum(usage->local_blks_read); + values[6] = Int64GetDatum(usage->local_blks_dirtied); + values[7] = Int64GetDatum(usage->local_blks_written); + values[8] = Int64GetDatum(usage->temp_blks_read); + values[9] = Int64GetDatum(usage->temp_blks_written); + values[10] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time)); + values[11] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time)); + values[12] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time)); + values[13] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time)); + values[14] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time)); + values[15] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time)); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + +/* + * SQL function: pg_session_buffer_usage_reset() + * + * Resets all BufferUsage counters on the top instrumentation stack to zero. + * Useful in tests to avoid the baseline/delta pattern. + */ +Datum +pg_session_buffer_usage_reset(PG_FUNCTION_ARGS) +{ +#if HAVE_INSTR_STACK + memset(&instr_top.bufusage, 0, sizeof(BufferUsage)); +#else + memset(&pgBufferUsage, 0, sizeof(BufferUsage)); +#endif + + PG_RETURN_VOID(); +} diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control new file mode 100644 index 0000000000000..fabd05ee02458 --- /dev/null +++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control @@ -0,0 +1,5 @@ +# pg_session_buffer_usage extension +comment = 'show buffer usage statistics for the current session' +default_version = '1.0' +module_pathname = '$libdir/pg_session_buffer_usage' +relocatable = true diff --git a/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql b/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql new file mode 100644 index 0000000000000..e92068168b27c --- /dev/null +++ b/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql @@ -0,0 +1,245 @@ +LOAD 'pg_session_buffer_usage'; +CREATE EXTENSION pg_session_buffer_usage; + +-- Verify all columns are non-negative +SELECT count(*) = 1 AS ok FROM pg_session_buffer_usage() +WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0 + AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0 + AND local_blks_hit >= 0 AND local_blks_read >= 0 + AND local_blks_dirtied >= 0 AND local_blks_written >= 0 + AND temp_blks_read >= 0 AND temp_blks_written >= 0 + AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0 + AND local_blk_read_time >= 0 AND local_blk_write_time >= 0 + AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0; + +-- Verify counters increase after buffer activity +SELECT pg_session_buffer_usage_reset(); + +CREATE TEMP TABLE test_buf_activity (id int, data text); +INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i; +SELECT count(*) FROM test_buf_activity; + +SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased +FROM pg_session_buffer_usage(); + +DROP TABLE test_buf_activity; + +-- Parallel query test +CREATE TABLE par_dc_tab (a int, b char(200)); +INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; + +SELECT count(*) FROM par_dc_tab; + +-- Measure serial scan delta (leader does all the work) +SET max_parallel_workers_per_gather = 0; + +SELECT pg_session_buffer_usage_reset(); +SELECT count(*) FROM par_dc_tab; + +CREATE TEMP TABLE dc_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); + +-- Measure parallel scan delta with leader NOT participating in scanning. +-- Workers do all table scanning; leader only runs the Gather node. +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; + +SELECT pg_session_buffer_usage_reset(); +SELECT count(*) FROM par_dc_tab; + +-- Confirm we got a similar hit counter through parallel worker accumulation +SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2 + AS leader_buffers_match +FROM pg_session_buffer_usage(), dc_serial_result s; + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; + +DROP TABLE par_dc_tab, dc_serial_result; + +-- +-- Abort/exception tests: verify buffer usage survives various error paths. +-- + +-- Rolled-back divide-by-zero under EXPLAIN ANALYZE +CREATE TEMP TABLE exc_tab (a int, b char(20)); + +SELECT pg_session_buffer_usage_reset(); + +EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 FROM ins; + +SELECT local_blks_dirtied > 0 AS exception_buffers_visible +FROM pg_session_buffer_usage(); + +DROP TABLE exc_tab; + +-- Unique constraint violation in regular query +CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20)); +INSERT INTO unique_tab VALUES (1, 'first'); + +SELECT pg_session_buffer_usage_reset(); +INSERT INTO unique_tab VALUES (1, 'duplicate'); + +SELECT local_blks_hit > 0 AS unique_violation_buffers_visible +FROM pg_session_buffer_usage(); + +DROP TABLE unique_tab; + +-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION) +CREATE TEMP TABLE subxact_tab (a int, b char(20)); + +CREATE FUNCTION subxact_exc_func() RETURNS text AS $$ +BEGIN + BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a) + SELECT a / 0 FROM ins'; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_session_buffer_usage_reset(); +SELECT subxact_exc_func(); + +SELECT local_blks_dirtied > 0 AS subxact_buffers_visible +FROM pg_session_buffer_usage(); + +DROP FUNCTION subxact_exc_func; +DROP TABLE subxact_tab; + +-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking +CREATE TEMP TABLE cursor_tab (a int, b char(200)); +INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i; + +CREATE FUNCTION cursor_exc_func() RETURNS text AS $$ +DECLARE + rec record; + cnt int := 0; +BEGIN + BEGIN + FOR rec IN SELECT * FROM cursor_tab LOOP + cnt := cnt + 1; + IF cnt = 250 THEN + PERFORM 1 / 0; + END IF; + END LOOP; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught after ' || cnt || ' rows'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_session_buffer_usage_reset(); +SELECT cursor_exc_func(); + +SELECT local_blks_hit + local_blks_read > 0 + AS cursor_subxact_buffers_visible +FROM pg_session_buffer_usage(); + +DROP FUNCTION cursor_exc_func; +DROP TABLE cursor_tab; + +-- Trigger abort under EXPLAIN ANALYZE: verify that buffer activity from a +-- trigger that throws an error is still properly propagated. +CREATE TEMP TABLE trig_err_tab (a int); +CREATE TEMP TABLE trig_work_tab (a int, b char(200)); +INSERT INTO trig_work_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i; + +-- Warm local buffers so trig_work_tab reads become hits +SELECT count(*) FROM trig_work_tab; + +CREATE FUNCTION trig_err_func() RETURNS trigger AS $$ +BEGIN + PERFORM count(*) FROM trig_work_tab; + RAISE EXCEPTION 'trigger error'; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +CREATE TRIGGER trig_err BEFORE INSERT ON trig_err_tab + FOR EACH ROW EXECUTE FUNCTION trig_err_func(); + +-- Measure how many local buffer hits a scan of trig_work_tab produces +SELECT pg_session_buffer_usage_reset(); +SELECT count(*) FROM trig_work_tab; + +CREATE TEMP TABLE trig_serial_result AS +SELECT local_blks_hit AS serial_hits FROM pg_session_buffer_usage(); + +-- Now trigger the same scan via a trigger that errors +SELECT pg_session_buffer_usage_reset(); +EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + INSERT INTO trig_err_tab VALUES (1); + +-- The trigger scanned trig_work_tab but errored before InstrStopTrigger ran. +-- InstrStopFinalize in the PG_CATCH ensures buffer data is still propagated. +SELECT local_blks_hit >= s.serial_hits / 2 + AS trigger_abort_buffers_propagated +FROM pg_session_buffer_usage(), trig_serial_result s; + +DROP TABLE trig_err_tab, trig_work_tab, trig_serial_result; +DROP FUNCTION trig_err_func; + +-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort. +-- +-- When a parallel worker aborts, InstrEndParallelQuery and +-- ExecParallelReportInstrumentation never run, so the worker's buffer +-- activity is never written to shared memory, despite the information having been +-- captured by the res owner release instrumentation handling. +CREATE TABLE par_abort_tab (a int, b char(200)); +INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; + +-- Warm shared buffers so all reads become hits +SELECT count(*) FROM par_abort_tab; + +-- Measure serial scan delta as a reference (leader reads all blocks) +SET max_parallel_workers_per_gather = 0; + +SELECT pg_session_buffer_usage_reset(); +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; + +CREATE TABLE par_abort_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); + +-- Now force parallel with leader NOT participating in scanning +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; +SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently + +SELECT pg_session_buffer_usage_reset(); +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; + +RESET debug_parallel_query; + +-- Workers scanned the table but aborted before reporting stats back. +-- The leader's delta should be much less than a serial scan, documenting +-- that worker buffer activity is lost on abort. +SELECT shared_blks_hit < s.serial_delta / 2 + AS worker_abort_buffers_not_propagated +FROM pg_session_buffer_usage(), par_abort_serial_result s; + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; + +DROP TABLE par_abort_tab, par_abort_serial_result; + +-- Cleanup +DROP EXTENSION pg_session_buffer_usage; diff --git a/contrib/pg_stat_statements/expected/utility.out b/contrib/pg_stat_statements/expected/utility.out index e4d6564ea5b5a..cba487f6be582 100644 --- a/contrib/pg_stat_statements/expected/utility.out +++ b/contrib/pg_stat_statements/expected/utility.out @@ -289,6 +289,76 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; 1 | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (3 rows) +-- Buffer stats should flow through EXPLAIN ANALYZE +CREATE TEMP TABLE flow_through_test (a int, b char(200)); +INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; +CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$ +DECLARE +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test'; +END; +$$ LANGUAGE plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT run_explain_buffers_test(); + run_explain_buffers_test +-------------------------- + +(1 row) + +-- EXPLAIN entries should have non-zero buffer stats +SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats +FROM pg_stat_statements +WHERE query LIKE 'SELECT run_explain_buffers_test%' +ORDER BY query COLLATE "C"; + query | has_buffer_stats +-----------------------------------+------------------ + SELECT run_explain_buffers_test() | t +(1 row) + +DROP FUNCTION run_explain_buffers_test; +DROP TABLE flow_through_test; +-- Validate buffer/WAL counting during abort +SET pg_stat_statements.track = 'all'; +CREATE TEMP TABLE pgss_call_tab (a int, b char(20)); +CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20)); +INSERT INTO pgss_call_tab VALUES (0, 'zzz'); +CREATE PROCEDURE pgss_call_rollback_proc() AS $$ +DECLARE + v int; +BEGIN + EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN +END; +$$ LANGUAGE plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +CALL pgss_call_rollback_proc(); +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_call_rollback_proc%' +ORDER BY query COLLATE "C"; + query | calls | local_hitread | wal_bytes_generated | wal_records_generated +--------------------------------+-------+---------------+---------------------+----------------------- + CALL pgss_call_rollback_proc() | 1 | t | t | t +(1 row) + +DROP TABLE pgss_call_tab2; +DROP TABLE pgss_call_tab; +DROP PROCEDURE pgss_call_rollback_proc; +SET pg_stat_statements.track = 'top'; -- CALL CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ DECLARE diff --git a/contrib/pg_stat_statements/expected/wal.out b/contrib/pg_stat_statements/expected/wal.out index 977e382d84894..611213daef6c2 100644 --- a/contrib/pg_stat_statements/expected/wal.out +++ b/contrib/pg_stat_statements/expected/wal.out @@ -28,3 +28,51 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t; t (1 row) +-- +-- Validate buffer/WAL counting with caught exception in PL/pgSQL +-- +CREATE TEMP TABLE pgss_error_tab (a int, b char(20)); +INSERT INTO pgss_error_tab VALUES (0, 'zzz'); +CREATE FUNCTION pgss_error_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT pgss_error_func(); + pgss_error_func +----------------- + +(1 row) + +-- Buffer/WAL usage from the wCTE INSERT should survive the exception +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_error_func%' +ORDER BY query COLLATE "C"; + query | calls | local_hitread | wal_bytes_generated | wal_records_generated +--------------------------+-------+---------------+---------------------+----------------------- + SELECT pgss_error_func() | 1 | t | t | t +(1 row) + +DROP TABLE pgss_error_tab; +DROP FUNCTION pgss_error_func; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 6cb14824ec3b9..9856dec3a5f5e 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -910,22 +910,11 @@ pgss_planner(Query *parse, && pgss_track_planning && query_string && parse->queryId != INT64CONST(0)) { - instr_time start; - instr_time duration; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + Instrumentation instr = {0}; - /* We need to track buffer usage as the planner can access them. */ - bufusage_start = pgBufferUsage; - - /* - * Similarly the planner could write some WAL records in some cases - * (e.g. setting a hint bit with those being WAL-logged) - */ - walusage_start = pgWalUsage; - INSTR_TIME_SET_CURRENT(start); + /* Track time and buffer/WAL usage as the planner can access them. */ + InstrInitOptions(&instr, INSTRUMENT_ALL); + InstrStart(&instr); nesting_level++; PG_TRY(); @@ -939,30 +928,20 @@ pgss_planner(Query *parse, } PG_FINALLY(); { + InstrStopFinalize(&instr); nesting_level--; } PG_END_TRY(); - INSTR_TIME_SET_CURRENT(duration); - INSTR_TIME_SUBTRACT(duration, start); - - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(query_string, parse->queryId, parse->stmt_location, parse->stmt_len, PGSS_PLAN, - INSTR_TIME_GET_MILLISEC(duration), + INSTR_TIME_GET_MILLISEC(instr.total), 0, - &bufusage, - &walusage, + &instr.bufusage, + &instr.walusage, NULL, NULL, 0, @@ -1014,19 +993,9 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags) */ if (pgss_enabled(nesting_level) && queryDesc->plannedstmt->queryId != INT64CONST(0)) { - /* - * Set up to track total elapsed time in ExecutorRun. Make sure the - * space is allocated in the per-query context so it will go away at - * ExecutorEnd. - */ + /* Set up to track total elapsed time in ExecutorRun. */ if (queryDesc->totaltime == NULL) - { - MemoryContext oldcxt; - - oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false); - MemoryContextSwitchTo(oldcxt); - } + queryDesc->totaltime = InstrQueryAlloc(INSTRUMENT_ALL); } } @@ -1083,21 +1052,15 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) if (queryId != INT64CONST(0) && queryDesc->totaltime && pgss_enabled(nesting_level)) { - /* - * Make sure stats accumulation is done. (Note: it's okay if several - * levels of hook all do this.) - */ - InstrEndLoop(queryDesc->totaltime); - pgss_store(queryDesc->sourceText, queryId, queryDesc->plannedstmt->stmt_location, queryDesc->plannedstmt->stmt_len, PGSS_EXEC, - INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total), + INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->instr.total), queryDesc->estate->es_total_processed, - &queryDesc->totaltime->bufusage, - &queryDesc->totaltime->walusage, + &queryDesc->totaltime->instr.bufusage, + &queryDesc->totaltime->instr.walusage, queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL, NULL, queryDesc->estate->es_parallel_workers_to_launch, @@ -1161,17 +1124,11 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, !IsA(parsetree, ExecuteStmt) && !IsA(parsetree, PrepareStmt)) { - instr_time start; - instr_time duration; uint64 rows; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + Instrumentation instr = {0}; - bufusage_start = pgBufferUsage; - walusage_start = pgWalUsage; - INSTR_TIME_SET_CURRENT(start); + InstrInitOptions(&instr, INSTRUMENT_ALL); + InstrStart(&instr); nesting_level++; PG_TRY(); @@ -1187,6 +1144,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, } PG_FINALLY(); { + InstrStopFinalize(&instr); nesting_level--; } PG_END_TRY(); @@ -1201,9 +1159,6 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, * former value, which'd otherwise be a good idea. */ - INSTR_TIME_SET_CURRENT(duration); - INSTR_TIME_SUBTRACT(duration, start); - /* * Track the total number of rows retrieved or affected by the utility * statements of COPY, FETCH, CREATE TABLE AS, CREATE MATERIALIZED @@ -1215,23 +1170,15 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ? qc->nprocessed : 0; - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(queryString, saved_queryId, saved_stmt_location, saved_stmt_len, PGSS_EXEC, - INSTR_TIME_GET_MILLISEC(duration), + INSTR_TIME_GET_MILLISEC(instr.total), rows, - &bufusage, - &walusage, + &instr.bufusage, + &instr.walusage, NULL, NULL, 0, diff --git a/contrib/pg_stat_statements/sql/utility.sql b/contrib/pg_stat_statements/sql/utility.sql index dd97203c21025..7540e49c73caf 100644 --- a/contrib/pg_stat_statements/sql/utility.sql +++ b/contrib/pg_stat_statements/sql/utility.sql @@ -152,6 +152,62 @@ EXPLAIN (costs off) SELECT a FROM generate_series(1,10) AS tab(a) WHERE a = 7; SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; +-- Buffer stats should flow through EXPLAIN ANALYZE +CREATE TEMP TABLE flow_through_test (a int, b char(200)); +INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; + +CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$ +DECLARE +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test'; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + +SELECT run_explain_buffers_test(); + +-- EXPLAIN entries should have non-zero buffer stats +SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats +FROM pg_stat_statements +WHERE query LIKE 'SELECT run_explain_buffers_test%' +ORDER BY query COLLATE "C"; + +DROP FUNCTION run_explain_buffers_test; +DROP TABLE flow_through_test; + +-- Validate buffer/WAL counting during abort +SET pg_stat_statements.track = 'all'; +CREATE TEMP TABLE pgss_call_tab (a int, b char(20)); +CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20)); +INSERT INTO pgss_call_tab VALUES (0, 'zzz'); + +CREATE PROCEDURE pgss_call_rollback_proc() AS $$ +DECLARE + v int; +BEGIN + EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN +END; +$$ LANGUAGE plpgsql; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +CALL pgss_call_rollback_proc(); + +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_call_rollback_proc%' +ORDER BY query COLLATE "C"; + +DROP TABLE pgss_call_tab2; +DROP TABLE pgss_call_tab; +DROP PROCEDURE pgss_call_rollback_proc; +SET pg_stat_statements.track = 'top'; + -- CALL CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ DECLARE diff --git a/contrib/pg_stat_statements/sql/wal.sql b/contrib/pg_stat_statements/sql/wal.sql index 1dc1552a81ebc..467e321b2062e 100644 --- a/contrib/pg_stat_statements/sql/wal.sql +++ b/contrib/pg_stat_statements/sql/wal.sql @@ -18,3 +18,36 @@ wal_records > 0 as wal_records_generated, wal_records >= rows as wal_records_ge_rows FROM pg_stat_statements ORDER BY query COLLATE "C"; SELECT pg_stat_statements_reset() IS NOT NULL AS t; + +-- +-- Validate buffer/WAL counting with caught exception in PL/pgSQL +-- +CREATE TEMP TABLE pgss_error_tab (a int, b char(20)); +INSERT INTO pgss_error_tab VALUES (0, 'zzz'); + +CREATE FUNCTION pgss_error_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +SELECT pgss_error_func(); + +-- Buffer/WAL usage from the wCTE INSERT should survive the exception +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_error_func%' +ORDER BY query COLLATE "C"; + +DROP TABLE pgss_error_tab; +DROP FUNCTION pgss_error_func; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c index 41e47cc795ba8..cc8ec24c30eb0 100644 --- a/contrib/postgres_fdw/postgres_fdw.c +++ b/contrib/postgres_fdw/postgres_fdw.c @@ -2779,7 +2779,7 @@ postgresIterateDirectModify(ForeignScanState *node) if (!resultRelInfo->ri_projectReturning) { TupleTableSlot *slot = node->ss.ss_ScanTupleSlot; - Instrumentation *instr = node->ss.ps.instrument; + NodeInstrumentation *instr = node->ss.ps.instrument; Assert(!dmstate->has_returning); diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 604e8578a8dcd..d28f4f22535b9 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -734,6 +734,7 @@ WHERE t1.unique1 < 10 AND t1.unique2 = t2.unique2; -> Index Scan using tenk2_unique2 on tenk2 t2 (cost=0.29..7.90 rows=1 width=244) (actual time=0.003..0.003 rows=1.00 loops=10) Index Cond: (unique2 = t1.unique2) Index Searches: 10 + Table Buffers: shared hit=10 Buffers: shared hit=24 read=6 Planning: Buffers: shared hit=15 dirtied=9 @@ -1005,7 +1006,8 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; Index Cond: (f1 @> '((0.5,2))'::polygon) Rows Removed by Index Recheck: 1 Index Searches: 1 - Buffers: shared hit=1 + Table Buffers: shared hit=1 + Buffers: shared hit=2 Planning Time: 0.039 ms Execution Time: 0.098 ms @@ -1014,7 +1016,9 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; then rejected by a recheck of the index condition. This happens because a GiST index is lossy for polygon containment tests: it actually returns the rows with polygons that overlap the target, and then we have - to do the exact containment test on those rows. + to do the exact containment test on those rows. The Table Buffers + counts indicate how many operations were performed on the table instead of + the index. This number is included in the Buffers counts. @@ -1203,13 +1207,14 @@ EXPLAIN ANALYZE SELECT * FROM tenk1 WHERE unique1 < 100 AND unique2 > 9000 QUERY PLAN -------------------------------------------------------------------&zwsp;------------------------------------------------------------ Limit (cost=0.29..14.33 rows=2 width=244) (actual time=0.051..0.071 rows=2.00 loops=1) - Buffers: shared hit=16 + Buffers: shared hit=14 -> Index Scan using tenk1_unique2 on tenk1 (cost=0.29..70.50 rows=10 width=244) (actual time=0.051..0.070 rows=2.00 loops=1) Index Cond: (unique2 > 9000) Filter: (unique1 < 100) Rows Removed by Filter: 287 Index Searches: 1 - Buffers: shared hit=16 + Table Buffers: shared hit=11 + Buffers: shared hit=14 Planning Time: 0.077 ms Execution Time: 0.086 ms diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml index 5b8b521802ee5..71070736acb98 100644 --- a/doc/src/sgml/ref/explain.sgml +++ b/doc/src/sgml/ref/explain.sgml @@ -509,6 +509,7 @@ EXPLAIN ANALYZE EXECUTE query(100, 200); -> Index Scan using test_pkey on test (cost=0.29..10.27 rows=99 width=8) (actual time=0.009..0.025 rows=99.00 loops=1) Index Cond: ((id > 100) AND (id < 200)) Index Searches: 1 + Table Buffers: shared hit=1 Buffers: shared hit=4 Planning Time: 0.244 ms Execution Time: 0.073 ms diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 2a0f8c8e3b8b2..1c95ec9f6055d 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -51,8 +51,7 @@ #define PARALLEL_KEY_BRIN_SHARED UINT64CONST(0xB000000000000001) #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002) #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003) -#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004) -#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005) +#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xB000000000000004) /* * Status for index builds performed in parallel. This is allocated in a @@ -148,8 +147,7 @@ typedef struct BrinLeader BrinShared *brinshared; Sharedsort *sharedsort; Snapshot snapshot; - WalUsage *walusage; - BufferUsage *bufferusage; + Instrumentation *instr; } BrinLeader; /* @@ -2387,8 +2385,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, BrinShared *brinshared; Sharedsort *sharedsort; BrinLeader *brinleader = palloc0_object(BrinLeader); - WalUsage *walusage; - BufferUsage *bufferusage; + Instrumentation *instr; bool leaderparticipates = true; int querylen; @@ -2430,18 +2427,14 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, shm_toc_estimate_keys(&pcxt->estimator, 2); /* - * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE - * and PARALLEL_KEY_BUFFER_USAGE. + * Estimate space for Instrumentation -- PARALLEL_KEY_INSTRUMENTATION. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgWalUsage or - * pgBufferUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_estimate_keys(&pcxt->estimator, 1); - shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); + mul_size(sizeof(Instrumentation), pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */ @@ -2514,15 +2507,12 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, } /* - * Allocate space for each worker's WalUsage and BufferUsage; no need to + * Allocate space for each worker's Instrumentation; no need to * initialize. */ - walusage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage); - bufferusage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage); + instr = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(Instrumentation), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr); /* Launch workers, saving status for leader/caller */ LaunchParallelWorkers(pcxt); @@ -2533,8 +2523,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, brinleader->brinshared = brinshared; brinleader->sharedsort = sharedsort; brinleader->snapshot = snapshot; - brinleader->walusage = walusage; - brinleader->bufferusage = bufferusage; + brinleader->instr = instr; /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) @@ -2573,7 +2562,7 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) * or we might get incomplete data.) */ for (i = 0; i < brinleader->pcxt->nworkers_launched; i++) - InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]); + InstrAccumParallelQuery(&brinleader->instr[i]); /* Free last reference to MVCC snapshot, if one was used */ if (IsMVCCSnapshot(brinleader->snapshot)) @@ -2886,8 +2875,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; - WalUsage *walusage; - BufferUsage *bufferusage; + QueryInstrumentation *instr; + Instrumentation *worker_instr; int sortmem; /* @@ -2935,7 +2924,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2948,10 +2937,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) heapRel, indexRel, sortmem, false); /* Report WAL/buffer usage during parallel execution */ - bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); - walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], - &walusage[ParallelWorkerNumber]); + worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false); + InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); table_close(heapRel, heapLockmode); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index e54782d9dd852..51bb098a2a243 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -45,8 +45,7 @@ #define PARALLEL_KEY_GIN_SHARED UINT64CONST(0xB000000000000001) #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002) #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003) -#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004) -#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005) +#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xB000000000000004) /* * Status for index builds performed in parallel. This is allocated in a @@ -138,8 +137,7 @@ typedef struct GinLeader GinBuildShared *ginshared; Sharedsort *sharedsort; Snapshot snapshot; - WalUsage *walusage; - BufferUsage *bufferusage; + Instrumentation *instr; } GinLeader; typedef struct @@ -945,8 +943,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, GinBuildShared *ginshared; Sharedsort *sharedsort; GinLeader *ginleader = palloc0_object(GinLeader); - WalUsage *walusage; - BufferUsage *bufferusage; + Instrumentation *instr; bool leaderparticipates = true; int querylen; @@ -987,18 +984,14 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, shm_toc_estimate_keys(&pcxt->estimator, 2); /* - * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE - * and PARALLEL_KEY_BUFFER_USAGE. + * Estimate space for Instrumentation -- PARALLEL_KEY_INSTRUMENTATION. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgWalUsage or - * pgBufferUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_estimate_keys(&pcxt->estimator, 1); - shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); + mul_size(sizeof(Instrumentation), pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */ @@ -1066,15 +1059,12 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, } /* - * Allocate space for each worker's WalUsage and BufferUsage; no need to + * Allocate space for each worker's Instrumentation; no need to * initialize. */ - walusage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage); - bufferusage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage); + instr = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(Instrumentation), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr); /* Launch workers, saving status for leader/caller */ LaunchParallelWorkers(pcxt); @@ -1085,8 +1075,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, ginleader->ginshared = ginshared; ginleader->sharedsort = sharedsort; ginleader->snapshot = snapshot; - ginleader->walusage = walusage; - ginleader->bufferusage = bufferusage; + ginleader->instr = instr; /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) @@ -1125,7 +1114,7 @@ _gin_end_parallel(GinLeader *ginleader, GinBuildState *state) * or we might get incomplete data.) */ for (i = 0; i < ginleader->pcxt->nworkers_launched; i++) - InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]); + InstrAccumParallelQuery(&ginleader->instr[i]); /* Free last reference to MVCC snapshot, if one was used */ if (IsMVCCSnapshot(ginleader->snapshot)) @@ -2117,8 +2106,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; - WalUsage *walusage; - BufferUsage *bufferusage; + QueryInstrumentation *instr; + Instrumentation *worker_instr; int sortmem; /* @@ -2185,7 +2174,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2198,10 +2187,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) heapRel, indexRel, sortmem, false); /* Report WAL/buffer usage during parallel execution */ - bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); - walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], - &walusage[ParallelWorkerNumber]); + worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false); + InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); table_close(heapRel, heapLockmode); diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 1a446050d85b8..a7db507fa0d1d 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -643,8 +643,7 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, TimestampTz starttime = 0; PgStat_Counter startreadtime = 0, startwritetime = 0; - WalUsage startwalusage = pgWalUsage; - BufferUsage startbufferusage = pgBufferUsage; + QueryInstrumentation *instr = NULL; ErrorContextCallback errcallback; char **indnames = NULL; Size dead_items_max_bytes = 0; @@ -660,6 +659,8 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, startreadtime = pgStatBlockReadTime; startwritetime = pgStatBlockWriteTime; } + instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrQueryStart(instr); } /* Used for instrumentation and stats report */ @@ -990,14 +991,14 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, { TimestampTz endtime = GetCurrentTimestamp(); + InstrQueryStopFinalize(instr); + if (verbose || params.log_vacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, params.log_vacuum_min_duration)) { long secs_dur; int usecs_dur; - WalUsage walusage; - BufferUsage bufferusage; StringInfoData buf; char *msgfmt; int32 diff; @@ -1006,12 +1007,10 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; + BufferUsage bufferusage = instr->instr.bufusage; + WalUsage walusage = instr->instr.walusage; TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur); - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); - memset(&bufferusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); total_blks_hit = bufferusage.shared_blks_hit + bufferusage.local_blks_hit; diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 47a9bda30c9b1..504b34cc906dd 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -66,8 +66,7 @@ #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002) #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003) #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004) -#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005) -#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006) +#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xA000000000000005) /* * DISABLE_LEADER_PARTICIPATION disables the leader's participation in @@ -195,8 +194,7 @@ typedef struct BTLeader Sharedsort *sharedsort; Sharedsort *sharedsort2; Snapshot snapshot; - WalUsage *walusage; - BufferUsage *bufferusage; + Instrumentation *instr; } BTLeader; /* @@ -1408,8 +1406,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) Sharedsort *sharedsort2; BTSpool *btspool = buildstate->spool; BTLeader *btleader = palloc0_object(BTLeader); - WalUsage *walusage; - BufferUsage *bufferusage; + Instrumentation *instr; bool leaderparticipates = true; int querylen; @@ -1462,18 +1459,14 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) } /* - * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE - * and PARALLEL_KEY_BUFFER_USAGE. + * Estimate space for Instrumentation -- PARALLEL_KEY_INSTRUMENTATION. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgWalUsage or - * pgBufferUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_estimate_keys(&pcxt->estimator, 1); - shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); + mul_size(sizeof(Instrumentation), pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */ @@ -1560,15 +1553,12 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) } /* - * Allocate space for each worker's WalUsage and BufferUsage; no need to + * Allocate space for each worker's Instrumentation; no need to * initialize. */ - walusage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage); - bufferusage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage); + instr = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(Instrumentation), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr); /* Launch workers, saving status for leader/caller */ LaunchParallelWorkers(pcxt); @@ -1580,8 +1570,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btleader->sharedsort = sharedsort; btleader->sharedsort2 = sharedsort2; btleader->snapshot = snapshot; - btleader->walusage = walusage; - btleader->bufferusage = bufferusage; + btleader->instr = instr; /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) @@ -1620,7 +1609,7 @@ _bt_end_parallel(BTLeader *btleader) * or we might get incomplete data.) */ for (i = 0; i < btleader->pcxt->nworkers_launched; i++) - InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]); + InstrAccumParallelQuery(&btleader->instr[i]); /* Free last reference to MVCC snapshot, if one was used */ if (IsMVCCSnapshot(btleader->snapshot)) @@ -1753,8 +1742,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; - WalUsage *walusage; - BufferUsage *bufferusage; + QueryInstrumentation *instr; + Instrumentation *worker_instr; int sortmem; #ifdef BTREE_BUILD_STATS @@ -1828,7 +1817,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) } /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Perform sorting of spool, and possibly a spool2 */ sortmem = maintenance_work_mem / btshared->scantuplesortstates; @@ -1836,10 +1825,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) sharedsort2, sortmem, false); /* Report WAL/buffer usage during parallel execution */ - bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); - walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], - &walusage[ParallelWorkerNumber]); + worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false); + InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]); #ifdef BTREE_BUILD_STATS if (log_btree_build_stats) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f5c9a34374dde..9b33584f4548e 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -1081,10 +1081,10 @@ XLogInsertRecord(XLogRecData *rdata, /* Report WAL traffic to the instrumentation. */ if (inserted) { - pgWalUsage.wal_bytes += rechdr->xl_tot_len; - pgWalUsage.wal_records++; - pgWalUsage.wal_fpi += num_fpi; - pgWalUsage.wal_fpi_bytes += fpi_bytes; + INSTR_WALUSAGE_ADD(wal_bytes, rechdr->xl_tot_len); + INSTR_WALUSAGE_INCR(wal_records); + INSTR_WALUSAGE_ADD(wal_fpi, num_fpi); + INSTR_WALUSAGE_ADD(wal_fpi_bytes, fpi_bytes); /* Required for the flush of pending stats WAL data */ pgstat_report_fixed = true; @@ -2063,7 +2063,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) WriteRqst.Flush = InvalidXLogRecPtr; XLogWrite(WriteRqst, tli, false); LWLockRelease(WALWriteLock); - pgWalUsage.wal_buffers_full++; + INSTR_WALUSAGE_INCR(wal_buffers_full); TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); /* diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index eeed91be2668e..c21b2019eabd3 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -309,9 +309,7 @@ do_analyze_rel(Relation onerel, const VacuumParams params, Oid save_userid; int save_sec_context; int save_nestlevel; - WalUsage startwalusage = pgWalUsage; - BufferUsage startbufferusage = pgBufferUsage; - BufferUsage bufferusage; + QueryInstrumentation *instr = NULL; PgStat_Counter startreadtime = 0; PgStat_Counter startwritetime = 0; @@ -362,6 +360,9 @@ do_analyze_rel(Relation onerel, const VacuumParams params, } pg_rusage_init(&ru0); + + instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrQueryStart(instr); } /* Used for instrumentation and stats report */ @@ -742,12 +743,13 @@ do_analyze_rel(Relation onerel, const VacuumParams params, { TimestampTz endtime = GetCurrentTimestamp(); + InstrQueryStopFinalize(instr); + if (verbose || params.log_analyze_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, params.log_analyze_min_duration)) { long delay_in_ms; - WalUsage walusage; double read_rate = 0; double write_rate = 0; char *msgfmt; @@ -755,18 +757,15 @@ do_analyze_rel(Relation onerel, const VacuumParams params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; - - memset(&bufferusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); - - total_blks_hit = bufferusage.shared_blks_hit + - bufferusage.local_blks_hit; - total_blks_read = bufferusage.shared_blks_read + - bufferusage.local_blks_read; - total_blks_dirtied = bufferusage.shared_blks_dirtied + - bufferusage.local_blks_dirtied; + BufferUsage bufusage = instr->instr.bufusage; + WalUsage walusage = instr->instr.walusage; + + total_blks_hit = bufusage.shared_blks_hit + + bufusage.local_blks_hit; + total_blks_read = bufusage.shared_blks_read + + bufusage.local_blks_read; + total_blks_dirtied = bufusage.shared_blks_dirtied + + bufusage.local_blks_dirtied; /* * We do not expect an analyze to take > 25 days and it simplifies diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index e4b70166b0e50..eef343a9d9760 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -144,7 +144,7 @@ static void show_instrumentation_count(const char *qlabel, int which, static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage); -static void show_buffer_usage(ExplainState *es, const BufferUsage *usage); +static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void show_memory_counters(ExplainState *es, const MemoryContextCounters *mem_counters); @@ -324,14 +324,16 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, QueryEnvironment *queryEnv) { PlannedStmt *plan; - instr_time planstart, - planduration; - BufferUsage bufusage_start, - bufusage; + QueryInstrumentation *instr = NULL; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; + if (es->buffers) + instr = InstrQueryAlloc(INSTRUMENT_TIMER | INSTRUMENT_BUFFERS); + else + instr = InstrQueryAlloc(INSTRUMENT_TIMER); + if (es->memory) { /* @@ -348,15 +350,12 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - if (es->buffers) - bufusage_start = pgBufferUsage; - INSTR_TIME_SET_CURRENT(planstart); + InstrQueryStart(instr); /* plan the query */ plan = pg_plan_query(query, queryString, cursorOptions, params, es); - INSTR_TIME_SET_CURRENT(planduration); - INSTR_TIME_SUBTRACT(planduration, planstart); + InstrQueryStopFinalize(instr); if (es->memory) { @@ -364,16 +363,9 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, MemoryContextMemConsumed(planner_ctx, &mem_counters); } - /* calc differences of buffer counters. */ - if (es->buffers) - { - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - } - /* run it (if needed) and produce output */ ExplainOnePlan(plan, into, es, queryString, params, queryEnv, - &planduration, (es->buffers ? &bufusage : NULL), + &instr->instr.total, (es->buffers ? &instr->instr.bufusage : NULL), es->memory ? &mem_counters : NULL); } @@ -590,7 +582,12 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, /* grab serialization metrics before we destroy the DestReceiver */ if (es->serialize != EXPLAIN_SERIALIZE_NONE) - serializeMetrics = GetSerializationMetrics(dest); + { + SerializeMetrics *metrics = GetSerializationMetrics(dest); + + if (metrics) + memcpy(&serializeMetrics, metrics, sizeof(SerializeMetrics)); + } /* call the DestReceiver's destroy method even during explain */ dest->rDestroy(dest); @@ -613,7 +610,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, } if (bufusage) - show_buffer_usage(es, bufusage); + show_buffer_usage(es, bufusage, NULL); if (mem_counters) show_memory_counters(es, mem_counters); @@ -1019,7 +1016,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) ExplainIndentText(es); if (es->timing) appendStringInfo(es->str, "Serialization: time=%.3f ms output=" UINT64_FORMAT "kB format=%s\n", - 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->timeSpent), + 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->instr.total), BYTES_TO_KILOBYTES(metrics->bytesSent), format); else @@ -1027,10 +1024,10 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) BYTES_TO_KILOBYTES(metrics->bytesSent), format); - if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage)) + if (es->buffers && peek_buffer_usage(es, &metrics->instr.bufusage)) { es->indent++; - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->instr.bufusage, NULL); es->indent--; } } @@ -1038,13 +1035,13 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) { if (es->timing) ExplainPropertyFloat("Time", "ms", - 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->timeSpent), + 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->instr.total), 3, es); ExplainPropertyUInteger("Output Volume", "kB", BYTES_TO_KILOBYTES(metrics->bytesSent), es); ExplainPropertyText("Format", format, es); if (es->buffers) - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->instr.bufusage, NULL); } ExplainCloseGroup("Serialization", "Serialization", true, es); @@ -1101,18 +1098,15 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) for (nt = 0; nt < rInfo->ri_TrigDesc->numtriggers; nt++) { Trigger *trig = rInfo->ri_TrigDesc->triggers + nt; - Instrumentation *instr = rInfo->ri_TrigInstrument + nt; + TriggerInstrumentation *tginstr = rInfo->ri_TrigInstrument + nt; char *relname; char *conname = NULL; - /* Must clean up instrumentation state */ - InstrEndLoop(instr); - /* * We ignore triggers that were never invoked; they likely aren't * relevant to the current query type. */ - if (instr->ntuples == 0) + if (tginstr->firings == 0) continue; ExplainOpenGroup("Trigger", NULL, true, es); @@ -1137,11 +1131,11 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) if (show_relname) appendStringInfo(es->str, " on %s", relname); if (es->timing) - appendStringInfo(es->str, ": time=%.3f calls=%.0f\n", - INSTR_TIME_GET_MILLISEC(instr->total), - instr->ntuples); + appendStringInfo(es->str, ": time=%.3f calls=%d\n", + INSTR_TIME_GET_MILLISEC(tginstr->instr.total), + tginstr->firings); else - appendStringInfo(es->str, ": calls=%.0f\n", instr->ntuples); + appendStringInfo(es->str, ": calls=%d\n", tginstr->firings); } else { @@ -1151,9 +1145,9 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) ExplainPropertyText("Relation", relname, es); if (es->timing) ExplainPropertyFloat("Time", "ms", - INSTR_TIME_GET_MILLISEC(instr->total), 3, + INSTR_TIME_GET_MILLISEC(tginstr->instr.total), 3, es); - ExplainPropertyFloat("Calls", NULL, instr->ntuples, 0, es); + ExplainPropertyInteger("Calls", NULL, tginstr->firings, es); } if (conname) @@ -1840,7 +1834,7 @@ ExplainNode(PlanState *planstate, List *ancestors, { double nloops = planstate->instrument->nloops; double startup_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->startup) / nloops; - double total_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->total) / nloops; + double total_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->instr.total) / nloops; double rows = planstate->instrument->ntuples / nloops; if (es->format == EXPLAIN_FORMAT_TEXT) @@ -1893,11 +1887,11 @@ ExplainNode(PlanState *planstate, List *ancestors, /* prepare per-worker general execution details */ if (es->workers_state && es->verbose) { - WorkerInstrumentation *w = planstate->worker_instrument; + WorkerNodeInstrumentation *w = planstate->worker_instrument; for (int n = 0; n < w->num_workers; n++) { - Instrumentation *instrument = &w->instrument[n]; + NodeInstrumentation *instrument = &w->instrument[n]; double nloops = instrument->nloops; double startup_ms; double total_ms; @@ -1906,7 +1900,7 @@ ExplainNode(PlanState *planstate, List *ancestors, if (nloops <= 0) continue; startup_ms = INSTR_TIME_GET_MILLISEC(instrument->startup) / nloops; - total_ms = INSTR_TIME_GET_MILLISEC(instrument->total) / nloops; + total_ms = INSTR_TIME_GET_MILLISEC(instrument->instr.total) / nloops; rows = instrument->ntuples / nloops; ExplainOpenWorker(n, es); @@ -1975,6 +1969,9 @@ ExplainNode(PlanState *planstate, List *ancestors, show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); show_indexsearches_info(planstate, es); + + if (es->buffers && planstate->instrument) + show_buffer_usage(es, &((IndexScanState *) planstate)->iss_Instrument->table_instr.bufusage, "Table"); break; case T_IndexOnlyScan: show_scan_qual(((IndexOnlyScan *) plan)->indexqual, @@ -1992,6 +1989,9 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainPropertyFloat("Heap Fetches", NULL, planstate->instrument->ntuples2, 0, es); show_indexsearches_info(planstate, es); + + if (es->buffers && planstate->instrument) + show_buffer_usage(es, &((IndexOnlyScanState *) planstate)->ioss_Instrument->table_instr.bufusage, "Table"); break; case T_BitmapIndexScan: show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig, @@ -2293,18 +2293,18 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->bufusage); + show_buffer_usage(es, &planstate->instrument->instr.bufusage, NULL); if (es->wal && planstate->instrument) - show_wal_usage(es, &planstate->instrument->walusage); + show_wal_usage(es, &planstate->instrument->instr.walusage); /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) { - WorkerInstrumentation *w = planstate->worker_instrument; + WorkerNodeInstrumentation *w = planstate->worker_instrument; for (int n = 0; n < w->num_workers; n++) { - Instrumentation *instrument = &w->instrument[n]; + NodeInstrumentation *instrument = &w->instrument[n]; double nloops = instrument->nloops; if (nloops <= 0) @@ -2312,9 +2312,9 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->bufusage); + show_buffer_usage(es, &instrument->instr.bufusage, NULL); if (es->wal) - show_wal_usage(es, &instrument->walusage); + show_wal_usage(es, &instrument->instr.walusage); ExplainCloseWorker(n, es); } } @@ -4112,7 +4112,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage) * Show buffer usage details. This better be sync with peek_buffer_usage. */ static void -show_buffer_usage(ExplainState *es, const BufferUsage *usage) +show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title) { if (es->format == EXPLAIN_FORMAT_TEXT) { @@ -4137,6 +4137,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared || has_local || has_temp) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "Buffers:"); if (has_shared) @@ -4192,6 +4194,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared_timing || has_local_timing || has_temp_timing) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "I/O Timings:"); if (has_shared_timing) @@ -4233,6 +4237,14 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) } else { + char *buffers_title = NULL; + + if (title) + { + buffers_title = psprintf("%s Buffers", title); + ExplainOpenGroup(buffers_title, buffers_title, true, es); + } + ExplainPropertyInteger("Shared Hit Blocks", NULL, usage->shared_blks_hit, es); ExplainPropertyInteger("Shared Read Blocks", NULL, @@ -4253,8 +4265,20 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) usage->temp_blks_read, es); ExplainPropertyInteger("Temp Written Blocks", NULL, usage->temp_blks_written, es); + + if (buffers_title) + ExplainCloseGroup(buffers_title, buffers_title, true, es); + if (track_io_timing) { + char *timings_title = NULL; + + if (title) + { + timings_title = psprintf("%s I/O Timings", title); + ExplainOpenGroup(timings_title, timings_title, true, es); + } + ExplainPropertyFloat("Shared I/O Read Time", "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time), 3, es); @@ -4273,6 +4297,9 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) ExplainPropertyFloat("Temp I/O Write Time", "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time), 3, es); + + if (timings_title) + ExplainCloseGroup(timings_title, timings_title, true, es); } } } diff --git a/src/backend/commands/explain_dr.c b/src/backend/commands/explain_dr.c index 3c96061cf32ab..e1fc723c758bb 100644 --- a/src/backend/commands/explain_dr.c +++ b/src/backend/commands/explain_dr.c @@ -110,15 +110,11 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) MemoryContext oldcontext; StringInfo buf = &myState->buf; int natts = typeinfo->natts; - instr_time start, - end; - BufferUsage instr_start; + Instrumentation *instr = &myState->metrics.instr; /* only measure time, buffers if requested */ - if (myState->es->timing) - INSTR_TIME_SET_CURRENT(start); - if (myState->es->buffers) - instr_start = pgBufferUsage; + if (instr->need_timer || instr->need_stack) + InstrStart(instr); /* Set or update my derived attribute info, if needed */ if (myState->attrinfo != typeinfo || myState->nattrs != natts) @@ -186,18 +182,9 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) MemoryContextSwitchTo(oldcontext); MemoryContextReset(myState->tmpcontext); - /* Update timing data */ - if (myState->es->timing) - { - INSTR_TIME_SET_CURRENT(end); - INSTR_TIME_ACCUM_DIFF(myState->metrics.timeSpent, end, start); - } - - /* Update buffer metrics */ - if (myState->es->buffers) - BufferUsageAccumDiff(&myState->metrics.bufferUsage, - &pgBufferUsage, - &instr_start); + /* Stop per-tuple measurement */ + if (instr->need_timer || instr->need_stack) + InstrStop(instr); return true; } @@ -233,9 +220,17 @@ serializeAnalyzeStartup(DestReceiver *self, int operation, TupleDesc typeinfo) /* The output buffer is re-used across rows, as in printtup.c */ initStringInfo(&receiver->buf); - /* Initialize results counters */ + /* Initialize metrics and per-tuple instrumentation */ memset(&receiver->metrics, 0, sizeof(SerializeMetrics)); - INSTR_TIME_SET_ZERO(receiver->metrics.timeSpent); + { + int instrument_options = 0; + + if (receiver->es->timing) + instrument_options |= INSTRUMENT_TIMER; + if (receiver->es->buffers) + instrument_options |= INSTRUMENT_BUFFERS; + InstrInitOptions(&receiver->metrics.instr, instrument_options); + } } /* @@ -246,6 +241,8 @@ serializeAnalyzeShutdown(DestReceiver *self) { SerializeDestReceiver *receiver = (SerializeDestReceiver *) self; + InstrFinalizeChild(&receiver->metrics.instr, instr_stack.current); + if (receiver->finfos) pfree(receiver->finfos); receiver->finfos = NULL; @@ -296,16 +293,18 @@ CreateExplainSerializeDestReceiver(ExplainState *es) * receiver if the subject statement is CREATE TABLE AS. In that * case, return all-zeroes stats. */ -SerializeMetrics +/* + * GetSerializationMetrics - get serialization metrics + * + * Returns a pointer to the SerializeMetrics inside the dest receiver, + * or NULL if the receiver is not a SerializeDestReceiver (e.g. an IntoRel + * receiver for CREATE TABLE AS). + */ +SerializeMetrics * GetSerializationMetrics(DestReceiver *dest) { - SerializeMetrics empty; - if (dest->mydest == DestExplainSerialize) - return ((SerializeDestReceiver *) dest)->metrics; - - memset(&empty, 0, sizeof(SerializeMetrics)); - INSTR_TIME_SET_ZERO(empty.timeSpent); + return &((SerializeDestReceiver *) dest)->metrics; - return empty; + return NULL; } diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 876aad2100aeb..f7e158e4dd9b6 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -580,13 +580,16 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, ListCell *p; ParamListInfo paramLI = NULL; EState *estate = NULL; - instr_time planstart; - instr_time planduration; - BufferUsage bufusage_start, - bufusage; + QueryInstrumentation *instr = NULL; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; + int instrument_options = INSTRUMENT_TIMER; + + if (es->buffers) + instrument_options |= INSTRUMENT_BUFFERS; + + instr = InstrQueryAlloc(instrument_options); if (es->memory) { @@ -598,9 +601,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - if (es->buffers) - bufusage_start = pgBufferUsage; - INSTR_TIME_SET_CURRENT(planstart); + InstrQueryStart(instr); /* Look it up in the hash table */ entry = FetchPreparedStatement(execstmt->name, true); @@ -635,8 +636,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, cplan = GetCachedPlan(entry->plansource, paramLI, CurrentResourceOwner, pstate->p_queryEnv); - INSTR_TIME_SET_CURRENT(planduration); - INSTR_TIME_SUBTRACT(planduration, planstart); + InstrQueryStopFinalize(instr); if (es->memory) { @@ -644,13 +644,6 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, MemoryContextMemConsumed(planner_ctx, &mem_counters); } - /* calc differences of buffer counters. */ - if (es->buffers) - { - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - } - plan_list = cplan->stmt_list; /* Explain each query */ @@ -660,7 +653,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, if (pstmt->commandType != CMD_UTILITY) ExplainOnePlan(pstmt, into, es, query_string, paramLI, pstate->p_queryEnv, - &planduration, (es->buffers ? &bufusage : NULL), + &instr->instr.total, (es->buffers ? &instr->instr.bufusage : NULL), es->memory ? &mem_counters : NULL); else ExplainOneUtility(pstmt->utilityStmt, into, es, pstate, paramLI); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index c69c12dc014d7..90ac5ccaacdb9 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -2139,7 +2139,7 @@ ExecuteTruncateGuts(List *explicit_rels, rel, 0, /* dummy rangetable index */ NULL, - 0); + NULL); estate->es_opened_result_relations = lappend(estate->es_opened_result_relations, resultRelInfo); resultRelInfo++; diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 6596843a8d85f..f2597b917e116 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -92,7 +92,8 @@ static bool TriggerEnabled(EState *estate, ResultRelInfo *relinfo, static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata, int tgindx, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation *instr, + QueryInstrumentation *qinstr, MemoryContext per_tuple_context); static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, @@ -2311,7 +2312,8 @@ static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata, int tgindx, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation *instr, + QueryInstrumentation *qinstr, MemoryContext per_tuple_context) { LOCAL_FCINFO(fcinfo, 0); @@ -2346,7 +2348,7 @@ ExecCallTriggerFunc(TriggerData *trigdata, * If doing EXPLAIN ANALYZE, start charging time to this trigger. */ if (instr) - InstrStartNode(instr + tgindx); + InstrStartTrigger(qinstr, instr + tgindx); /* * Do the function evaluation in the per-tuple memory context, so that @@ -2391,10 +2393,10 @@ ExecCallTriggerFunc(TriggerData *trigdata, /* * If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count - * one "tuple returned" (really the number of firings). + * the firing of the trigger. */ if (instr) - InstrStopNode(instr + tgindx, 1); + InstrStopTrigger(instr + tgindx, 1); return (HeapTuple) DatumGetPointer(result); } @@ -2441,6 +2443,7 @@ ExecBSInsertTriggers(EState *estate, ResultRelInfo *relinfo) i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple) @@ -2502,6 +2505,7 @@ ExecBRInsertTriggers(EState *estate, ResultRelInfo *relinfo, i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple == NULL) { @@ -2606,6 +2610,7 @@ ExecIRInsertTriggers(EState *estate, ResultRelInfo *relinfo, i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple == NULL) { @@ -2670,6 +2675,7 @@ ExecBSDeleteTriggers(EState *estate, ResultRelInfo *relinfo) i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple) @@ -2780,6 +2786,7 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple == NULL) { @@ -2884,6 +2891,7 @@ ExecIRDeleteTriggers(EState *estate, ResultRelInfo *relinfo, i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (rettuple == NULL) return false; /* Delete was suppressed */ @@ -2942,6 +2950,7 @@ ExecBSUpdateTriggers(EState *estate, ResultRelInfo *relinfo) i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple) @@ -3094,6 +3103,7 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple == NULL) @@ -3258,6 +3268,7 @@ ExecIRUpdateTriggers(EState *estate, ResultRelInfo *relinfo, i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple == NULL) { @@ -3316,6 +3327,7 @@ ExecBSTruncateTriggers(EState *estate, ResultRelInfo *relinfo) i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple) @@ -3938,7 +3950,7 @@ static void AfterTriggerExecute(EState *estate, ResultRelInfo *dst_relInfo, TriggerDesc *trigdesc, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation *instr, MemoryContext per_tuple_context, TupleTableSlot *trig_tuple_slot1, TupleTableSlot *trig_tuple_slot2); @@ -4332,7 +4344,7 @@ AfterTriggerExecute(EState *estate, ResultRelInfo *src_relInfo, ResultRelInfo *dst_relInfo, TriggerDesc *trigdesc, - FmgrInfo *finfo, Instrumentation *instr, + FmgrInfo *finfo, TriggerInstrumentation *instr, MemoryContext per_tuple_context, TupleTableSlot *trig_tuple_slot1, TupleTableSlot *trig_tuple_slot2) @@ -4373,7 +4385,7 @@ AfterTriggerExecute(EState *estate, * to include time spent re-fetching tuples in the trigger cost. */ if (instr) - InstrStartNode(instr + tgindx); + InstrStartTrigger(estate->es_instrument, instr + tgindx); /* * Fetch the required tuple(s). @@ -4561,6 +4573,7 @@ AfterTriggerExecute(EState *estate, tgindx, finfo, NULL, + NULL, per_tuple_context); if (rettuple != NULL && rettuple != LocTriggerData.tg_trigtuple && @@ -4590,10 +4603,10 @@ AfterTriggerExecute(EState *estate, /* * If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count - * one "tuple returned" (really the number of firings). + * the firing of the trigger. */ if (instr) - InstrStopNode(instr + tgindx, 1); + InstrStopTrigger(instr + tgindx, 1); } @@ -4709,7 +4722,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events, Relation rel = NULL; TriggerDesc *trigdesc = NULL; FmgrInfo *finfo = NULL; - Instrumentation *instr = NULL; + TriggerInstrumentation *instr = NULL; TupleTableSlot *slot1 = NULL, *slot2 = NULL; diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index 77834b96a21c1..b5fed54fb85c3 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -47,9 +47,8 @@ */ #define PARALLEL_VACUUM_KEY_SHARED 1 #define PARALLEL_VACUUM_KEY_QUERY_TEXT 2 -#define PARALLEL_VACUUM_KEY_BUFFER_USAGE 3 -#define PARALLEL_VACUUM_KEY_WAL_USAGE 4 -#define PARALLEL_VACUUM_KEY_INDEX_STATS 5 +#define PARALLEL_VACUUM_KEY_INSTRUMENTATION 3 +#define PARALLEL_VACUUM_KEY_INDEX_STATS 4 /* * Shared information among parallel workers. So this is allocated in the DSM @@ -188,11 +187,8 @@ struct ParallelVacuumState /* Shared dead items space among parallel vacuum workers */ TidStore *dead_items; - /* Points to buffer usage area in DSM */ - BufferUsage *buffer_usage; - - /* Points to WAL usage area in DSM */ - WalUsage *wal_usage; + /* Points to instrumentation area in DSM */ + Instrumentation *instr; /* * False if the index is totally unsuitable target for all parallel @@ -250,8 +246,7 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, PVShared *shared; TidStore *dead_items; PVIndStats *indstats; - BufferUsage *buffer_usage; - WalUsage *wal_usage; + Instrumentation *instr; bool *will_parallel_vacuum; Size est_indstats_len; Size est_shared_len; @@ -304,18 +299,15 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, shm_toc_estimate_keys(&pcxt->estimator, 1); /* - * Estimate space for BufferUsage and WalUsage -- - * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE. + * Estimate space for Instrumentation -- + * PARALLEL_VACUUM_KEY_INSTRUMENTATION. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgBufferUsage or - * pgWalUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); - shm_toc_estimate_keys(&pcxt->estimator, 1); - shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(WalUsage), pcxt->nworkers)); + mul_size(sizeof(Instrumentation), pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); /* Finally, estimate PARALLEL_VACUUM_KEY_QUERY_TEXT space */ @@ -396,17 +388,13 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, pvs->shared = shared; /* - * Allocate space for each worker's BufferUsage and WalUsage; no need to - * initialize + * Allocate space for each worker's Instrumentation; no need to + * initialize. */ - buffer_usage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, buffer_usage); - pvs->buffer_usage = buffer_usage; - wal_usage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_WAL_USAGE, wal_usage); - pvs->wal_usage = wal_usage; + instr = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(Instrumentation), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_INSTRUMENTATION, instr); + pvs->instr = instr; /* Store query string for workers */ if (debug_query_string) @@ -749,7 +737,7 @@ parallel_vacuum_process_all_indexes(ParallelVacuumState *pvs, int num_index_scan WaitForParallelWorkersToFinish(pvs->pcxt); for (int i = 0; i < pvs->pcxt->nworkers_launched; i++) - InstrAccumParallelQuery(&pvs->buffer_usage[i], &pvs->wal_usage[i]); + InstrAccumParallelQuery(&pvs->instr[i]); } /* @@ -1006,8 +994,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) PVIndStats *indstats; PVShared *shared; TidStore *dead_items; - BufferUsage *buffer_usage; - WalUsage *wal_usage; + QueryInstrumentation *instr; + Instrumentation *worker_instr; int nindexes; char *sharedquery; ErrorContextCallback errcallback; @@ -1095,16 +1083,14 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) error_context_stack = &errcallback; /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Process indexes to perform vacuum/cleanup */ parallel_vacuum_process_safe_indexes(&pvs); /* Report buffer/WAL usage during parallel execution */ - buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false); - wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], - &wal_usage[ParallelWorkerNumber]); + worker_instr = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_INSTRUMENTATION, false); + InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]); /* Report any remaining cost-based vacuum delay time */ if (track_cost_delay_timing) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 58b84955c2beb..5366c1e801c2c 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -78,6 +78,7 @@ ExecutorCheckPerms_hook_type ExecutorCheckPerms_hook = NULL; /* decls for local routines only used within this module */ static void InitPlan(QueryDesc *queryDesc, int eflags); static void CheckValidRowMarkRel(Relation rel, RowMarkType markType); +static void ExecFinalizeTriggerInstrumentation(EState *estate); static void ExecPostprocessPlan(EState *estate); static void ExecEndPlan(PlanState *planstate, EState *estate); static void ExecutePlan(QueryDesc *queryDesc, @@ -247,9 +248,19 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags) estate->es_snapshot = RegisterSnapshot(queryDesc->snapshot); estate->es_crosscheck_snapshot = RegisterSnapshot(queryDesc->crosscheck_snapshot); estate->es_top_eflags = eflags; - estate->es_instrument = queryDesc->instrument_options; estate->es_jit_flags = queryDesc->plannedstmt->jitFlags; + /* + * Set up query-level instrumentation if needed. We do this before + * InitPlan so that node and trigger instrumentation can be allocated + * within the query's dedicated instrumentation memory context. + */ + if (!queryDesc->totaltime && queryDesc->instrument_options) + { + queryDesc->totaltime = InstrQueryAlloc(queryDesc->instrument_options); + estate->es_instrument = queryDesc->totaltime; + } + /* * Set up an AFTER-trigger statement context, unless told not to, or * unless it's EXPLAIN-only mode (when ExecutorFinish won't be called). @@ -331,9 +342,21 @@ standard_ExecutorRun(QueryDesc *queryDesc, */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - /* Allow instrumentation of Executor overall runtime */ + /* Start up instrumentation for this execution run */ if (queryDesc->totaltime) - InstrStartNode(queryDesc->totaltime); + { + InstrQueryStart(queryDesc->totaltime); + + /* + * Remember all node entries for abort recovery. We do this once here + * after InstrQueryStart has pushed the parent stack entry. + */ + if (estate->es_instrument && + estate->es_instrument->instr.need_stack && + !queryDesc->already_executed) + ExecRememberNodeInstrumentation(queryDesc->planstate, + queryDesc->totaltime); + } /* * extract information from the query descriptor and the query feature. @@ -385,7 +408,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, dest->rShutdown(dest); if (queryDesc->totaltime) - InstrStopNode(queryDesc->totaltime, estate->es_processed); + InstrQueryStop(queryDesc->totaltime); MemoryContextSwitchTo(oldcontext); } @@ -435,7 +458,7 @@ standard_ExecutorFinish(QueryDesc *queryDesc) /* Allow instrumentation of Executor overall runtime */ if (queryDesc->totaltime) - InstrStartNode(queryDesc->totaltime); + InstrQueryStart(queryDesc->totaltime); /* Run ModifyTable nodes to completion */ ExecPostprocessPlan(estate); @@ -444,8 +467,26 @@ standard_ExecutorFinish(QueryDesc *queryDesc) if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS)) AfterTriggerEndQuery(estate); + /* + * Accumulate per-node and trigger statistics to their respective parent + * instrumentation stacks. + * + * We skip this in parallel workers because their per-node stats are + * reported individually via ExecParallelReportInstrumentation, and the + * leader's own ExecFinalizeNodeInstrumentation handles propagation. If + * we accumulated here, the leader would double-count: worker parent nodes + * would already include their children's stats, and then the leader's + * accumulation would add the children again. + */ + if (queryDesc->totaltime && estate->es_instrument && !IsParallelWorker()) + { + ExecFinalizeNodeInstrumentation(queryDesc->planstate); + + ExecFinalizeTriggerInstrumentation(estate); + } + if (queryDesc->totaltime) - InstrStopNode(queryDesc->totaltime, 0); + InstrQueryStopFinalize(queryDesc->totaltime); MemoryContextSwitchTo(oldcontext); @@ -1263,7 +1304,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, Relation resultRelationDesc, Index resultRelationIndex, ResultRelInfo *partition_root_rri, - int instrument_options) + QueryInstrumentation *qinstr) { MemSet(resultRelInfo, 0, sizeof(ResultRelInfo)); resultRelInfo->type = T_ResultRelInfo; @@ -1284,8 +1325,8 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, palloc0_array(FmgrInfo, n); resultRelInfo->ri_TrigWhenExprs = (ExprState **) palloc0_array(ExprState *, n); - if (instrument_options) - resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options, false); + if (qinstr) + resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(qinstr, n); } else { @@ -1499,6 +1540,30 @@ ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo) return resultRelInfo->ri_ancestorResultRels; } +static void +ExecFinalizeTriggerInstrumentation(EState *estate) +{ + List *rels = NIL; + + rels = list_concat(rels, estate->es_tuple_routing_result_relations); + rels = list_concat(rels, estate->es_opened_result_relations); + rels = list_concat(rels, estate->es_trig_target_relations); + + foreach_node(ResultRelInfo, rInfo, rels) + { + TriggerInstrumentation *ti = rInfo->ri_TrigInstrument; + + if (ti == NULL || rInfo->ri_TrigDesc == NULL) + continue; + + for (int nt = 0; nt < rInfo->ri_TrigDesc->numtriggers; nt++) + { + if (ti[nt].instr.need_stack) + InstrAccumStack(&estate->es_instrument->instr, &ti[nt].instr); + } + } +} + /* ---------------------------------------------------------------- * ExecPostprocessPlan * diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index ac84af294c9f6..7a3903505645c 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -60,13 +60,12 @@ #define PARALLEL_KEY_EXECUTOR_FIXED UINT64CONST(0xE000000000000001) #define PARALLEL_KEY_PLANNEDSTMT UINT64CONST(0xE000000000000002) #define PARALLEL_KEY_PARAMLISTINFO UINT64CONST(0xE000000000000003) -#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xE000000000000004) +#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xE000000000000004) #define PARALLEL_KEY_TUPLE_QUEUE UINT64CONST(0xE000000000000005) -#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xE000000000000006) +#define PARALLEL_KEY_NODE_INSTRUMENTATION UINT64CONST(0xE000000000000006) #define PARALLEL_KEY_DSA UINT64CONST(0xE000000000000007) #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xE000000000000008) #define PARALLEL_KEY_JIT_INSTRUMENTATION UINT64CONST(0xE000000000000009) -#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xE00000000000000A) #define PARALLEL_TUPLE_QUEUE_SIZE 65536 @@ -87,7 +86,7 @@ typedef struct FixedParallelExecutorState * instrument_options: Same meaning here as in instrument.c. * * instrument_offset: Offset, relative to the start of this structure, - * of the first Instrumentation object. This will depend on the length of + * of the first NodeInstrumentation object. This will depend on the length of * the plan_node_id array. * * num_workers: Number of workers. @@ -104,11 +103,15 @@ struct SharedExecutorInstrumentation int num_workers; int num_plan_nodes; int plan_node_id[FLEXIBLE_ARRAY_MEMBER]; - /* array of num_plan_nodes * num_workers Instrumentation objects follows */ + + /* + * array of num_plan_nodes * num_workers NodeInstrumentation objects + * follows + */ }; #define GetInstrumentationArray(sei) \ (StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \ - (Instrumentation *) (((char *) sei) + sei->instrument_offset)) + (NodeInstrumentation *) (((char *) sei) + sei->instrument_offset)) /* Context object for ExecParallelEstimate. */ typedef struct ExecParallelEstimateContext @@ -621,8 +624,6 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, char *pstmt_data; char *pstmt_space; char *paramlistinfo_space; - BufferUsage *bufusage_space; - WalUsage *walusage_space; SharedExecutorInstrumentation *instrumentation = NULL; SharedJitInstrumentation *jit_instrumentation = NULL; int pstmt_len; @@ -686,21 +687,14 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, shm_toc_estimate_keys(&pcxt->estimator, 1); /* - * Estimate space for BufferUsage. + * Estimate space for Instrumentation. * * If EXPLAIN is not in use and there are no extensions loaded that care, * we could skip this. But we have no way of knowing whether anyone's - * looking at pgBufferUsage, so do it unconditionally. - */ - shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); - shm_toc_estimate_keys(&pcxt->estimator, 1); - - /* - * Same thing for WalUsage. + * looking at instrumentation, so do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(WalUsage), pcxt->nworkers)); + mul_size(sizeof(Instrumentation), pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); /* Estimate space for tuple queues. */ @@ -725,7 +719,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, instrumentation_len = MAXALIGN(instrumentation_len); instrument_offset = instrumentation_len; instrumentation_len += - mul_size(sizeof(Instrumentation), + mul_size(sizeof(NodeInstrumentation), mul_size(e.nnodes, nworkers)); shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len); shm_toc_estimate_keys(&pcxt->estimator, 1); @@ -786,17 +780,18 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, shm_toc_insert(pcxt->toc, PARALLEL_KEY_PARAMLISTINFO, paramlistinfo_space); SerializeParamList(estate->es_param_list_info, ¶mlistinfo_space); - /* Allocate space for each worker's BufferUsage; no need to initialize. */ - bufusage_space = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufusage_space); - pei->buffer_usage = bufusage_space; + /* + * Allocate space for each worker's Instrumentation; no need to + * initialize. + */ + { + Instrumentation *instr; - /* Same for WalUsage. */ - walusage_space = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage_space); - pei->wal_usage = walusage_space; + instr = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(Instrumentation), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr); + pei->instrumentation = instr; + } /* Set up the tuple queues that the workers will write into. */ pei->tqueue = ExecParallelSetupTupleQueues(pcxt, false); @@ -811,20 +806,20 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, */ if (estate->es_instrument) { - Instrumentation *instrument; + NodeInstrumentation *instrument; int i; instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len); - instrumentation->instrument_options = estate->es_instrument; + instrumentation->instrument_options = estate->es_instrument->instrument_options; instrumentation->instrument_offset = instrument_offset; instrumentation->num_workers = nworkers; instrumentation->num_plan_nodes = e.nnodes; instrument = GetInstrumentationArray(instrumentation); for (i = 0; i < nworkers * e.nnodes; ++i) - InstrInit(&instrument[i], estate->es_instrument); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, + InstrInitNode(&instrument[i], estate->es_instrument->instrument_options); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_NODE_INSTRUMENTATION, instrumentation); - pei->instrumentation = instrumentation; + pei->node_instrumentation = instrumentation; if (estate->es_jit_flags != PGJIT_NONE) { @@ -1053,7 +1048,7 @@ static bool ExecParallelRetrieveInstrumentation(PlanState *planstate, SharedExecutorInstrumentation *instrumentation) { - Instrumentation *instrument; + NodeInstrumentation *instrument; int i; int n; int ibytes; @@ -1071,19 +1066,33 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, instrument = GetInstrumentationArray(instrumentation); instrument += i * instrumentation->num_workers; for (n = 0; n < instrumentation->num_workers; ++n) + { InstrAggNode(planstate->instrument, &instrument[n]); + /* + * Also add worker WAL usage to the global pgWalUsage counter. + * + * When per-node instrumentation is active, parallel workers skip + * ExecFinalizeNodeInstrumentation (to avoid double-counting in + * EXPLAIN), so per-node WAL activity is not rolled up into the + * query-level stats that InstrAccumParallelQuery receives. Without + * this, pgWalUsage would under-report WAL generated by parallel + * workers when instrumentation is active. + */ + WalUsageAdd(&pgWalUsage, &instrument[n].instr.walusage); + } + /* * Also store the per-worker detail. * - * Worker instrumentation should be allocated in the same context as the - * regular instrumentation information, which is the per-query context. - * Switch into per-query memory context. + * Ensure worker instrumentation is allocated in the per-query context. We + * don't need to place this in the instrumentation context since no more + * stack-based instrumentation work is being done. */ oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt); - ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation)); + ibytes = mul_size(instrumentation->num_workers, sizeof(NodeInstrumentation)); planstate->worker_instrument = - palloc(ibytes + offsetof(WorkerInstrumentation, instrument)); + palloc(ibytes + offsetof(WorkerNodeInstrumentation, instrument)); MemoryContextSwitchTo(oldcontext); planstate->worker_instrument->num_workers = instrumentation->num_workers; @@ -1212,7 +1221,7 @@ ExecParallelFinish(ParallelExecutorInfo *pei) * finish, or we might get incomplete data.) */ for (i = 0; i < nworkers; i++) - InstrAccumParallelQuery(&pei->buffer_usage[i], &pei->wal_usage[i]); + InstrAccumParallelQuery(&pei->instrumentation[i]); pei->finished = true; } @@ -1226,10 +1235,10 @@ ExecParallelFinish(ParallelExecutorInfo *pei) void ExecParallelCleanup(ParallelExecutorInfo *pei) { - /* Accumulate instrumentation, if any. */ - if (pei->instrumentation) + /* Accumulate node instrumentation, if any. */ + if (pei->node_instrumentation) ExecParallelRetrieveInstrumentation(pei->planstate, - pei->instrumentation); + pei->node_instrumentation); /* Accumulate JIT instrumentation, if any. */ if (pei->jit_instrumentation) @@ -1313,7 +1322,7 @@ ExecParallelReportInstrumentation(PlanState *planstate, { int i; int plan_node_id = planstate->plan->plan_node_id; - Instrumentation *instrument; + NodeInstrumentation *instrument; InstrEndLoop(planstate->instrument); @@ -1452,8 +1461,7 @@ void ParallelQueryMain(dsm_segment *seg, shm_toc *toc) { FixedParallelExecutorState *fpes; - BufferUsage *buffer_usage; - WalUsage *wal_usage; + QueryInstrumentation *instr; DestReceiver *receiver; QueryDesc *queryDesc; SharedExecutorInstrumentation *instrumentation; @@ -1468,7 +1476,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) /* Set up DestReceiver, SharedExecutorInstrumentation, and QueryDesc. */ receiver = ExecParallelGetReceiver(seg, toc); - instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, true); + instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_NODE_INSTRUMENTATION, true); if (instrumentation != NULL) instrument_options = instrumentation->instrument_options; jit_instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_JIT_INSTRUMENTATION, @@ -1512,7 +1520,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) * leader, which also doesn't count buffer accesses and WAL activity that * occur during executor startup. */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Run the plan. If we specified a tuple bound, be careful not to demand @@ -1526,10 +1534,12 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) ExecutorFinish(queryDesc); /* Report buffer/WAL usage during parallel execution. */ - buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); - wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], - &wal_usage[ParallelWorkerNumber]); + { + Instrumentation *worker_instr; + + worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false); + InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]); + } /* Report instrumentation data if any instrumentation options are set. */ if (instrumentation != NULL) diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index d96d4f9947b79..6f2909a1bc35a 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -1381,7 +1381,7 @@ ExecInitPartitionDispatchInfo(EState *estate, { ResultRelInfo *rri = makeNode(ResultRelInfo); - InitResultRelInfo(rri, rel, 0, rootResultRelInfo, 0); + InitResultRelInfo(rri, rel, 0, rootResultRelInfo, NULL); proute->nonleaf_partitions[dispatchidx] = rri; } else diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index d35976925ae76..71a897f2b84a4 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -121,8 +121,9 @@ #include "nodes/nodeFuncs.h" static TupleTableSlot *ExecProcNodeFirst(PlanState *node); -static TupleTableSlot *ExecProcNodeInstr(PlanState *node); static bool ExecShutdownNode_walker(PlanState *node, void *context); +static bool ExecRememberNodeInstrumentation_walker(PlanState *node, void *context); +static bool ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context); /* ------------------------------------------------------------------------ @@ -414,8 +415,31 @@ ExecInitNode(Plan *node, EState *estate, int eflags) /* Set up instrumentation for this node if requested */ if (estate->es_instrument) - result->instrument = InstrAlloc(1, estate->es_instrument, - result->async_capable); + result->instrument = InstrAllocNode(estate->es_instrument, + result->async_capable); + + /* + * IndexScan / IndexOnlyScan track table and index access separately. + * + * We intentionally don't collect timing for them (even if enabled), since + * we don't need it, and executor nodes call InstrPushStack / + * InstrPopStack (instead of the full InstrNode*) to reduce overhead. + */ + if (estate->es_instrument && (estate->es_instrument->instrument_options & INSTRUMENT_BUFFERS) != 0) + { + if (IsA(result, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, result); + + InstrInitOptions(&iss->iss_Instrument->table_instr, INSTRUMENT_BUFFERS); + } + else if (IsA(result, IndexOnlyScanState)) + { + IndexOnlyScanState *ioss = castNode(IndexOnlyScanState, result); + + InstrInitOptions(&ioss->ioss_Instrument->table_instr, INSTRUMENT_BUFFERS); + } + } return result; } @@ -463,7 +487,7 @@ ExecProcNodeFirst(PlanState *node) * have ExecProcNode() directly call the relevant function from now on. */ if (node->instrument) - node->ExecProcNode = ExecProcNodeInstr; + node->ExecProcNode = InstrNodeSetupExecProcNode(node->instrument); else node->ExecProcNode = node->ExecProcNodeReal; @@ -471,25 +495,6 @@ ExecProcNodeFirst(PlanState *node) } -/* - * ExecProcNode wrapper that performs instrumentation calls. By keeping - * this a separate function, we avoid overhead in the normal case where - * no instrumentation is wanted. - */ -static TupleTableSlot * -ExecProcNodeInstr(PlanState *node) -{ - TupleTableSlot *result; - - InstrStartNode(node->instrument); - - result = node->ExecProcNodeReal(node); - - InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0); - - return result; -} - /* ---------------------------------------------------------------- * MultiExecProcNode @@ -788,10 +793,10 @@ ExecShutdownNode_walker(PlanState *node, void *context) * at least once already. We don't expect much CPU consumption during * node shutdown, but in the case of Gather or Gather Merge, we may shut * down workers at this stage. If so, their buffer usage will get - * propagated into pgBufferUsage at this point, and we want to make sure - * that it gets associated with the Gather node. We skip this if the node - * has never been executed, so as to avoid incorrectly making it appear - * that it has. + * propagated into the current instrumentation stack entry at this point, + * and we want to make sure that it gets associated with the Gather node. + * We skip this if the node has never been executed, so as to avoid + * incorrectly making it appear that it has. */ if (node->instrument && node->instrument->running) InstrStartNode(node->instrument); @@ -829,6 +834,110 @@ ExecShutdownNode_walker(PlanState *node, void *context) return false; } +/* + * ExecRememberNodeInstrumentation + * + * Register all per-node instrumentation entries as unfinalized children of + * the executor's instrumentation. This is needed for abort recovery: if the + * executor aborts, we need to walk each per-node entry to recover buffer/WAL + * data from nodes that never got finalized, that someone might be interested + * in as an aggregate. + */ +void +ExecRememberNodeInstrumentation(PlanState *node, QueryInstrumentation *parent) +{ + (void) ExecRememberNodeInstrumentation_walker(node, parent); +} + +static bool +ExecRememberNodeInstrumentation_walker(PlanState *node, void *context) +{ + QueryInstrumentation *parent = (QueryInstrumentation *) context; + + Assert(parent != NULL); + + if (node == NULL) + return false; + + if (node->instrument) + { + InstrQueryRememberChild(parent, &node->instrument->instr); + + /* IndexScan/IndexOnlyScan have a separate entry to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + InstrQueryRememberChild(parent, &iss->iss_Instrument->table_instr); + } + else if (IsA(node, IndexOnlyScanState)) + { + IndexOnlyScanState *ioss = castNode(IndexOnlyScanState, node); + + InstrQueryRememberChild(parent, &ioss->ioss_Instrument->table_instr); + } + } + + return planstate_tree_walker(node, ExecRememberNodeInstrumentation_walker, context); +} + +/* + * ExecFinalizeNodeInstrumentation + * + * Accumulate instrumentation stats from all execution nodes to their respective + * parents (or the original parent instrumentation). + * + * This must run after the cleanup done by ExecShutdownNode, and not rely on any + * resources cleaned up by it. We also expect shutdown actions to have occurred, + * e.g. parallel worker instrumentation to have been added to the leader. + */ +void +ExecFinalizeNodeInstrumentation(PlanState *node) +{ + (void) ExecFinalizeNodeInstrumentation_walker(node, instr_stack.current); +} + +static bool +ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context) +{ + Instrumentation *parent = (Instrumentation *) context; + + Assert(parent != NULL); + + if (node == NULL) + return false; + + /* + * Recurse into children first (bottom-up accumulation), passing our + * instrumentation as the parent context. This ensures children can + * accumulate to us even if they were never executed by the leader (e.g. + * nodes beneath Gather that only workers ran). + */ + planstate_tree_walker(node, ExecFinalizeNodeInstrumentation_walker, + node->instrument ? &node->instrument->instr : parent); + + if (!node->instrument) + return false; + + /* IndexScan/IndexOnlyScan have a separate entry to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + InstrFinalizeChild(&iss->iss_Instrument->table_instr, &node->instrument->instr); + } + else if (IsA(node, IndexOnlyScanState)) + { + IndexOnlyScanState *ioss = castNode(IndexOnlyScanState, node); + + InstrFinalizeChild(&ioss->ioss_Instrument->table_instr, &node->instrument->instr); + } + + InstrFinalizeChild(&node->instrument->instr, parent); + + return false; +} + /* * ExecSetTupleBound * diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 9886ab06b6955..c20dc50f6fdf0 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -150,7 +150,7 @@ CreateExecutorState(void) estate->es_total_processed = 0; estate->es_top_eflags = 0; - estate->es_instrument = 0; + estate->es_instrument = NULL; estate->es_finished = false; estate->es_exprcontexts = NIL; diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index a40610bc2522f..4ea807e295fe6 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -16,124 +16,419 @@ #include #include "executor/instrument.h" +#include "utils/memutils.h" +#include "utils/resowner.h" -BufferUsage pgBufferUsage; -static BufferUsage save_pgBufferUsage; WalUsage pgWalUsage; -static WalUsage save_pgWalUsage; +Instrumentation instr_top; +InstrStackState instr_stack = {0, 0, NULL, &instr_top}; -static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); -static void WalUsageAdd(WalUsage *dst, WalUsage *add); - - -/* Allocate new instrumentation structure(s) */ -Instrumentation * -InstrAlloc(int n, int instrument_options, bool async_mode) +void +InstrStackGrow(void) { - Instrumentation *instr; + int space = instr_stack.stack_space; - /* initialize all fields to zeroes, then modify as needed */ - instr = palloc0(n * sizeof(Instrumentation)); - if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL)) + if (instr_stack.entries == NULL) { - bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; - bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; - bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; - int i; - - for (i = 0; i < n; i++) - { - instr[i].need_bufusage = need_buffers; - instr[i].need_walusage = need_wal; - instr[i].need_timer = need_timer; - instr[i].async_mode = async_mode; - } + space = 10; /* Allocate sufficient initial space for + * typical activity */ + instr_stack.entries = MemoryContextAlloc(TopMemoryContext, + sizeof(Instrumentation *) * space); + } + else + { + space *= 2; + instr_stack.entries = repalloc_array(instr_stack.entries, Instrumentation *, space); } - return instr; + /* Update stack space after allocation succeeded to protect against OOMs */ + instr_stack.stack_space = space; +} + +/* General purpose instrumentation handling */ +static inline bool +InstrNeedStack(int instrument_options) +{ + return (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL)) != 0; } -/* Initialize a pre-allocated instrumentation structure. */ void -InstrInit(Instrumentation *instr, int instrument_options) +InstrInitOptions(Instrumentation *instr, int instrument_options) { - memset(instr, 0, sizeof(Instrumentation)); - instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0; - instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0; + instr->need_stack = InstrNeedStack(instrument_options); instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; } -/* Entry to a plan node */ +static inline void +InstrStartTimer(Instrumentation *instr) +{ + Assert(INSTR_TIME_IS_ZERO(instr->starttime)); + + INSTR_TIME_SET_CURRENT(instr->starttime); +} + +static inline void +InstrStopTimer(Instrumentation *instr) +{ + instr_time endtime; + + Assert(!INSTR_TIME_IS_ZERO(instr->starttime)); + + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime); + + INSTR_TIME_SET_ZERO(instr->starttime); +} + void -InstrStartNode(Instrumentation *instr) +InstrStart(Instrumentation *instr) { if (instr->need_timer) - { - if (!INSTR_TIME_IS_ZERO(instr->starttime)) - elog(ERROR, "InstrStartNode called twice in a row"); - else - INSTR_TIME_SET_CURRENT(instr->starttime); - } + InstrStartTimer(instr); + + if (instr->need_stack) + InstrPushStack(instr); +} - /* save buffer usage totals at node entry, if needed */ - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; +void +InstrStop(Instrumentation *instr) +{ + if (instr->need_timer) + InstrStopTimer(instr); - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + if (instr->need_stack) + InstrPopStack(instr); } -/* Exit from a plan node */ +/* + * Stops instrumentation, finalizes the stack entry and accumulates to its parent. + * + * Note that this intentionally allows passing a stack that is not the current + * top, as can happen with PG_FINALLY, or resource owners, which don't have a + * guaranteed cleanup order. + * + * We are careful here to achieve two goals: + * + * 1) Reset the stack to the parent of whichever of the released stack entries + * has the lowest index + * 2) Accumulate all instrumentation to the currently active instrumentation, + * so that callers get a complete picture of activity, even after an abort + */ void -InstrStopNode(Instrumentation *instr, double nTuples) +InstrStopFinalize(Instrumentation *instr) { - double save_tuplecount = instr->tuplecount; - instr_time endtime; + int idx = -1; - /* count the returned tuples */ - instr->tuplecount += nTuples; + for (int i = instr_stack.stack_size - 1; i >= 0; i--) + { + if (instr_stack.entries[i] == instr) + { + idx = i; + break; + } + } - /* let's update the time only if the timer was requested */ - if (instr->need_timer) + if (idx >= 0) { - if (INSTR_TIME_IS_ZERO(instr->starttime)) - elog(ERROR, "InstrStopNode called without start"); + while (instr_stack.stack_size > idx + 1) + instr_stack.stack_size--; - INSTR_TIME_SET_CURRENT(endtime); - INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime); + InstrPopStack(instr); + } + + if (instr->need_timer) + InstrStopTimer(instr); + + InstrAccumStack(instr_stack.current, instr); +} + +/* + * Finalize child instrumentation by accumulating buffer/WAL usage to the + * provided instrumentation, which may be the current entry, or one the caller + * treats as a parent and will add to the totals later. + * + * Also deletes the unfinalized entry to avoid double counting in an abort + * situation, e.g. during executor finish. + */ +void +InstrFinalizeChild(Instrumentation *instr, Instrumentation *parent) +{ + if (instr->need_stack) + { + if (!dlist_node_is_detached(&instr->unfinalized_entry)) + dlist_delete_thoroughly(&instr->unfinalized_entry); - INSTR_TIME_SET_ZERO(instr->starttime); + InstrAccumStack(parent, instr); } +} - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); +/* Query instrumentation handling */ - /* Is this the first tuple of this cycle? */ - if (!instr->running) +/* + * Use ResourceOwner mechanism to correctly reset instr_stack on abort. + */ +static void ResOwnerReleaseInstrumentation(Datum res); +static const ResourceOwnerDesc instrumentation_resowner_desc = +{ + .name = "instrumentation", + .release_phase = RESOURCE_RELEASE_AFTER_LOCKS, + .release_priority = RELEASE_PRIO_INSTRUMENTATION, + .ReleaseResource = ResOwnerReleaseInstrumentation, + .DebugPrint = NULL, /* default message is fine */ +}; + +static inline void +ResourceOwnerRememberInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr) +{ + ResourceOwnerRemember(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc); +} + +static inline void +ResourceOwnerForgetInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr) +{ + ResourceOwnerForget(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc); +} + +static void +ResOwnerReleaseInstrumentation(Datum res) +{ + QueryInstrumentation *qinstr = (QueryInstrumentation *) DatumGetPointer(res); + MemoryContext instr_cxt = qinstr->instr_cxt; + dlist_mutable_iter iter; + + /* Accumulate data from all unfinalized child entries (nodes, triggers) */ + dlist_foreach_modify(iter, &qinstr->unfinalized_entries) { - instr->running = true; - instr->firsttuple = instr->counter; + Instrumentation *child = dlist_container(Instrumentation, unfinalized_entry, iter.cur); + + InstrAccumStack(&qinstr->instr, child); } + + /* Ensure the stack is reset as expected, and we accumulate to the parent */ + InstrStopFinalize(&qinstr->instr); + + /* + * Destroy the dedicated instrumentation context, which frees the + * QueryInstrumentation and all child allocations. + */ + MemoryContextDelete(instr_cxt); +} + +QueryInstrumentation * +InstrQueryAlloc(int instrument_options) +{ + QueryInstrumentation *instr; + MemoryContext instr_cxt; + + /* + * When the instrumentation stack is used, create a dedicated memory + * context for this query's instrumentation allocations. This context is a + * child of TopMemoryContext so it survives transaction abort — + * ResourceOwner release needs to access it. + * + * For simpler cases (timer/rows only), use the current memory context. + * + * All child instrumentation allocations (nodes, triggers, etc) must be + * allocated within this context to ensure correct clean up on abort. + */ + if (InstrNeedStack(instrument_options)) + instr_cxt = AllocSetContextCreate(TopMemoryContext, + "Instrumentation", + ALLOCSET_SMALL_SIZES); else + instr_cxt = CurrentMemoryContext; + + instr = MemoryContextAllocZero(instr_cxt, sizeof(QueryInstrumentation)); + instr->instrument_options = instrument_options; + instr->instr_cxt = instr_cxt; + + InstrInitOptions(&instr->instr, instrument_options); + dlist_init(&instr->unfinalized_entries); + + return instr; +} + +void +InstrQueryStart(QueryInstrumentation *qinstr) +{ + InstrStart(&qinstr->instr); + + if (qinstr->instr.need_stack) + { + Assert(CurrentResourceOwner != NULL); + qinstr->owner = CurrentResourceOwner; + + ResourceOwnerEnlarge(qinstr->owner); + ResourceOwnerRememberInstrumentation(qinstr->owner, qinstr); + } +} + +void +InstrQueryStop(QueryInstrumentation *qinstr) +{ + InstrStop(&qinstr->instr); + + if (qinstr->instr.need_stack) { - /* - * In async mode, if the plan node hadn't emitted any tuples before, - * this might be the first tuple - */ - if (instr->async_mode && save_tuplecount < 1.0) - instr->firsttuple = instr->counter; + Assert(qinstr->owner != NULL); + ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr); + qinstr->owner = NULL; } } +void +InstrQueryStopFinalize(QueryInstrumentation *qinstr) +{ + InstrStopFinalize(&qinstr->instr); + + if (!qinstr->instr.need_stack) + return; + + Assert(qinstr->owner != NULL); + ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr); + qinstr->owner = NULL; + + /* + * Reparent the dedicated instrumentation context under the current memory + * context, so that its lifetime is now tied to the caller's context + * rather than TopMemoryContext. + */ + MemoryContextSetParent(qinstr->instr_cxt, CurrentMemoryContext); +} + +/* + * Register a child Instrumentation entry for abort processing. + * + * On abort, ResOwnerReleaseInstrumentation will walk the parent's list to + * recover buffer/WAL data from entries that were never finalized, in order for + * aggregate totals to be accurate despite the query erroring out. + */ +void +InstrQueryRememberChild(QueryInstrumentation *parent, Instrumentation *child) +{ + if (child->need_stack) + dlist_push_head(&parent->unfinalized_entries, &child->unfinalized_entry); +} + +/* start instrumentation during parallel executor startup */ +QueryInstrumentation * +InstrStartParallelQuery(void) +{ + QueryInstrumentation *qinstr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + + InstrQueryStart(qinstr); + return qinstr; +} + +/* report usage after parallel executor shutdown */ +void +InstrEndParallelQuery(QueryInstrumentation *qinstr, Instrumentation *dst) +{ + InstrQueryStopFinalize(qinstr); + dst->need_stack = qinstr->instr.need_stack; + memcpy(&dst->bufusage, &qinstr->instr.bufusage, sizeof(BufferUsage)); + memcpy(&dst->walusage, &qinstr->instr.walusage, sizeof(WalUsage)); +} + +/* + * Accumulate work done by parallel workers in the leader's stats. + * + * Note that what gets added here effectively depends on whether per-node + * instrumentation is active. If it's active the parallel worker intentionally + * skips ExecFinalizeNodeInstrumentation on executor shutdown, because it would + * cause double counting. Instead, this only accumulates any extra activity + * outside of nodes. + * + * Otherwise this is responsible for making sure that the complete query + * activity is accumulated. + */ +void +InstrAccumParallelQuery(Instrumentation *instr) +{ + InstrAccumStack(instr_stack.current, instr); + + WalUsageAdd(&pgWalUsage, &instr->walusage); +} + +/* Node instrumentation handling */ + +/* Allocate new node instrumentation structure */ +NodeInstrumentation * +InstrAllocNode(QueryInstrumentation *qinstr, bool async_mode) +{ + NodeInstrumentation *instr = MemoryContextAlloc(qinstr->instr_cxt, sizeof(NodeInstrumentation)); + + InstrInitNode(instr, qinstr->instrument_options); + instr->async_mode = async_mode; + + return instr; +} + +/* Initialize a pre-allocated instrumentation structure. */ +void +InstrInitNode(NodeInstrumentation *instr, int instrument_options) +{ + memset(instr, 0, sizeof(NodeInstrumentation)); + InstrInitOptions(&instr->instr, instrument_options); +} + +/* Entry to a plan node. If you modify this, check InstrNodeSetupExecProcNode. */ +void +InstrStartNode(NodeInstrumentation *instr) +{ + InstrStart(&instr->instr); +} + +/* + * Updates the node instrumentation time counter. + * + * Note this is different from InstrStop because total is only updated in + * InstrEndLoop. We need the separate counter variable because we need to + * calculate start-up time for the first tuple in each cycle, and then + * accumulate it together. + */ +static inline void +InstrStopNodeTimer(NodeInstrumentation *instr) +{ + instr_time endtime; + + Assert(!INSTR_TIME_IS_ZERO(instr->instr.starttime)); + + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->instr.starttime); + INSTR_TIME_SET_ZERO(instr->instr.starttime); + + /* + * Is this the first tuple of this cycle? + * + * In async mode, if the plan node hadn't emitted any tuples before, this + * might be the first tuple + */ + if (!instr->running || (instr->async_mode && instr->tuplecount < 1.0)) + instr->firsttuple = instr->counter; +} + +/* Exit from a plan node. If you modify this, check InstrNodeSetupExecProcNode. */ +void +InstrStopNode(NodeInstrumentation *instr, double nTuples) +{ + if (instr->instr.need_timer) + InstrStopNodeTimer(instr); + + /* Only pop the stack, accumulation runs in InstrFinalizeNode */ + if (instr->instr.need_stack) + InstrPopStack(&instr->instr); + + instr->running = true; + + /* count the returned tuples */ + instr->tuplecount += nTuples; +} + /* Update tuple count */ void -InstrUpdateTupleCount(Instrumentation *instr, double nTuples) +InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples) { /* count the returned tuples */ instr->tuplecount += nTuples; @@ -141,24 +436,24 @@ InstrUpdateTupleCount(Instrumentation *instr, double nTuples) /* Finish a run cycle for a plan node */ void -InstrEndLoop(Instrumentation *instr) +InstrEndLoop(NodeInstrumentation *instr) { /* Skip if nothing has happened, or already shut down */ if (!instr->running) return; - if (!INSTR_TIME_IS_ZERO(instr->starttime)) - elog(ERROR, "InstrEndLoop called on running node"); + /* Ensure InstrNodeStop was called */ + Assert(INSTR_TIME_IS_ZERO(instr->instr.starttime)); /* Accumulate per-cycle statistics into totals */ INSTR_TIME_ADD(instr->startup, instr->firsttuple); - INSTR_TIME_ADD(instr->total, instr->counter); + INSTR_TIME_ADD(instr->instr.total, instr->counter); instr->ntuples += instr->tuplecount; instr->nloops += 1; /* Reset for next cycle (if any) */ instr->running = false; - INSTR_TIME_SET_ZERO(instr->starttime); + INSTR_TIME_SET_ZERO(instr->instr.starttime); INSTR_TIME_SET_ZERO(instr->counter); INSTR_TIME_SET_ZERO(instr->firsttuple); instr->tuplecount = 0; @@ -166,7 +461,7 @@ InstrEndLoop(Instrumentation *instr) /* aggregate instrumentation information */ void -InstrAggNode(Instrumentation *dst, Instrumentation *add) +InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add) { if (!dst->running && add->running) { @@ -181,7 +476,7 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add) dst->tuplecount += add->tuplecount; INSTR_TIME_ADD(dst->startup, add->startup); - INSTR_TIME_ADD(dst->total, add->total); + INSTR_TIME_ADD(dst->instr.total, add->instr.total); dst->ntuples += add->ntuples; dst->ntuples2 += add->ntuples2; dst->nloops += add->nloops; @@ -189,41 +484,166 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add) dst->nfiltered2 += add->nfiltered2; /* Add delta of buffer usage since entry to node's totals */ - if (dst->need_bufusage) - BufferUsageAdd(&dst->bufusage, &add->bufusage); + if (dst->instr.need_stack) + InstrAccumStack(&dst->instr, &add->instr); +} + +/* + * Specialized handling of instrumented ExecProcNode + * + * These functions are equivalent to running ExecProcNodeReal wrapped in + * InstrStartNode and InstrStopNode, but avoid the conditionals in the hot path + * by checking the instrumentation options when the ExecProcNode pointer gets + * first set, and then using a special-purpose function for each. This results + * in a more optimized set of compiled instructions. + */ + +#include "executor/tuptable.h" +#include "nodes/execnodes.h" + +/* Simplified pop: restore saved state instead of re-deriving from array */ +static inline void +InstrPopStackTo(Instrumentation *prev) +{ + Assert(instr_stack.stack_size > 0); + Assert(instr_stack.stack_size > 1 ? instr_stack.entries[instr_stack.stack_size - 2] == prev : &instr_top == prev); + instr_stack.stack_size--; + instr_stack.current = prev; +} + +static inline TupleTableSlot * +ExecProcNodeInstr(PlanState *node, bool need_timer, bool need_stack) +{ + NodeInstrumentation *instr = node->instrument; + Instrumentation *prev = instr_stack.current; + TupleTableSlot *result; + + if (need_stack) + InstrPushStack(&instr->instr); + if (need_timer) + InstrStartTimer(&instr->instr); + + result = node->ExecProcNodeReal(node); + + if (need_timer) + InstrStopNodeTimer(instr); + if (need_stack) + InstrPopStackTo(prev); + + instr->running = true; + if (!TupIsNull(result)) + instr->tuplecount += 1.0; + + return result; +} + +static TupleTableSlot * +ExecProcNodeInstrFull(PlanState *node) +{ + return ExecProcNodeInstr(node, true, true); +} + +static TupleTableSlot * +ExecProcNodeInstrRowsStackOnly(PlanState *node) +{ + return ExecProcNodeInstr(node, false, true); +} + +static TupleTableSlot * +ExecProcNodeInstrRowsTimerOnly(PlanState *node) +{ + return ExecProcNodeInstr(node, true, false); +} + +static TupleTableSlot * +ExecProcNodeInstrRowsOnly(PlanState *node) +{ + return ExecProcNodeInstr(node, false, false); +} + +/* + * Returns an ExecProcNode wrapper that performs instrumentation calls, + * tailored to the instrumentation options enabled for the node. + */ +ExecProcNodeMtd +InstrNodeSetupExecProcNode(NodeInstrumentation *instr) +{ + bool need_timer = instr->instr.need_timer; + bool need_stack = instr->instr.need_stack; + + if (need_timer && need_stack) + return ExecProcNodeInstrFull; + else if (need_stack) + return ExecProcNodeInstrRowsStackOnly; + else if (need_timer) + return ExecProcNodeInstrRowsTimerOnly; + else + return ExecProcNodeInstrRowsOnly; +} - if (dst->need_walusage) - WalUsageAdd(&dst->walusage, &add->walusage); +/* Trigger instrumentation handling */ +TriggerInstrumentation * +InstrAllocTrigger(QueryInstrumentation *qinstr, int n) +{ + TriggerInstrumentation *tginstr; + int i; + + /* + * Allocate in the query's dedicated instrumentation context so all + * instrumentation data is grouped together and cleaned up as a unit. + */ + Assert(qinstr != NULL && qinstr->instr_cxt != NULL); + tginstr = MemoryContextAllocZero(qinstr->instr_cxt, + n * sizeof(TriggerInstrumentation)); + + for (i = 0; i < n; i++) + InstrInitOptions(&tginstr[i].instr, qinstr->instrument_options); + + return tginstr; } -/* note current values during parallel executor startup */ void -InstrStartParallelQuery(void) +InstrStartTrigger(QueryInstrumentation *qinstr, TriggerInstrumentation *tginstr) { - save_pgBufferUsage = pgBufferUsage; - save_pgWalUsage = pgWalUsage; + InstrStart(&tginstr->instr); + + /* + * On first call, register with the parent QueryInstrumentation for abort + * recovery. + */ + if (qinstr && tginstr->instr.need_stack && + dlist_node_is_detached(&tginstr->instr.unfinalized_entry)) + dlist_push_head(&qinstr->unfinalized_entries, + &tginstr->instr.unfinalized_entry); } -/* report usage after parallel executor shutdown */ void -InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +InstrStopTrigger(TriggerInstrumentation *tginstr, int firings) { - memset(bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage); - memset(walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage); + /* + * This trigger may be called again, so we don't finalize instrumentation + * here. Accumulation to the parent happens at ExecutorFinish through + * ExecFinalizeTriggerInstrumentation. + */ + InstrStop(&tginstr->instr); + tginstr->firings += firings; } -/* accumulate work done by workers in leader's stats */ void -InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +InstrAccumStack(Instrumentation *dst, Instrumentation *add) { - BufferUsageAdd(&pgBufferUsage, bufusage); - WalUsageAdd(&pgWalUsage, walusage); + Assert(dst != NULL); + Assert(add != NULL); + + if (!add->need_stack) + return; + + BufferUsageAdd(&dst->bufusage, &add->bufusage); + WalUsageAdd(&dst->walusage, &add->walusage); } /* dst += add */ -static void +void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) { dst->shared_blks_hit += add->shared_blks_hit; @@ -244,39 +664,9 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time); } -/* dst += add - sub */ +/* dst += add */ void -BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, - const BufferUsage *sub) -{ - dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit; - dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read; - dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied; - dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written; - dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit; - dst->local_blks_read += add->local_blks_read - sub->local_blks_read; - dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied; - dst->local_blks_written += add->local_blks_written - sub->local_blks_written; - dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; - dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time, - add->shared_blk_read_time, sub->shared_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time, - add->shared_blk_write_time, sub->shared_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_read_time, - add->local_blk_read_time, sub->local_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_write_time, - add->local_blk_write_time, sub->local_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time, - add->temp_blk_read_time, sub->temp_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time, - add->temp_blk_write_time, sub->temp_blk_write_time); -} - -/* helper functions for WAL usage accumulation */ -static void -WalUsageAdd(WalUsage *dst, WalUsage *add) +WalUsageAdd(WalUsage *dst, const WalUsage *add) { dst->wal_bytes += add->wal_bytes; dst->wal_records += add->wal_records; diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c index 70c55ee6d614d..63e24a0bcd4cf 100644 --- a/src/backend/executor/nodeBitmapIndexscan.c +++ b/src/backend/executor/nodeBitmapIndexscan.c @@ -276,7 +276,7 @@ ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags) /* Set up instrumentation of bitmap index scans if requested */ if (estate->es_instrument) - indexstate->biss_Instrument = palloc0_object(IndexScanInstrumentation); + indexstate->biss_Instrument = MemoryContextAllocZero(estate->es_instrument->instr_cxt, sizeof(IndexScanInstrumentation)); /* Open the index relation. */ lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode; diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 9eab81fd1c85b..66b02788b3c7f 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -67,6 +67,7 @@ IndexOnlyNext(IndexOnlyScanState *node) IndexScanDesc scandesc; TupleTableSlot *slot; ItemPointer tid; + Instrumentation *table_instr = NULL; /* * extract necessary information from index scan node @@ -83,6 +84,9 @@ IndexOnlyNext(IndexOnlyScanState *node) econtext = node->ss.ps.ps_ExprContext; slot = node->ss.ss_ScanTupleSlot; + if (node->ioss_Instrument && node->ioss_Instrument->table_instr.need_stack) + table_instr = &node->ioss_Instrument->table_instr; + if (scandesc == NULL) { /* @@ -163,11 +167,22 @@ IndexOnlyNext(IndexOnlyScanState *node) ItemPointerGetBlockNumber(tid), &node->ioss_VMBuffer)) { + bool found; + /* * Rats, we have to visit the heap to check visibility. */ InstrCountTuples2(node, 1); - if (!index_fetch_heap(scandesc, node->ioss_TableSlot)) + + if (table_instr) + InstrPushStack(table_instr); + + found = index_fetch_heap(scandesc, node->ioss_TableSlot); + + if (table_instr) + InstrPopStack(table_instr); + + if (!found) continue; /* no visible tuple, try next index entry */ ExecClearTuple(node->ioss_TableSlot); @@ -434,6 +449,7 @@ ExecEndIndexOnlyScan(IndexOnlyScanState *node) * which will have a new IndexOnlyScanState and zeroed stats. */ winstrument->nsearches += node->ioss_Instrument->nsearches; + InstrAccumStack(&winstrument->table_instr, &node->ioss_Instrument->table_instr); } /* @@ -608,7 +624,7 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) /* Set up instrumentation of index-only scans if requested */ if (estate->es_instrument) - indexstate->ioss_Instrument = palloc0_object(IndexScanInstrumentation); + indexstate->ioss_Instrument = MemoryContextAllocZero(estate->es_instrument->instr_cxt, sizeof(IndexScanInstrumentation)); /* Open the index relation. */ lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode; @@ -893,4 +909,11 @@ ExecIndexOnlyScanRetrieveInstrumentation(IndexOnlyScanState *node) SharedInfo->num_workers * sizeof(IndexScanInstrumentation); node->ioss_SharedInfo = palloc(size); memcpy(node->ioss_SharedInfo, SharedInfo, size); + + /* Aggregate workers' table buffer/WAL usage into leader's entry */ + for (int i = 0; i < node->ioss_SharedInfo->num_workers; i++) + { + InstrAccumStack(&node->ioss_Instrument->table_instr, + &node->ioss_SharedInfo->winstrument[i].table_instr); + } } diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 06143e94c5a44..e66b6d6407b63 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -85,7 +85,10 @@ IndexNext(IndexScanState *node) ExprContext *econtext; ScanDirection direction; IndexScanDesc scandesc; + ItemPointer tid; TupleTableSlot *slot; + bool found; + Instrumentation *table_instr = NULL; /* * extract necessary information from index scan node @@ -102,6 +105,9 @@ IndexNext(IndexScanState *node) econtext = node->ss.ps.ps_ExprContext; slot = node->ss.ss_ScanTupleSlot; + if (node->iss_Instrument && node->iss_Instrument->table_instr.need_stack) + table_instr = &node->iss_Instrument->table_instr; + if (scandesc == NULL) { /* @@ -130,8 +136,24 @@ IndexNext(IndexScanState *node) /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { + if (table_instr) + InstrPushStack(table_instr); + + for (;;) + { + found = index_fetch_heap(scandesc, slot); + if (found || !scandesc->xs_heap_continue) + break; + } + + if (table_instr) + InstrPopStack(table_instr); + + if (unlikely(!found)) + continue; + CHECK_FOR_INTERRUPTS(); /* @@ -179,6 +201,7 @@ IndexNextWithReorder(IndexScanState *node) Datum *lastfetched_vals; bool *lastfetched_nulls; int cmp; + Instrumentation *table_instr = NULL; estate = node->ss.ps.state; @@ -198,6 +221,9 @@ IndexNextWithReorder(IndexScanState *node) econtext = node->ss.ps.ps_ExprContext; slot = node->ss.ss_ScanTupleSlot; + if (node->iss_Instrument && node->iss_Instrument->table_instr.need_stack) + table_instr = &node->iss_Instrument->table_instr; + if (scandesc == NULL) { /* @@ -259,36 +285,67 @@ IndexNextWithReorder(IndexScanState *node) } /* - * Fetch next tuple from the index. + * Fetch next valid tuple from the index. */ -next_indextuple: - if (!index_getnext_slot(scandesc, ForwardScanDirection, slot)) + for (;;) { + ItemPointer tid; + bool found; + + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid(scandesc, ForwardScanDirection); + + /* If we're out of index entries, we're done */ + if (tid == NULL) + { + /* + * No more tuples from the index. But we still need to drain + * any remaining tuples from the queue before we're done. + */ + node->iss_ReachedEnd = true; + break; + } + + Assert(ItemPointerEquals(tid, &scandesc->xs_heaptid)); + + if (table_instr) + InstrPushStack(table_instr); + + for (;;) + { + found = index_fetch_heap(scandesc, slot); + if (found || !scandesc->xs_heap_continue) + break; + } + + if (table_instr) + InstrPopStack(table_instr); + /* - * No more tuples from the index. But we still need to drain any - * remaining tuples from the queue before we're done. + * If the index was lossy, we have to recheck the index quals and + * ORDER BY expressions using the fetched tuple. */ - node->iss_ReachedEnd = true; - continue; - } - - /* - * If the index was lossy, we have to recheck the index quals and - * ORDER BY expressions using the fetched tuple. - */ - if (scandesc->xs_recheck) - { - econtext->ecxt_scantuple = slot; - if (!ExecQualAndReset(node->indexqualorig, econtext)) + if (found && scandesc->xs_recheck) { - /* Fails recheck, so drop it and loop back for another */ - InstrCountFiltered2(node, 1); - /* allow this loop to be cancellable */ - CHECK_FOR_INTERRUPTS(); - goto next_indextuple; + econtext->ecxt_scantuple = slot; + if (!ExecQualAndReset(node->indexqualorig, econtext)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + /* allow this loop to be cancellable */ + CHECK_FOR_INTERRUPTS(); + continue; + } } + + if (found) + break; } + /* No more index entries, re-run to clear the reorder queue */ + if (node->iss_ReachedEnd) + continue; + if (scandesc->xs_recheckorderby) { econtext->ecxt_scantuple = slot; @@ -814,6 +871,7 @@ ExecEndIndexScan(IndexScanState *node) * which will have a new IndexOnlyScanState and zeroed stats. */ winstrument->nsearches += node->iss_Instrument->nsearches; + InstrAccumStack(&winstrument->table_instr, &node->iss_Instrument->table_instr); } /* @@ -976,7 +1034,7 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) /* Set up instrumentation of index scans if requested */ if (estate->es_instrument) - indexstate->iss_Instrument = palloc0_object(IndexScanInstrumentation); + indexstate->iss_Instrument = MemoryContextAllocZero(estate->es_instrument->instr_cxt, sizeof(IndexScanInstrumentation)); /* Open the index relation. */ lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode; @@ -1826,4 +1884,11 @@ ExecIndexScanRetrieveInstrumentation(IndexScanState *node) SharedInfo->num_workers * sizeof(IndexScanInstrumentation); node->iss_SharedInfo = palloc(size); memcpy(node->iss_SharedInfo, SharedInfo, size); + + /* Aggregate workers' table buffer/WAL usage into leader's entry */ + for (int i = 0; i < node->iss_SharedInfo->num_workers; i++) + { + InstrAccumStack(&node->iss_Instrument->table_instr, + &node->iss_SharedInfo->winstrument[i].table_instr); + } } diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 2d7708805a688..f655035e213ed 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -903,7 +903,7 @@ create_edata_for_relation(LogicalRepRelMapEntry *rel) * Use Relation opened by logicalrep_rel_open() instead of opening it * again. */ - InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, 0); + InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, NULL); /* * We put the ResultRelInfo in the es_opened_result_relations list, even diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 00bc609529a5b..41a0baa344991 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -835,7 +835,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN { PinLocalBuffer(bufHdr, true); - pgBufferUsage.local_blks_hit++; + INSTR_BUFUSAGE_INCR(local_blks_hit); return true; } @@ -856,7 +856,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN { if (BufferTagsEqual(&tag, &bufHdr->tag)) { - pgBufferUsage.shared_blks_hit++; + INSTR_BUFUSAGE_INCR(shared_blks_hit); return true; } UnpinBuffer(bufHdr); @@ -1257,21 +1257,21 @@ PinBufferForBlock(Relation rel, { bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr); if (*foundPtr) - pgBufferUsage.local_blks_hit++; + INSTR_BUFUSAGE_INCR(local_blks_hit); } else { bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum, strategy, foundPtr, io_context); if (*foundPtr) - pgBufferUsage.shared_blks_hit++; + INSTR_BUFUSAGE_INCR(shared_blks_hit); } if (rel) { /* - * While pgBufferUsage's "read" counter isn't bumped unless we reach - * WaitReadBuffers() (so, not for hits, and not for buffers that are - * zeroed instead), the per-relation stats always count them. + * While the current buffer usage "read" counter isn't bumped unless + * we reach WaitReadBuffers() (so, not for hits, and not for buffers + * that are zeroed instead), the per-relation stats always count them. */ pgstat_count_buffer_read(rel); if (*foundPtr) @@ -1998,9 +1998,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) true); if (persistence == RELPERSISTENCE_TEMP) - pgBufferUsage.local_blks_hit += 1; + INSTR_BUFUSAGE_INCR(local_blks_hit); else - pgBufferUsage.shared_blks_hit += 1; + INSTR_BUFUSAGE_INCR(shared_blks_hit); if (operation->rel) pgstat_count_buffer_hit(operation->rel); @@ -2068,9 +2068,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) io_start, 1, io_buffers_len * BLCKSZ); if (persistence == RELPERSISTENCE_TEMP) - pgBufferUsage.local_blks_read += io_buffers_len; + INSTR_BUFUSAGE_ADD(local_blks_read, io_buffers_len); else - pgBufferUsage.shared_blks_read += io_buffers_len; + INSTR_BUFUSAGE_ADD(shared_blks_read, io_buffers_len); /* * Track vacuum cost when issuing IO, not after waiting for it. @@ -2959,7 +2959,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, TerminateBufferIO(buf_hdr, false, BM_VALID, true, false); } - pgBufferUsage.shared_blks_written += extend_by; + INSTR_BUFUSAGE_ADD(shared_blks_written, extend_by); *extended_by = extend_by; @@ -3105,7 +3105,7 @@ MarkBufferDirty(Buffer buffer) */ if (!(old_buf_state & BM_DIRTY)) { - pgBufferUsage.shared_blks_dirtied++; + INSTR_BUFUSAGE_INCR(shared_blks_dirtied); if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } @@ -4520,7 +4520,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_WRITE, io_start, 1, BLCKSZ); - pgBufferUsage.shared_blks_written++; + INSTR_BUFUSAGE_INCR(shared_blks_written); /* * Mark the buffer as clean and end the BM_IO_IN_PROGRESS state. @@ -5663,7 +5663,7 @@ MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, UnlockBufHdr(bufHdr); } - pgBufferUsage.shared_blks_dirtied++; + INSTR_BUFUSAGE_INCR(shared_blks_dirtied); if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 404c6bccbdd7e..8845b0aeed62d 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -217,7 +217,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln) /* Mark not-dirty */ TerminateLocalBufferIO(bufHdr, true, 0, false); - pgBufferUsage.local_blks_written++; + INSTR_BUFUSAGE_INCR(local_blks_written); } static Buffer @@ -478,7 +478,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr, *extended_by = extend_by; - pgBufferUsage.local_blks_written += extend_by; + INSTR_BUFUSAGE_ADD(local_blks_written, extend_by); return first_block; } @@ -509,7 +509,7 @@ MarkLocalBufferDirty(Buffer buffer) buf_state = pg_atomic_read_u64(&bufHdr->state); if (!(buf_state & BM_DIRTY)) - pgBufferUsage.local_blks_dirtied++; + INSTR_BUFUSAGE_INCR(local_blks_dirtied); buf_state |= BM_DIRTY; diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index c4afe4d368a34..8b501dfcadd02 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -475,13 +475,13 @@ BufFileLoadBuffer(BufFile *file) if (track_io_timing) { INSTR_TIME_SET_CURRENT(io_time); - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_read_time, io_time, io_start); + INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_read_time, io_time, io_start); } /* we choose not to advance curOffset here */ if (file->nbytes > 0) - pgBufferUsage.temp_blks_read++; + INSTR_BUFUSAGE_INCR(temp_blks_read); } /* @@ -549,13 +549,13 @@ BufFileDumpBuffer(BufFile *file) if (track_io_timing) { INSTR_TIME_SET_CURRENT(io_time); - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_write_time, io_time, io_start); + INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_write_time, io_time, io_start); } file->curOffset += bytestowrite; wpos += bytestowrite; - pgBufferUsage.temp_blks_written++; + INSTR_BUFUSAGE_INCR(temp_blks_written); } file->dirty = false; diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index 28de24538dced..60400f0c81f42 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -114,9 +114,9 @@ pgstat_prepare_io_time(bool track_io_guc) * pg_stat_database only counts block read and write times, these are done for * IOOP_READ, IOOP_WRITE and IOOP_EXTEND. * - * pgBufferUsage is used for EXPLAIN. pgBufferUsage has write and read stats - * for shared, local and temporary blocks. pg_stat_io does not track the - * activity of temporary blocks, so these are ignored here. + * Executor instrumentation is used for EXPLAIN. Buffer usage tracked there has + * write and read stats for shared, local and temporary blocks. pg_stat_io + * does not track the activity of temporary blocks, so these are ignored here. */ void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, @@ -135,17 +135,17 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, { pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time)); if (io_object == IOOBJECT_RELATION) - INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(shared_blk_write_time, io_time); else if (io_object == IOOBJECT_TEMP_RELATION) - INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(local_blk_write_time, io_time); } else if (io_op == IOOP_READ) { pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time)); if (io_object == IOOBJECT_RELATION) - INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(shared_blk_read_time, io_time); else if (io_object == IOOBJECT_TEMP_RELATION) - INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(local_blk_read_time, io_time); } } diff --git a/src/include/commands/explain_dr.h b/src/include/commands/explain_dr.h index f98eaae186457..fa98d29589fc2 100644 --- a/src/include/commands/explain_dr.h +++ b/src/include/commands/explain_dr.h @@ -23,11 +23,10 @@ typedef struct ExplainState ExplainState; typedef struct SerializeMetrics { uint64 bytesSent; /* # of bytes serialized */ - instr_time timeSpent; /* time spent serializing */ - BufferUsage bufferUsage; /* buffers accessed during serialization */ + Instrumentation instr; /* per-tuple timing/buffer measurement */ } SerializeMetrics; extern DestReceiver *CreateExplainSerializeDestReceiver(ExplainState *es); -extern SerializeMetrics GetSerializationMetrics(DestReceiver *dest); +extern SerializeMetrics *GetSerializationMetrics(DestReceiver *dest); #endif diff --git a/src/include/executor/execParallel.h b/src/include/executor/execParallel.h index 5a2034811d563..6c8b602d07f98 100644 --- a/src/include/executor/execParallel.h +++ b/src/include/executor/execParallel.h @@ -25,9 +25,8 @@ typedef struct ParallelExecutorInfo { PlanState *planstate; /* plan subtree we're running in parallel */ ParallelContext *pcxt; /* parallel context we're using */ - BufferUsage *buffer_usage; /* points to bufusage area in DSM */ - WalUsage *wal_usage; /* walusage area in DSM */ - SharedExecutorInstrumentation *instrumentation; /* optional */ + Instrumentation *instrumentation; /* instrumentation area in DSM */ + SharedExecutorInstrumentation *node_instrumentation; /* optional */ struct SharedJitInstrumentation *jit_instrumentation; /* optional */ dsa_area *area; /* points to DSA area in DSM */ dsa_pointer param_exec; /* serialized PARAM_EXEC parameters */ diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h index d3a572428449d..340029a203422 100644 --- a/src/include/executor/execdesc.h +++ b/src/include/executor/execdesc.h @@ -51,8 +51,8 @@ typedef struct QueryDesc /* This field is set by ExecutePlan */ bool already_executed; /* true if previously executed */ - /* This is always set NULL by the core system, but plugins can change it */ - struct Instrumentation *totaltime; /* total time spent in ExecutorRun */ + /* This field is set by ExecutorRun, or plugins */ + struct QueryInstrumentation *totaltime; /* total time spent in ExecutorRun */ } QueryDesc; /* in pquery.c */ diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 07f4b1f7490bf..f56b13841fb9e 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -233,6 +233,7 @@ ExecGetJunkAttribute(TupleTableSlot *slot, AttrNumber attno, bool *isNull) /* * prototypes from functions in execMain.c */ +typedef struct QueryInstrumentation QueryInstrumentation; extern void ExecutorStart(QueryDesc *queryDesc, int eflags); extern void standard_ExecutorStart(QueryDesc *queryDesc, int eflags); extern void ExecutorRun(QueryDesc *queryDesc, @@ -254,7 +255,7 @@ extern void InitResultRelInfo(ResultRelInfo *resultRelInfo, Relation resultRelationDesc, Index resultRelationIndex, ResultRelInfo *partition_root_rri, - int instrument_options); + QueryInstrumentation *qinstr); extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid, ResultRelInfo *rootRelInfo); extern List *ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo); @@ -301,6 +302,8 @@ extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function); extern Node *MultiExecProcNode(PlanState *node); extern void ExecEndNode(PlanState *node); extern void ExecShutdownNode(PlanState *node); +extern void ExecRememberNodeInstrumentation(PlanState *node, QueryInstrumentation *parent); +extern void ExecFinalizeNodeInstrumentation(PlanState *node); extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node); diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 9759f3ea5d8d9..a2590a2e5b19f 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -13,6 +13,7 @@ #ifndef INSTRUMENT_H #define INSTRUMENT_H +#include "lib/ilist.h" #include "portability/instr_time.h" @@ -67,56 +68,267 @@ typedef enum InstrumentOption INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; +/* + * Instrumentation base class for capturing time and WAL/buffer usage + * + * If used directly: + * - Allocate on the stack and zero initialize the struct + * - Call InstrInitOptions to set instrumentation options + * - Call InstrStart before the activity you want to measure + * - Call InstrStop / InstrStopFinalize after the activity to capture totals + * + * InstrStart/InstrStop may be called multiple times. The last stop call must + * be to InstrStopFinalize to ensure parent stack entries get the accumulated + * totals. If there is risk of transaction aborts you must call + * InstrStopFinalize in a PG_TRY/PG_FINALLY block to avoid corrupting the + * instrumentation stack. + * + * In a query context use QueryInstrumentation instead, which handles aborts + * using the resource owner logic. + */ typedef struct Instrumentation { - /* Parameters set at node creation: */ + /* Parameters set at creation: */ bool need_timer; /* true if we need timer data */ - bool need_bufusage; /* true if we need buffer usage data */ - bool need_walusage; /* true if we need WAL usage data */ + bool need_stack; /* true if we need WAL/buffer usage data */ + /* Internal state keeping: */ + instr_time starttime; /* start time of last InstrStart */ + /* Accumulated statistics: */ + instr_time total; /* total runtime */ + BufferUsage bufusage; /* total buffer usage */ + WalUsage walusage; /* total WAL usage */ + /* Abort handling: link in parent QueryInstrumentation's unfinalized list */ + dlist_node unfinalized_entry; +} Instrumentation; + +/* + * Query-related instrumentation tracking. + * + * Usage: + * - Allocate on the heap using InstrQueryAlloc (required for abort handling) + * - Call InstrQueryStart before the activity you want to measure + * - Call InstrQueryStop / InstrQueryStopFinalize afterwards to capture totals + * + * InstrQueryStart/InstrQueryStop may be called multiple times. The last stop + * call must be to InstrQueryStopFinalize to ensure parent stack entries get + * the accumulated totals. + * + * Uses resource owner mechanism for handling aborts, as such, the caller + * *must* not exit out of the top level transaction after having called + * InstrQueryStart, without first calling InstrQueryStop or + * InstrQueryStopFinalize. In the case of a transaction abort, logic equivalent + * to InstrQueryStopFinalize will be called automatically. + */ +struct ResourceOwnerData; +typedef struct QueryInstrumentation +{ + Instrumentation instr; + + /* Original instrument_options flags used to create this instrumentation */ + int instrument_options; + + /* Resource owner used for cleanup for aborts between InstrStart/InstrStop */ + struct ResourceOwnerData *owner; + + /* + * Dedicated memory context for all instrumentation allocations belonging + * to this query (node instrumentation, trigger instrumentation, etc.). + * Initially a child of TopMemoryContext so it survives transaction abort + * for ResourceOwner cleanup, which is then reassigned to the current + * memory context on InstrQueryStopFinalize. + */ + MemoryContext instr_cxt; + + /* + * Child entries that need to be cleaned up on abort, since they are not + * registered as a resource owner themselves. Contains both node and + * trigger instrumentation entries linked via instr.unfinalized_entry. + */ + dlist_head unfinalized_entries; +} QueryInstrumentation; + +/* + * Specialized instrumentation for per-node execution statistics + * + * Relies on an outer QueryInstrumentation having been set up to handle the + * stack used for WAL/buffer usage statistics, and relies on it for managing + * aborts. Solely intended for the executor and anyone reporting about its + * activities (e.g. EXPLAIN ANALYZE). + */ +typedef struct NodeInstrumentation +{ + Instrumentation instr; + /* Parameters set at node creation: */ bool async_mode; /* true if node is in async mode */ /* Info about current plan cycle: */ bool running; /* true if we've completed first tuple */ - instr_time starttime; /* start time of current iteration of node */ instr_time counter; /* accumulated runtime for this node */ instr_time firsttuple; /* time for first tuple of this cycle */ double tuplecount; /* # of tuples emitted so far this cycle */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics across all completed cycles: */ instr_time startup; /* total startup time */ - instr_time total; /* total time */ double ntuples; /* total tuples produced */ double ntuples2; /* secondary node-specific tuple counter */ double nloops; /* # of run cycles for this node */ double nfiltered1; /* # of tuples removed by scanqual or joinqual */ double nfiltered2; /* # of tuples removed by "other" quals */ - BufferUsage bufusage; /* total buffer usage */ - WalUsage walusage; /* total WAL usage */ -} Instrumentation; +} NodeInstrumentation; -typedef struct WorkerInstrumentation +/* + * Care must be taken with any pointers contained within this struct, as this + * gets copied across processes during parallel query execution. + */ +typedef struct WorkerNodeInstrumentation { int num_workers; /* # of structures that follow */ - Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; -} WorkerInstrumentation; + NodeInstrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; +} WorkerNodeInstrumentation; + +typedef struct TriggerInstrumentation +{ + Instrumentation instr; + int firings; /* number of times the instrumented trigger + * was fired */ +} TriggerInstrumentation; + +/* + * Dynamic array-based stack for tracking current WAL/buffer usage context. + * + * When the stack is empty, 'current' points to instr_top which accumulates + * session-level totals. + */ +typedef struct InstrStackState +{ + int stack_space; /* allocated capacity of entries array */ + int stack_size; /* current number of entries */ + + Instrumentation **entries; /* dynamic array of pointers */ + Instrumentation *current; /* top of stack, or &instr_top when empty */ +} InstrStackState; -extern PGDLLIMPORT BufferUsage pgBufferUsage; extern PGDLLIMPORT WalUsage pgWalUsage; -extern Instrumentation *InstrAlloc(int n, int instrument_options, - bool async_mode); -extern void InstrInit(Instrumentation *instr, int instrument_options); -extern void InstrStartNode(Instrumentation *instr); -extern void InstrStopNode(Instrumentation *instr, double nTuples); -extern void InstrUpdateTupleCount(Instrumentation *instr, double nTuples); -extern void InstrEndLoop(Instrumentation *instr); -extern void InstrAggNode(Instrumentation *dst, Instrumentation *add); -extern void InstrStartParallelQuery(void); -extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage); -extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); -extern void BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, const BufferUsage *sub); +/* + * The top instrumentation represents a running total of the current backend + * WAL/buffer usage information. This will not be updated immediately, but + * rather when the current stack entry gets accumulated which typically happens + * at query end. + * + * Care must be taken when utilizing this in the parallel worker context: + * Parallel workers will report back their instrumentation to the caller, + * and this gets added to the caller's stack. If this were to be used in the + * shared memory stats infrastructure it would need to be skipped on parallel + * workers to avoid double counting. + */ +extern PGDLLIMPORT Instrumentation instr_top; + +/* + * The instrumentation stack state. The 'current' field points to the + * currently active stack entry that is getting updated as activity happens, + * and will be accumulated to parent stacks when it gets finalized by + * InstrStop (for non-executor use cases), ExecFinalizeNodeInstrumentation + * (executor finish) or ResOwnerReleaseInstrumentation on abort. + */ +extern PGDLLIMPORT InstrStackState instr_stack; + +extern void InstrStackGrow(void); + +/* + * Pushes the stack so that all WAL/buffer usage updates go to the passed in + * instrumentation entry. + * + * See note on InstrPopStack regarding safe use of these functions. + */ +static inline void +InstrPushStack(Instrumentation *instr) +{ + if (unlikely(instr_stack.stack_size == instr_stack.stack_space)) + InstrStackGrow(); + + instr_stack.entries[instr_stack.stack_size++] = instr; + instr_stack.current = instr; +} + +/* + * Pops the stack entry back to the previous one that was effective at + * InstrPushStack. + * + * Callers must ensure that no intermediate stack entries are skipped, to + * handle aborts correctly. If you're thinking of calling this in a PG_FINALLY + * block, consider instead using InstrStart + InstrStopFinalize which can skip + * intermediate stack entries. + */ +static inline void +InstrPopStack(Instrumentation *instr) +{ + Assert(instr_stack.stack_size > 0); + Assert(instr_stack.entries[instr_stack.stack_size - 1] == instr); + instr_stack.stack_size--; + instr_stack.current = instr_stack.stack_size > 0 + ? instr_stack.entries[instr_stack.stack_size - 1] + : &instr_top; +} + +extern void InstrInitOptions(Instrumentation *instr, int instrument_options); +extern void InstrStart(Instrumentation *instr); +extern void InstrStop(Instrumentation *instr); +extern void InstrStopFinalize(Instrumentation *instr); +extern void InstrFinalizeChild(Instrumentation *instr, Instrumentation *parent); +extern void InstrAccumStack(Instrumentation *dst, Instrumentation *add); + +extern QueryInstrumentation *InstrQueryAlloc(int instrument_options); +extern void InstrQueryStart(QueryInstrumentation *instr); +extern void InstrQueryStop(QueryInstrumentation *instr); +extern void InstrQueryStopFinalize(QueryInstrumentation *instr); +extern void InstrQueryRememberChild(QueryInstrumentation *parent, Instrumentation *instr); + +pg_nodiscard extern QueryInstrumentation *InstrStartParallelQuery(void); +extern void InstrEndParallelQuery(QueryInstrumentation *qinstr, Instrumentation *dst); +extern void InstrAccumParallelQuery(Instrumentation *instr); + +extern NodeInstrumentation *InstrAllocNode(QueryInstrumentation *qinstr, bool async_mode); +extern void InstrInitNode(NodeInstrumentation *instr, int instrument_options); +extern void InstrStartNode(NodeInstrumentation *instr); +extern void InstrStopNode(NodeInstrumentation *instr, double nTuples); +extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples); +extern void InstrEndLoop(NodeInstrumentation *instr); +extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add); + +typedef struct TupleTableSlot TupleTableSlot; +typedef struct PlanState PlanState; +typedef TupleTableSlot *(*ExecProcNodeMtd) (PlanState *pstate); +extern ExecProcNodeMtd InstrNodeSetupExecProcNode(NodeInstrumentation *instr); + +extern TriggerInstrumentation *InstrAllocTrigger(QueryInstrumentation *qinstr, int n); +extern void InstrStartTrigger(QueryInstrumentation *qinstr, + TriggerInstrumentation *tginstr); +extern void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings); + +extern void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); +extern void WalUsageAdd(WalUsage *dst, const WalUsage *add); extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub); +#define INSTR_BUFUSAGE_INCR(fld) do { \ + instr_stack.current->bufusage.fld++; \ + } while(0) +#define INSTR_BUFUSAGE_ADD(fld,val) do { \ + instr_stack.current->bufusage.fld += val; \ + } while(0) +#define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ + INSTR_TIME_ADD(instr_stack.current->bufusage.fld, val); \ + } while (0) +#define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ + INSTR_TIME_ACCUM_DIFF(instr_stack.current->bufusage.fld, endval, startval); \ + } while (0) + +#define INSTR_WALUSAGE_INCR(fld) do { \ + pgWalUsage.fld++; \ + instr_stack.current->walusage.fld++; \ + } while(0) +#define INSTR_WALUSAGE_ADD(fld,val) do { \ + pgWalUsage.fld += val; \ + instr_stack.current->walusage.fld += val; \ + } while(0) + #endif /* INSTRUMENT_H */ diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 8847d7f94fa57..e8531b84efa03 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -18,6 +18,8 @@ #ifndef INSTRUMENT_NODE_H #define INSTRUMENT_NODE_H +#include "executor/instrument.h" + /* --------------------- * Instrumentation information for aggregate function execution @@ -48,6 +50,9 @@ typedef struct IndexScanInstrumentation { /* Index search count (incremented with pgstat_count_index_scan call) */ uint64 nsearches; + + /* Instrumentation utilized for tracking buffer usage during table access */ + Instrumentation table_instr; } IndexScanInstrumentation; /* diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 684e398f82438..ef951643339e0 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -53,12 +53,15 @@ typedef struct Instrumentation Instrumentation; typedef struct pairingheap pairingheap; typedef struct PlanState PlanState; typedef struct QueryEnvironment QueryEnvironment; +typedef struct QueryInstrumentation QueryInstrumentation; typedef struct RelationData *Relation; typedef Relation *RelationPtr; typedef struct ScanKeyData ScanKeyData; typedef struct SnapshotData *Snapshot; typedef struct SortSupportData *SortSupport; typedef struct TIDBitmap TIDBitmap; +typedef struct NodeInstrumentation NodeInstrumentation; +typedef struct TriggerInstrumentation TriggerInstrumentation; typedef struct TupleConversionMap TupleConversionMap; typedef struct TupleDescData *TupleDesc; typedef struct Tuplesortstate Tuplesortstate; @@ -66,7 +69,7 @@ typedef struct Tuplestorestate Tuplestorestate; typedef struct TupleTableSlot TupleTableSlot; typedef struct TupleTableSlotOps TupleTableSlotOps; typedef struct WalUsage WalUsage; -typedef struct WorkerInstrumentation WorkerInstrumentation; +typedef struct WorkerNodeInstrumentation WorkerNodeInstrumentation; /* ---------------- @@ -533,7 +536,7 @@ typedef struct ResultRelInfo ExprState **ri_TrigWhenExprs; /* optional runtime measurements for triggers */ - Instrumentation *ri_TrigInstrument; + TriggerInstrumentation *ri_TrigInstrument; /* On-demand created slots for triggers / returning processing */ TupleTableSlot *ri_ReturningSlot; /* for trigger output tuples */ @@ -729,7 +732,7 @@ typedef struct EState * ExecutorRun() calls. */ int es_top_eflags; /* eflags passed to ExecutorStart */ - int es_instrument; /* OR of InstrumentOption flags */ + QueryInstrumentation *es_instrument; /* query-level instrumentation */ bool es_finished; /* true when ExecutorFinish is done */ List *es_exprcontexts; /* List of ExprContexts within EState */ @@ -1184,8 +1187,10 @@ typedef struct PlanState ExecProcNodeMtd ExecProcNodeReal; /* actual function, if above is a * wrapper */ - Instrumentation *instrument; /* Optional runtime stats for this node */ - WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */ + NodeInstrumentation *instrument; /* Optional runtime stats for this + * node */ + WorkerNodeInstrumentation *worker_instrument; /* per-worker + * instrumentation */ /* Per-worker JIT instrumentation */ struct SharedJitInstrumentation *worker_jit_instrument; @@ -1785,6 +1790,7 @@ typedef struct IndexOnlyScanState struct IndexScanDescData *ioss_ScanDesc; IndexScanInstrumentation *ioss_Instrument; SharedIndexScanInstrumentation *ioss_SharedInfo; + TupleTableSlot *ioss_TableSlot; Buffer ioss_VMBuffer; Size ioss_PscanLen; diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h index eb6033b4fdb65..5463bc921f06e 100644 --- a/src/include/utils/resowner.h +++ b/src/include/utils/resowner.h @@ -75,6 +75,7 @@ typedef uint32 ResourceReleasePriority; #define RELEASE_PRIO_SNAPSHOT_REFS 500 #define RELEASE_PRIO_FILES 600 #define RELEASE_PRIO_WAITEVENTSETS 700 +#define RELEASE_PRIO_INSTRUMENTATION 800 /* 0 is considered invalid */ #define RELEASE_PRIO_FIRST 1 diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out index 7c1f26b182cb0..e28e754369314 100644 --- a/src/test/regress/expected/explain.out +++ b/src/test/regress/expected/explain.out @@ -822,3 +822,200 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove (9 rows) reset work_mem; +-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL +-- EXCEPTION blocks, even after subtransaction rollback. +CREATE TEMP TABLE explain_exc_tab (a int, b char(20)); +INSERT INTO explain_exc_tab VALUES (0, 'zzz'); +CREATE FUNCTION explain_exc_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; +CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT explain_exc_func()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; +SELECT check_explain_exception_buffers() AS exception_buffers_visible; + exception_buffers_visible +--------------------------- + t +(1 row) + +-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation) +CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT check_explain_exception_buffers()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; +SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible; + exception_buffers_nested_visible +---------------------------------- + t +(1 row) + +DROP FUNCTION check_explain_exception_buffers_nested; +DROP FUNCTION check_explain_exception_buffers; +DROP FUNCTION explain_exc_func; +DROP TABLE explain_exc_tab; +-- Cursor instrumentation test. +-- Verify that buffer usage is correctly tracked through cursor execution paths. +-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun +-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down +-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor +-- scan should be correctly accumulated. +CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1; +CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$ +DECLARE + cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; +CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$ +DECLARE + cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; +CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$ +DECLARE + plan_json json; + node json; + direct_buf int; + noscroll_buf int; + scroll_buf int; +BEGIN + -- Direct scan: get leaf Seq Scan node buffers as baseline + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT * FROM cursor_buf_test' INTO plan_json; + node := plan_json->0->'Plan'; + WHILE node->'Plans' IS NOT NULL LOOP + node := node->'Plans'->0; + END LOOP; + direct_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_noscroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + noscroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Scrollable cursor path: ExecShutdownNode is skipped + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_scroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + scroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Both cursor paths should report buffer counts about as high as + -- the direct scan (same data plus minor catalog overhead), and not + -- double-counted (< 2x the direct scan) + RETURN QUERY SELECT + (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2), + (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2); +END; +$$ LANGUAGE plpgsql; +SELECT * FROM check_cursor_explain_buffers(); + noscroll_ok | scroll_ok +-------------+----------- + t | t +(1 row) + +DROP FUNCTION check_cursor_explain_buffers; +DROP FUNCTION cursor_noscroll_scan; +DROP FUNCTION cursor_scroll_scan; +DROP TABLE cursor_buf_test; +-- Parallel query buffer double-counting test. +-- +-- Compares serial Seq Scan buffers vs parallel Seq Scan buffers. +-- They scan the same table so the buffer count should be similar. +-- Double-counting would make the parallel count ~2x larger. +CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$ +DECLARE + plan_json json; + serial_buffers int; + parallel_buffers int; + node json; +BEGIN + -- Serial -- + SET LOCAL max_parallel_workers_per_gather = 0; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + serial_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + -- Parallel -- + SET LOCAL parallel_setup_cost = 0; + SET LOCAL parallel_tuple_cost = 0; + SET LOCAL min_parallel_table_scan_size = 0; + SET LOCAL max_parallel_workers_per_gather = 2; + SET LOCAL parallel_leader_participation = off; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + parallel_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1)); +END; +$$ LANGUAGE plpgsql; +SELECT * FROM check_parallel_explain_buffers(); + ratio +------- + 1 +(1 row) + +DROP FUNCTION check_parallel_explain_buffers; diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql index ebdab42604beb..cf5c6335a1922 100644 --- a/src/test/regress/sql/explain.sql +++ b/src/test/regress/sql/explain.sql @@ -188,3 +188,197 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove -- Test tuplestore storage usage in Window aggregate (memory and disk case, final result is disk) select explain_filter('explain (analyze,buffers off,costs off) select sum(n) over(partition by m) from (SELECT n < 3 as m, n from generate_series(1,2500) a(n))'); reset work_mem; + +-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL +-- EXCEPTION blocks, even after subtransaction rollback. +CREATE TEMP TABLE explain_exc_tab (a int, b char(20)); +INSERT INTO explain_exc_tab VALUES (0, 'zzz'); + +CREATE FUNCTION explain_exc_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; + +CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT explain_exc_func()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; + +SELECT check_explain_exception_buffers() AS exception_buffers_visible; + +-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation) +CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT check_explain_exception_buffers()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; + +SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible; + +DROP FUNCTION check_explain_exception_buffers_nested; +DROP FUNCTION check_explain_exception_buffers; +DROP FUNCTION explain_exc_func; +DROP TABLE explain_exc_tab; + +-- Cursor instrumentation test. +-- Verify that buffer usage is correctly tracked through cursor execution paths. +-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun +-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down +-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor +-- scan should be correctly accumulated. + +CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1; + +CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$ +DECLARE + cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; + +CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$ +DECLARE + cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; + +CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$ +DECLARE + plan_json json; + node json; + direct_buf int; + noscroll_buf int; + scroll_buf int; +BEGIN + -- Direct scan: get leaf Seq Scan node buffers as baseline + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT * FROM cursor_buf_test' INTO plan_json; + node := plan_json->0->'Plan'; + WHILE node->'Plans' IS NOT NULL LOOP + node := node->'Plans'->0; + END LOOP; + direct_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_noscroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + noscroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Scrollable cursor path: ExecShutdownNode is skipped + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_scroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + scroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Both cursor paths should report buffer counts about as high as + -- the direct scan (same data plus minor catalog overhead), and not + -- double-counted (< 2x the direct scan) + RETURN QUERY SELECT + (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2), + (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2); +END; +$$ LANGUAGE plpgsql; + +SELECT * FROM check_cursor_explain_buffers(); + +DROP FUNCTION check_cursor_explain_buffers; +DROP FUNCTION cursor_noscroll_scan; +DROP FUNCTION cursor_scroll_scan; +DROP TABLE cursor_buf_test; + +-- Parallel query buffer double-counting test. +-- +-- Compares serial Seq Scan buffers vs parallel Seq Scan buffers. +-- They scan the same table so the buffer count should be similar. +-- Double-counting would make the parallel count ~2x larger. +CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$ +DECLARE + plan_json json; + serial_buffers int; + parallel_buffers int; + node json; +BEGIN + -- Serial -- + SET LOCAL max_parallel_workers_per_gather = 0; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + serial_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + -- Parallel -- + SET LOCAL parallel_setup_cost = 0; + SET LOCAL parallel_tuple_cost = 0; + SET LOCAL min_parallel_table_scan_size = 0; + SET LOCAL max_parallel_workers_per_gather = 2; + SET LOCAL parallel_leader_participation = off; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + parallel_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1)); +END; +$$ LANGUAGE plpgsql; + +SELECT * FROM check_parallel_explain_buffers(); + +DROP FUNCTION check_parallel_explain_buffers; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 0c07c945f05cd..cae1e2a8857d9 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1341,6 +1341,7 @@ InjectionPointSharedState InjectionPointsCtl InlineCodeBlock InsertStmt +InstrStackState Instrumentation Int128AggState Int8TransTypeData @@ -1807,6 +1808,7 @@ NextSampleBlock_function NextSampleTuple_function NextValueExpr Node +NodeInstrumentation NodeTag NonEmptyRange NoneCompressorState @@ -2457,6 +2459,7 @@ QueryCompletion QueryDesc QueryEnvironment QueryInfo +QueryInstrumentation QueryItem QueryItemType QueryMode @@ -3191,6 +3194,7 @@ TriggerDesc TriggerEvent TriggerFlags TriggerInfo +TriggerInstrumentation TriggerTransition TruncateStmt TsmRoutine @@ -3412,9 +3416,9 @@ WorkTableScan WorkTableScanState WorkerInfo WorkerInfoData -WorkerInstrumentation WorkerJobDumpPtrType WorkerJobRestorePtrType +WorkerNodeInstrumentation Working_State WriteBufPtrType WriteBytePtrType