diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c
index e856cd35a6f0f..4be81489ff4fb 100644
--- a/contrib/auto_explain/auto_explain.c
+++ b/contrib/auto_explain/auto_explain.c
@@ -305,19 +305,9 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
if (auto_explain_enabled())
{
- /*
- * Set up to track total elapsed time in ExecutorRun. Make sure the
- * space is allocated in the per-query context so it will go away at
- * ExecutorEnd.
- */
+ /* Set up to track total elapsed time in ExecutorRun. */
if (queryDesc->totaltime == NULL)
- {
- MemoryContext oldcxt;
-
- oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
- queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
- MemoryContextSwitchTo(oldcxt);
- }
+ queryDesc->totaltime = InstrQueryAlloc(INSTRUMENT_ALL);
}
}
@@ -381,14 +371,8 @@ explain_ExecutorEnd(QueryDesc *queryDesc)
*/
oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
- /*
- * Make sure stats accumulation is done. (Note: it's okay if several
- * levels of hook all do this.)
- */
- InstrEndLoop(queryDesc->totaltime);
-
/* Log plan if duration is exceeded. */
- msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total);
+ msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->instr.total);
if (msec >= auto_explain_log_min_duration)
{
ExplainState *es = NewExplainState();
diff --git a/contrib/meson.build b/contrib/meson.build
index 5a752eac34711..2b1399e56f320 100644
--- a/contrib/meson.build
+++ b/contrib/meson.build
@@ -51,6 +51,7 @@ subdir('pg_overexplain')
subdir('pg_plan_advice')
subdir('pg_prewarm')
subdir('pgrowlocks')
+subdir('pg_session_buffer_usage')
subdir('pg_stat_statements')
subdir('pgstattuple')
subdir('pg_surgery')
diff --git a/contrib/pg_session_buffer_usage/Makefile b/contrib/pg_session_buffer_usage/Makefile
new file mode 100644
index 0000000000000..75bd8e09b3d3d
--- /dev/null
+++ b/contrib/pg_session_buffer_usage/Makefile
@@ -0,0 +1,23 @@
+# contrib/pg_session_buffer_usage/Makefile
+
+MODULE_big = pg_session_buffer_usage
+OBJS = \
+ $(WIN32RES) \
+ pg_session_buffer_usage.o
+
+EXTENSION = pg_session_buffer_usage
+DATA = pg_session_buffer_usage--1.0.sql
+PGFILEDESC = "pg_session_buffer_usage - show buffer usage statistics for the current session"
+
+REGRESS = pg_session_buffer_usage
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/pg_session_buffer_usage
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out b/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out
new file mode 100644
index 0000000000000..242b400395043
--- /dev/null
+++ b/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out
@@ -0,0 +1,283 @@
+LOAD 'pg_session_buffer_usage';
+CREATE EXTENSION pg_session_buffer_usage;
+-- Verify all columns are non-negative
+SELECT count(*) = 1 AS ok FROM pg_session_buffer_usage()
+WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0
+ AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0
+ AND local_blks_hit >= 0 AND local_blks_read >= 0
+ AND local_blks_dirtied >= 0 AND local_blks_written >= 0
+ AND temp_blks_read >= 0 AND temp_blks_written >= 0
+ AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0
+ AND local_blk_read_time >= 0 AND local_blk_write_time >= 0
+ AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0;
+ ok
+----
+ t
+(1 row)
+
+-- Verify counters increase after buffer activity
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+CREATE TEMP TABLE test_buf_activity (id int, data text);
+INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i;
+SELECT count(*) FROM test_buf_activity;
+ count
+-------
+ 1000
+(1 row)
+
+SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased
+FROM pg_session_buffer_usage();
+ blocks_increased
+------------------
+ t
+(1 row)
+
+DROP TABLE test_buf_activity;
+-- Parallel query test
+CREATE TABLE par_dc_tab (a int, b char(200));
+INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+SELECT count(*) FROM par_dc_tab;
+ count
+-------
+ 5000
+(1 row)
+
+-- Measure serial scan delta (leader does all the work)
+SET max_parallel_workers_per_gather = 0;
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+SELECT count(*) FROM par_dc_tab;
+ count
+-------
+ 5000
+(1 row)
+
+CREATE TEMP TABLE dc_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage();
+-- Measure parallel scan delta with leader NOT participating in scanning.
+-- Workers do all table scanning; leader only runs the Gather node.
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+SELECT count(*) FROM par_dc_tab;
+ count
+-------
+ 5000
+(1 row)
+
+-- Confirm we got a similar hit counter through parallel worker accumulation
+SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2
+ AS leader_buffers_match
+FROM pg_session_buffer_usage(), dc_serial_result s;
+ leader_buffers_match
+----------------------
+ t
+(1 row)
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+DROP TABLE par_dc_tab, dc_serial_result;
+--
+-- Abort/exception tests: verify buffer usage survives various error paths.
+--
+-- Rolled-back divide-by-zero under EXPLAIN ANALYZE
+CREATE TEMP TABLE exc_tab (a int, b char(20));
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 FROM ins;
+ERROR: division by zero
+SELECT local_blks_dirtied > 0 AS exception_buffers_visible
+FROM pg_session_buffer_usage();
+ exception_buffers_visible
+---------------------------
+ t
+(1 row)
+
+DROP TABLE exc_tab;
+-- Unique constraint violation in regular query
+CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20));
+INSERT INTO unique_tab VALUES (1, 'first');
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+INSERT INTO unique_tab VALUES (1, 'duplicate');
+ERROR: duplicate key value violates unique constraint "unique_tab_a_key"
+DETAIL: Key (a)=(1) already exists.
+SELECT local_blks_hit > 0 AS unique_violation_buffers_visible
+FROM pg_session_buffer_usage();
+ unique_violation_buffers_visible
+----------------------------------
+ t
+(1 row)
+
+DROP TABLE unique_tab;
+-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION)
+CREATE TEMP TABLE subxact_tab (a int, b char(20));
+CREATE FUNCTION subxact_exc_func() RETURNS text AS $$
+BEGIN
+ BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a)
+ SELECT a / 0 FROM ins';
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+SELECT subxact_exc_func();
+ subxact_exc_func
+------------------
+ caught
+(1 row)
+
+SELECT local_blks_dirtied > 0 AS subxact_buffers_visible
+FROM pg_session_buffer_usage();
+ subxact_buffers_visible
+-------------------------
+ t
+(1 row)
+
+DROP FUNCTION subxact_exc_func;
+DROP TABLE subxact_tab;
+-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking
+CREATE TEMP TABLE cursor_tab (a int, b char(200));
+INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i;
+CREATE FUNCTION cursor_exc_func() RETURNS text AS $$
+DECLARE
+ rec record;
+ cnt int := 0;
+BEGIN
+ BEGIN
+ FOR rec IN SELECT * FROM cursor_tab LOOP
+ cnt := cnt + 1;
+ IF cnt = 250 THEN
+ PERFORM 1 / 0;
+ END IF;
+ END LOOP;
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught after ' || cnt || ' rows';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+SELECT cursor_exc_func();
+ cursor_exc_func
+-----------------------
+ caught after 250 rows
+(1 row)
+
+SELECT local_blks_hit + local_blks_read > 0
+ AS cursor_subxact_buffers_visible
+FROM pg_session_buffer_usage();
+ cursor_subxact_buffers_visible
+--------------------------------
+ t
+(1 row)
+
+DROP FUNCTION cursor_exc_func;
+DROP TABLE cursor_tab;
+-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort.
+--
+-- When a parallel worker aborts, InstrEndParallelQuery and
+-- ExecParallelReportInstrumentation never run, so the worker's buffer
+-- activity is never written to shared memory, despite the information having been
+-- captured by the res owner release instrumentation handling.
+CREATE TABLE par_abort_tab (a int, b char(200));
+INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+-- Warm shared buffers so all reads become hits
+SELECT count(*) FROM par_abort_tab;
+ count
+-------
+ 5000
+(1 row)
+
+-- Measure serial scan delta as a reference (leader reads all blocks)
+SET max_parallel_workers_per_gather = 0;
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+CREATE TABLE par_abort_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage();
+-- Now force parallel with leader NOT participating in scanning
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+CONTEXT: parallel worker
+RESET debug_parallel_query;
+-- Workers scanned the table but aborted before reporting stats back.
+-- The leader's delta should be much less than a serial scan, documenting
+-- that worker buffer activity is lost on abort.
+SELECT shared_blks_hit < s.serial_delta / 2
+ AS worker_abort_buffers_not_propagated
+FROM pg_session_buffer_usage(), par_abort_serial_result s;
+ worker_abort_buffers_not_propagated
+-------------------------------------
+ t
+(1 row)
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+DROP TABLE par_abort_tab, par_abort_serial_result;
+-- Cleanup
+DROP EXTENSION pg_session_buffer_usage;
diff --git a/contrib/pg_session_buffer_usage/meson.build b/contrib/pg_session_buffer_usage/meson.build
new file mode 100644
index 0000000000000..34c7502beb463
--- /dev/null
+++ b/contrib/pg_session_buffer_usage/meson.build
@@ -0,0 +1,34 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+pg_session_buffer_usage_sources = files(
+ 'pg_session_buffer_usage.c',
+)
+
+if host_system == 'windows'
+ pg_session_buffer_usage_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+ '--NAME', 'pg_session_buffer_usage',
+ '--FILEDESC', 'pg_session_buffer_usage - show buffer usage statistics for the current session',])
+endif
+
+pg_session_buffer_usage = shared_module('pg_session_buffer_usage',
+ pg_session_buffer_usage_sources,
+ kwargs: contrib_mod_args,
+)
+contrib_targets += pg_session_buffer_usage
+
+install_data(
+ 'pg_session_buffer_usage--1.0.sql',
+ 'pg_session_buffer_usage.control',
+ kwargs: contrib_data_args,
+)
+
+tests += {
+ 'name': 'pg_session_buffer_usage',
+ 'sd': meson.current_source_dir(),
+ 'bd': meson.current_build_dir(),
+ 'regress': {
+ 'sql': [
+ 'pg_session_buffer_usage',
+ ],
+ },
+}
diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql b/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql
new file mode 100644
index 0000000000000..b300fdbc64397
--- /dev/null
+++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql
@@ -0,0 +1,31 @@
+/* contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION pg_session_buffer_usage" to load this file. \quit
+
+CREATE FUNCTION pg_session_buffer_usage(
+ OUT shared_blks_hit bigint,
+ OUT shared_blks_read bigint,
+ OUT shared_blks_dirtied bigint,
+ OUT shared_blks_written bigint,
+ OUT local_blks_hit bigint,
+ OUT local_blks_read bigint,
+ OUT local_blks_dirtied bigint,
+ OUT local_blks_written bigint,
+ OUT temp_blks_read bigint,
+ OUT temp_blks_written bigint,
+ OUT shared_blk_read_time double precision,
+ OUT shared_blk_write_time double precision,
+ OUT local_blk_read_time double precision,
+ OUT local_blk_write_time double precision,
+ OUT temp_blk_read_time double precision,
+ OUT temp_blk_write_time double precision
+)
+RETURNS record
+AS 'MODULE_PATHNAME', 'pg_session_buffer_usage'
+LANGUAGE C PARALLEL RESTRICTED;
+
+CREATE FUNCTION pg_session_buffer_usage_reset()
+RETURNS void
+AS 'MODULE_PATHNAME', 'pg_session_buffer_usage_reset'
+LANGUAGE C PARALLEL RESTRICTED;
diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c
new file mode 100644
index 0000000000000..f869956b3a9b0
--- /dev/null
+++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c
@@ -0,0 +1,95 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_session_buffer_usage.c
+ * show buffer usage statistics for the current session
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ * contrib/pg_session_buffer_usage/pg_session_buffer_usage.c
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "executor/instrument.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
+
+PG_MODULE_MAGIC_EXT(
+ .name = "pg_session_buffer_usage",
+ .version = PG_VERSION
+);
+
+#define NUM_BUFFER_USAGE_COLUMNS 16
+
+PG_FUNCTION_INFO_V1(pg_session_buffer_usage);
+PG_FUNCTION_INFO_V1(pg_session_buffer_usage_reset);
+
+#define HAVE_INSTR_STACK 1 /* Set to 0 to build/test against a server
+ * without the instrumentation-stack change */
+
+/*
+ * SQL function: pg_session_buffer_usage()
+ *
+ * Returns a single row with all BufferUsage counters accumulated since the
+ * start of the session. Usage accrued by the currently-executing statement
+ * that has not yet been propagated to the top of the instrumentation stack
+ * (e.g. buffer activity of the very statement invoking this function) is
+ */
+Datum
+pg_session_buffer_usage(PG_FUNCTION_ARGS)
+{
+ TupleDesc tupdesc;
+ Datum values[NUM_BUFFER_USAGE_COLUMNS];
+ bool nulls[NUM_BUFFER_USAGE_COLUMNS];
+ BufferUsage *usage;
+
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ memset(nulls, 0, sizeof(nulls));
+
+#if HAVE_INSTR_STACK
+ usage = &instr_top.bufusage;
+#else
+ usage = &pgBufferUsage;
+#endif
+
+ values[0] = Int64GetDatum(usage->shared_blks_hit);
+ values[1] = Int64GetDatum(usage->shared_blks_read);
+ values[2] = Int64GetDatum(usage->shared_blks_dirtied);
+ values[3] = Int64GetDatum(usage->shared_blks_written);
+ values[4] = Int64GetDatum(usage->local_blks_hit);
+ values[5] = Int64GetDatum(usage->local_blks_read);
+ values[6] = Int64GetDatum(usage->local_blks_dirtied);
+ values[7] = Int64GetDatum(usage->local_blks_written);
+ values[8] = Int64GetDatum(usage->temp_blks_read);
+ values[9] = Int64GetDatum(usage->temp_blks_written);
+ values[10] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time));
+ values[11] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time));
+ values[12] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time));
+ values[13] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time));
+ values[14] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time));
+ values[15] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time));
+
+ PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
+}
+
+/*
+ * SQL function: pg_session_buffer_usage_reset()
+ *
+ * Resets all BufferUsage counters on the top instrumentation stack to zero.
+ * Useful in tests to avoid the baseline/delta pattern.
+ */
+Datum
+pg_session_buffer_usage_reset(PG_FUNCTION_ARGS)
+{
+#if HAVE_INSTR_STACK
+ memset(&instr_top.bufusage, 0, sizeof(BufferUsage));
+#else
+ memset(&pgBufferUsage, 0, sizeof(BufferUsage));
+#endif
+
+ PG_RETURN_VOID();
+}
diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control
new file mode 100644
index 0000000000000..fabd05ee02458
--- /dev/null
+++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control
@@ -0,0 +1,5 @@
+# pg_session_buffer_usage extension
+comment = 'show buffer usage statistics for the current session'
+default_version = '1.0'
+module_pathname = '$libdir/pg_session_buffer_usage'
+relocatable = true
diff --git a/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql b/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql
new file mode 100644
index 0000000000000..8f5810fadd337
--- /dev/null
+++ b/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql
@@ -0,0 +1,204 @@
+LOAD 'pg_session_buffer_usage';
+CREATE EXTENSION pg_session_buffer_usage;
+
+-- Verify all columns are non-negative
+SELECT count(*) = 1 AS ok FROM pg_session_buffer_usage()
+WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0
+ AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0
+ AND local_blks_hit >= 0 AND local_blks_read >= 0
+ AND local_blks_dirtied >= 0 AND local_blks_written >= 0
+ AND temp_blks_read >= 0 AND temp_blks_written >= 0
+ AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0
+ AND local_blk_read_time >= 0 AND local_blk_write_time >= 0
+ AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0;
+
+-- Verify counters increase after buffer activity
+SELECT pg_session_buffer_usage_reset();
+
+CREATE TEMP TABLE test_buf_activity (id int, data text);
+INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i;
+SELECT count(*) FROM test_buf_activity;
+
+SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased
+FROM pg_session_buffer_usage();
+
+DROP TABLE test_buf_activity;
+
+-- Parallel query test
+CREATE TABLE par_dc_tab (a int, b char(200));
+INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+
+SELECT count(*) FROM par_dc_tab;
+
+-- Measure serial scan delta (leader does all the work)
+SET max_parallel_workers_per_gather = 0;
+
+SELECT pg_session_buffer_usage_reset();
+SELECT count(*) FROM par_dc_tab;
+
+CREATE TEMP TABLE dc_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage();
+
+-- Measure parallel scan delta with leader NOT participating in scanning.
+-- Workers do all table scanning; leader only runs the Gather node.
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+
+SELECT pg_session_buffer_usage_reset();
+SELECT count(*) FROM par_dc_tab;
+
+-- Confirm we got a similar hit counter through parallel worker accumulation
+SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2
+ AS leader_buffers_match
+FROM pg_session_buffer_usage(), dc_serial_result s;
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+
+DROP TABLE par_dc_tab, dc_serial_result;
+
+--
+-- Abort/exception tests: verify buffer usage survives various error paths.
+--
+
+-- Rolled-back divide-by-zero under EXPLAIN ANALYZE
+CREATE TEMP TABLE exc_tab (a int, b char(20));
+
+SELECT pg_session_buffer_usage_reset();
+
+EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 FROM ins;
+
+SELECT local_blks_dirtied > 0 AS exception_buffers_visible
+FROM pg_session_buffer_usage();
+
+DROP TABLE exc_tab;
+
+-- Unique constraint violation in regular query
+CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20));
+INSERT INTO unique_tab VALUES (1, 'first');
+
+SELECT pg_session_buffer_usage_reset();
+INSERT INTO unique_tab VALUES (1, 'duplicate');
+
+SELECT local_blks_hit > 0 AS unique_violation_buffers_visible
+FROM pg_session_buffer_usage();
+
+DROP TABLE unique_tab;
+
+-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION)
+CREATE TEMP TABLE subxact_tab (a int, b char(20));
+
+CREATE FUNCTION subxact_exc_func() RETURNS text AS $$
+BEGIN
+ BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a)
+ SELECT a / 0 FROM ins';
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_session_buffer_usage_reset();
+SELECT subxact_exc_func();
+
+SELECT local_blks_dirtied > 0 AS subxact_buffers_visible
+FROM pg_session_buffer_usage();
+
+DROP FUNCTION subxact_exc_func;
+DROP TABLE subxact_tab;
+
+-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking
+CREATE TEMP TABLE cursor_tab (a int, b char(200));
+INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i;
+
+CREATE FUNCTION cursor_exc_func() RETURNS text AS $$
+DECLARE
+ rec record;
+ cnt int := 0;
+BEGIN
+ BEGIN
+ FOR rec IN SELECT * FROM cursor_tab LOOP
+ cnt := cnt + 1;
+ IF cnt = 250 THEN
+ PERFORM 1 / 0;
+ END IF;
+ END LOOP;
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught after ' || cnt || ' rows';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_session_buffer_usage_reset();
+SELECT cursor_exc_func();
+
+SELECT local_blks_hit + local_blks_read > 0
+ AS cursor_subxact_buffers_visible
+FROM pg_session_buffer_usage();
+
+DROP FUNCTION cursor_exc_func;
+DROP TABLE cursor_tab;
+
+-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort.
+--
+-- When a parallel worker aborts, InstrEndParallelQuery and
+-- ExecParallelReportInstrumentation never run, so the worker's buffer
+-- activity is never written to shared memory, despite the information having been
+-- captured by the res owner release instrumentation handling.
+CREATE TABLE par_abort_tab (a int, b char(200));
+INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+
+-- Warm shared buffers so all reads become hits
+SELECT count(*) FROM par_abort_tab;
+
+-- Measure serial scan delta as a reference (leader reads all blocks)
+SET max_parallel_workers_per_gather = 0;
+
+SELECT pg_session_buffer_usage_reset();
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+
+CREATE TABLE par_abort_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage();
+
+-- Now force parallel with leader NOT participating in scanning
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently
+
+SELECT pg_session_buffer_usage_reset();
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+
+RESET debug_parallel_query;
+
+-- Workers scanned the table but aborted before reporting stats back.
+-- The leader's delta should be much less than a serial scan, documenting
+-- that worker buffer activity is lost on abort.
+SELECT shared_blks_hit < s.serial_delta / 2
+ AS worker_abort_buffers_not_propagated
+FROM pg_session_buffer_usage(), par_abort_serial_result s;
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+
+DROP TABLE par_abort_tab, par_abort_serial_result;
+
+-- Cleanup
+DROP EXTENSION pg_session_buffer_usage;
diff --git a/contrib/pg_stat_statements/expected/utility.out b/contrib/pg_stat_statements/expected/utility.out
index e4d6564ea5b5a..cba487f6be582 100644
--- a/contrib/pg_stat_statements/expected/utility.out
+++ b/contrib/pg_stat_statements/expected/utility.out
@@ -289,6 +289,76 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
1 | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t
(3 rows)
+-- Buffer stats should flow through EXPLAIN ANALYZE
+CREATE TEMP TABLE flow_through_test (a int, b char(200));
+INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$
+DECLARE
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test';
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
+SELECT run_explain_buffers_test();
+ run_explain_buffers_test
+--------------------------
+
+(1 row)
+
+-- EXPLAIN entries should have non-zero buffer stats
+SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats
+FROM pg_stat_statements
+WHERE query LIKE 'SELECT run_explain_buffers_test%'
+ORDER BY query COLLATE "C";
+ query | has_buffer_stats
+-----------------------------------+------------------
+ SELECT run_explain_buffers_test() | t
+(1 row)
+
+DROP FUNCTION run_explain_buffers_test;
+DROP TABLE flow_through_test;
+-- Validate buffer/WAL counting during abort
+SET pg_stat_statements.track = 'all';
+CREATE TEMP TABLE pgss_call_tab (a int, b char(20));
+CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20));
+INSERT INTO pgss_call_tab VALUES (0, 'zzz');
+CREATE PROCEDURE pgss_call_rollback_proc() AS $$
+DECLARE
+ v int;
+BEGIN
+ EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
+CALL pgss_call_rollback_proc();
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_call_rollback_proc%'
+ORDER BY query COLLATE "C";
+ query | calls | local_hitread | wal_bytes_generated | wal_records_generated
+--------------------------------+-------+---------------+---------------------+-----------------------
+ CALL pgss_call_rollback_proc() | 1 | t | t | t
+(1 row)
+
+DROP TABLE pgss_call_tab2;
+DROP TABLE pgss_call_tab;
+DROP PROCEDURE pgss_call_rollback_proc;
+SET pg_stat_statements.track = 'top';
-- CALL
CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$
DECLARE
diff --git a/contrib/pg_stat_statements/expected/wal.out b/contrib/pg_stat_statements/expected/wal.out
index 977e382d84894..611213daef6c2 100644
--- a/contrib/pg_stat_statements/expected/wal.out
+++ b/contrib/pg_stat_statements/expected/wal.out
@@ -28,3 +28,51 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t;
t
(1 row)
+--
+-- Validate buffer/WAL counting with caught exception in PL/pgSQL
+--
+CREATE TEMP TABLE pgss_error_tab (a int, b char(20));
+INSERT INTO pgss_error_tab VALUES (0, 'zzz');
+CREATE FUNCTION pgss_error_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
+SELECT pgss_error_func();
+ pgss_error_func
+-----------------
+
+(1 row)
+
+-- Buffer/WAL usage from the wCTE INSERT should survive the exception
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_error_func%'
+ORDER BY query COLLATE "C";
+ query | calls | local_hitread | wal_bytes_generated | wal_records_generated
+--------------------------+-------+---------------+---------------------+-----------------------
+ SELECT pgss_error_func() | 1 | t | t | t
+(1 row)
+
+DROP TABLE pgss_error_tab;
+DROP FUNCTION pgss_error_func;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 4a427533bd88d..8448f9c13faaa 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -909,22 +909,11 @@ pgss_planner(Query *parse,
&& pgss_track_planning && query_string
&& parse->queryId != INT64CONST(0))
{
- instr_time start;
- instr_time duration;
- BufferUsage bufusage_start,
- bufusage;
- WalUsage walusage_start,
- walusage;
+ Instrumentation instr = {0};
- /* We need to track buffer usage as the planner can access them. */
- bufusage_start = pgBufferUsage;
-
- /*
- * Similarly the planner could write some WAL records in some cases
- * (e.g. setting a hint bit with those being WAL-logged)
- */
- walusage_start = pgWalUsage;
- INSTR_TIME_SET_CURRENT(start);
+ /* Track time and buffer/WAL usage as the planner can access them. */
+ InstrInitOptions(&instr, INSTRUMENT_ALL);
+ InstrStart(&instr);
nesting_level++;
PG_TRY();
@@ -938,30 +927,20 @@ pgss_planner(Query *parse,
}
PG_FINALLY();
{
+ InstrStopFinalize(&instr);
nesting_level--;
}
PG_END_TRY();
- INSTR_TIME_SET_CURRENT(duration);
- INSTR_TIME_SUBTRACT(duration, start);
-
- /* calc differences of buffer counters. */
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
-
- /* calc differences of WAL counters. */
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
-
pgss_store(query_string,
parse->queryId,
parse->stmt_location,
parse->stmt_len,
PGSS_PLAN,
- INSTR_TIME_GET_MILLISEC(duration),
+ INSTR_TIME_GET_MILLISEC(instr.total),
0,
- &bufusage,
- &walusage,
+ &instr.bufusage,
+ &instr.walusage,
NULL,
NULL,
0,
@@ -1013,19 +992,9 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
*/
if (pgss_enabled(nesting_level) && queryDesc->plannedstmt->queryId != INT64CONST(0))
{
- /*
- * Set up to track total elapsed time in ExecutorRun. Make sure the
- * space is allocated in the per-query context so it will go away at
- * ExecutorEnd.
- */
+ /* Set up to track total elapsed time in ExecutorRun. */
if (queryDesc->totaltime == NULL)
- {
- MemoryContext oldcxt;
-
- oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
- queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
- MemoryContextSwitchTo(oldcxt);
- }
+ queryDesc->totaltime = InstrQueryAlloc(INSTRUMENT_ALL);
}
}
@@ -1082,21 +1051,15 @@ pgss_ExecutorEnd(QueryDesc *queryDesc)
if (queryId != INT64CONST(0) && queryDesc->totaltime &&
pgss_enabled(nesting_level))
{
- /*
- * Make sure stats accumulation is done. (Note: it's okay if several
- * levels of hook all do this.)
- */
- InstrEndLoop(queryDesc->totaltime);
-
pgss_store(queryDesc->sourceText,
queryId,
queryDesc->plannedstmt->stmt_location,
queryDesc->plannedstmt->stmt_len,
PGSS_EXEC,
- INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total),
+ INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->instr.total),
queryDesc->estate->es_total_processed,
- &queryDesc->totaltime->bufusage,
- &queryDesc->totaltime->walusage,
+ &queryDesc->totaltime->instr.bufusage,
+ &queryDesc->totaltime->instr.walusage,
queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL,
NULL,
queryDesc->estate->es_parallel_workers_to_launch,
@@ -1160,17 +1123,11 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
!IsA(parsetree, ExecuteStmt) &&
!IsA(parsetree, PrepareStmt))
{
- instr_time start;
- instr_time duration;
uint64 rows;
- BufferUsage bufusage_start,
- bufusage;
- WalUsage walusage_start,
- walusage;
+ Instrumentation instr = {0};
- bufusage_start = pgBufferUsage;
- walusage_start = pgWalUsage;
- INSTR_TIME_SET_CURRENT(start);
+ InstrInitOptions(&instr, INSTRUMENT_ALL);
+ InstrStart(&instr);
nesting_level++;
PG_TRY();
@@ -1186,6 +1143,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
}
PG_FINALLY();
{
+ InstrStopFinalize(&instr);
nesting_level--;
}
PG_END_TRY();
@@ -1200,9 +1158,6 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
* former value, which'd otherwise be a good idea.
*/
- INSTR_TIME_SET_CURRENT(duration);
- INSTR_TIME_SUBTRACT(duration, start);
-
/*
* Track the total number of rows retrieved or affected by the utility
* statements of COPY, FETCH, CREATE TABLE AS, CREATE MATERIALIZED
@@ -1214,23 +1169,15 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ?
qc->nprocessed : 0;
- /* calc differences of buffer counters. */
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
-
- /* calc differences of WAL counters. */
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
-
pgss_store(queryString,
saved_queryId,
saved_stmt_location,
saved_stmt_len,
PGSS_EXEC,
- INSTR_TIME_GET_MILLISEC(duration),
+ INSTR_TIME_GET_MILLISEC(instr.total),
rows,
- &bufusage,
- &walusage,
+ &instr.bufusage,
+ &instr.walusage,
NULL,
NULL,
0,
diff --git a/contrib/pg_stat_statements/sql/utility.sql b/contrib/pg_stat_statements/sql/utility.sql
index dd97203c21025..7540e49c73caf 100644
--- a/contrib/pg_stat_statements/sql/utility.sql
+++ b/contrib/pg_stat_statements/sql/utility.sql
@@ -152,6 +152,62 @@ EXPLAIN (costs off) SELECT a FROM generate_series(1,10) AS tab(a) WHERE a = 7;
SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
+-- Buffer stats should flow through EXPLAIN ANALYZE
+CREATE TEMP TABLE flow_through_test (a int, b char(200));
+INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+
+CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$
+DECLARE
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test';
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+
+SELECT run_explain_buffers_test();
+
+-- The calling query's entry should include buffer stats from the nested EXPLAIN ANALYZE
+SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats
+FROM pg_stat_statements
+WHERE query LIKE 'SELECT run_explain_buffers_test%'
+ORDER BY query COLLATE "C";
+
+DROP FUNCTION run_explain_buffers_test;
+DROP TABLE flow_through_test;
+
+-- Validate buffer/WAL counting when an error is caught by an exception block
+SET pg_stat_statements.track = 'all';
+CREATE TEMP TABLE pgss_call_tab (a int, b char(20));
+CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20));
+INSERT INTO pgss_call_tab VALUES (0, 'zzz');
+
+CREATE PROCEDURE pgss_call_rollback_proc() AS $$
+DECLARE
+ v int;
+BEGIN
+ EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+CALL pgss_call_rollback_proc();
+
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_call_rollback_proc%'
+ORDER BY query COLLATE "C";
+
+DROP TABLE pgss_call_tab2;
+DROP TABLE pgss_call_tab;
+DROP PROCEDURE pgss_call_rollback_proc;
+SET pg_stat_statements.track = 'top';
+
-- CALL
CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$
DECLARE
diff --git a/contrib/pg_stat_statements/sql/wal.sql b/contrib/pg_stat_statements/sql/wal.sql
index 1dc1552a81ebc..467e321b2062e 100644
--- a/contrib/pg_stat_statements/sql/wal.sql
+++ b/contrib/pg_stat_statements/sql/wal.sql
@@ -18,3 +18,36 @@ wal_records > 0 as wal_records_generated,
wal_records >= rows as wal_records_ge_rows
FROM pg_stat_statements ORDER BY query COLLATE "C";
SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+
+--
+-- Validate buffer/WAL counting with caught exception in PL/pgSQL
+--
+CREATE TEMP TABLE pgss_error_tab (a int, b char(20));
+INSERT INTO pgss_error_tab VALUES (0, 'zzz');
+
+CREATE FUNCTION pgss_error_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+SELECT pgss_error_func();
+
+-- Buffer/WAL usage from the wCTE INSERT should survive the exception
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_error_func%'
+ORDER BY query COLLATE "C";
+
+DROP TABLE pgss_error_tab;
+DROP FUNCTION pgss_error_func;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 60d90329a651d..6f0cb2a285b38 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -2778,7 +2778,7 @@ postgresIterateDirectModify(ForeignScanState *node)
if (!resultRelInfo->ri_projectReturning)
{
TupleTableSlot *slot = node->ss.ss_ScanTupleSlot;
- Instrumentation *instr = node->ss.ps.instrument;
+ NodeInstrumentation *instr = node->ss.ps.instrument;
Assert(!dmstate->has_returning);
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml
index 5f6f1db0467e9..9219625faf6e3 100644
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -734,6 +734,7 @@ WHERE t1.unique1 < 10 AND t1.unique2 = t2.unique2;
-> Index Scan using tenk2_unique2 on tenk2 t2 (cost=0.29..7.90 rows=1 width=244) (actual time=0.003..0.003 rows=1.00 loops=10)
Index Cond: (unique2 = t1.unique2)
Index Searches: 10
+ Table Buffers: shared hit=10
Buffers: shared hit=24 read=6
Planning:
Buffers: shared hit=15 dirtied=9
@@ -949,7 +950,8 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)';
Index Cond: (f1 @> '((0.5,2))'::polygon)
Rows Removed by Index Recheck: 1
Index Searches: 1
- Buffers: shared hit=1
+ Table Buffers: shared hit=1
+ Buffers: shared hit=2
Planning Time: 0.039 ms
Execution Time: 0.098 ms
@@ -958,7 +960,9 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)';
then rejected by a recheck of the index condition. This happens because a
GiST index is lossy
for polygon containment tests: it actually
returns the rows with polygons that overlap the target, and then we have
- to do the exact containment test on those rows.
+ to do the exact containment test on those rows. The Table Buffers
+ counts show how many buffer accesses were made to the table rather than to
+ the index; these accesses are also included in the Buffers counts.
@@ -1147,13 +1151,14 @@ EXPLAIN ANALYZE SELECT * FROM tenk1 WHERE unique1 < 100 AND unique2 > 9000
QUERY PLAN
-------------------------------------------------------------------&zwsp;------------------------------------------------------------
Limit (cost=0.29..14.33 rows=2 width=244) (actual time=0.051..0.071 rows=2.00 loops=1)
- Buffers: shared hit=16
+ Buffers: shared hit=14
-> Index Scan using tenk1_unique2 on tenk1 (cost=0.29..70.50 rows=10 width=244) (actual time=0.051..0.070 rows=2.00 loops=1)
Index Cond: (unique2 > 9000)
Filter: (unique1 < 100)
Rows Removed by Filter: 287
Index Searches: 1
- Buffers: shared hit=16
+ Table Buffers: shared hit=11
+ Buffers: shared hit=14
Planning Time: 0.077 ms
Execution Time: 0.086 ms
diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
index 7dee77fd366b0..912c96f2ff51a 100644
--- a/doc/src/sgml/ref/explain.sgml
+++ b/doc/src/sgml/ref/explain.sgml
@@ -506,6 +506,7 @@ EXPLAIN ANALYZE EXECUTE query(100, 200);
-> Index Scan using test_pkey on test (cost=0.29..10.27 rows=99 width=8) (actual time=0.009..0.025 rows=99.00 loops=1)
Index Cond: ((id > 100) AND (id < 200))
Index Searches: 1
+ Table Buffers: shared hit=1
Buffers: shared hit=4
Planning Time: 0.244 ms
Execution Time: 0.073 ms
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 1909c3254b5ba..d62eb7dee9cdf 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -2432,8 +2432,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
* and PARALLEL_KEY_BUFFER_USAGE.
*
* If there are no extensions loaded that care, we could skip this. We
- * have no way of knowing whether anyone's looking at pgWalUsage or
- * pgBufferUsage, so do it unconditionally.
+ * have no way of knowing whether anyone's looking at instrumentation, so
+ * do it unconditionally.
*/
shm_toc_estimate_chunk(&pcxt->estimator,
mul_size(sizeof(WalUsage), pcxt->nworkers));
@@ -2884,6 +2884,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Relation indexRel;
LOCKMODE heapLockmode;
LOCKMODE indexLockmode;
+ QueryInstrumentation *instr;
WalUsage *walusage;
BufferUsage *bufferusage;
int sortmem;
@@ -2933,7 +2934,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
tuplesort_attach_shared(sharedsort, seg);
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/*
* Might as well use reliable figure when doling out maintenance_work_mem
@@ -2948,7 +2949,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
/* Report WAL/buffer usage during parallel execution */
bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
+ InstrEndParallelQuery(instr,
+ &bufferusage[ParallelWorkerNumber],
&walusage[ParallelWorkerNumber]);
index_close(indexRel, indexLockmode);
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index 97cea5f7d4e04..8cdcd2a9bec1c 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -988,8 +988,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
* and PARALLEL_KEY_BUFFER_USAGE.
*
* If there are no extensions loaded that care, we could skip this. We
- * have no way of knowing whether anyone's looking at pgWalUsage or
- * pgBufferUsage, so do it unconditionally.
+ * have no way of knowing whether anyone's looking at instrumentation, so
+ * do it unconditionally.
*/
shm_toc_estimate_chunk(&pcxt->estimator,
mul_size(sizeof(WalUsage), pcxt->nworkers));
@@ -2114,6 +2114,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Relation indexRel;
LOCKMODE heapLockmode;
LOCKMODE indexLockmode;
+ QueryInstrumentation *instr;
WalUsage *walusage;
BufferUsage *bufferusage;
int sortmem;
@@ -2182,7 +2183,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
tuplesort_attach_shared(sharedsort, seg);
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/*
* Might as well use reliable figure when doling out maintenance_work_mem
@@ -2197,7 +2198,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
/* Report WAL/buffer usage during parallel execution */
bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
+ InstrEndParallelQuery(instr,
+ &bufferusage[ParallelWorkerNumber],
&walusage[ParallelWorkerNumber]);
index_close(indexRel, indexLockmode);
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 82c5b28e0ad55..b4cbd0e682c74 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -641,8 +641,7 @@ heap_vacuum_rel(Relation rel, const VacuumParams params,
TimestampTz starttime = 0;
PgStat_Counter startreadtime = 0,
startwritetime = 0;
- WalUsage startwalusage = pgWalUsage;
- BufferUsage startbufferusage = pgBufferUsage;
+ QueryInstrumentation *instr = NULL;
ErrorContextCallback errcallback;
char **indnames = NULL;
Size dead_items_max_bytes = 0;
@@ -658,6 +657,8 @@ heap_vacuum_rel(Relation rel, const VacuumParams params,
startreadtime = pgStatBlockReadTime;
startwritetime = pgStatBlockWriteTime;
}
+ instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL);
+ InstrQueryStart(instr);
}
/* Used for instrumentation and stats report */
@@ -983,14 +984,14 @@ heap_vacuum_rel(Relation rel, const VacuumParams params,
{
TimestampTz endtime = GetCurrentTimestamp();
+ instr = InstrQueryStopFinalize(instr);
+
if (verbose || params.log_vacuum_min_duration == 0 ||
TimestampDifferenceExceeds(starttime, endtime,
params.log_vacuum_min_duration))
{
long secs_dur;
int usecs_dur;
- WalUsage walusage;
- BufferUsage bufferusage;
StringInfoData buf;
char *msgfmt;
int32 diff;
@@ -999,12 +1000,10 @@ heap_vacuum_rel(Relation rel, const VacuumParams params,
int64 total_blks_hit;
int64 total_blks_read;
int64 total_blks_dirtied;
+ BufferUsage bufferusage = instr->instr.bufusage;
+ WalUsage walusage = instr->instr.walusage;
TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur);
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage);
- memset(&bufferusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage);
total_blks_hit = bufferusage.shared_blks_hit +
bufferusage.local_blks_hit;
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 69ef1527e0609..dfe4fd9459c96 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -1465,8 +1465,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
* and PARALLEL_KEY_BUFFER_USAGE.
*
* If there are no extensions loaded that care, we could skip this. We
- * have no way of knowing whether anyone's looking at pgWalUsage or
- * pgBufferUsage, so do it unconditionally.
+ * have no way of knowing whether anyone's looking at instrumentation, so
+ * do it unconditionally.
*/
shm_toc_estimate_chunk(&pcxt->estimator,
mul_size(sizeof(WalUsage), pcxt->nworkers));
@@ -1752,6 +1752,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Relation indexRel;
LOCKMODE heapLockmode;
LOCKMODE indexLockmode;
+ QueryInstrumentation *instr;
WalUsage *walusage;
BufferUsage *bufferusage;
int sortmem;
@@ -1827,7 +1828,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
}
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/* Perform sorting of spool, and possibly a spool2 */
sortmem = maintenance_work_mem / btshared->scantuplesortstates;
@@ -1837,7 +1838,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
/* Report WAL/buffer usage during parallel execution */
bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
+ InstrEndParallelQuery(instr,
+ &bufferusage[ParallelWorkerNumber],
&walusage[ParallelWorkerNumber]);
#ifdef BTREE_BUILD_STATS
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f5c9a34374dde..9b33584f4548e 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -1081,10 +1081,10 @@ XLogInsertRecord(XLogRecData *rdata,
/* Report WAL traffic to the instrumentation. */
if (inserted)
{
- pgWalUsage.wal_bytes += rechdr->xl_tot_len;
- pgWalUsage.wal_records++;
- pgWalUsage.wal_fpi += num_fpi;
- pgWalUsage.wal_fpi_bytes += fpi_bytes;
+ INSTR_WALUSAGE_ADD(wal_bytes, rechdr->xl_tot_len);
+ INSTR_WALUSAGE_INCR(wal_records);
+ INSTR_WALUSAGE_ADD(wal_fpi, num_fpi);
+ INSTR_WALUSAGE_ADD(wal_fpi_bytes, fpi_bytes);
/* Required for the flush of pending stats WAL data */
pgstat_report_fixed = true;
@@ -2063,7 +2063,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
WriteRqst.Flush = InvalidXLogRecPtr;
XLogWrite(WriteRqst, tli, false);
LWLockRelease(WALWriteLock);
- pgWalUsage.wal_buffers_full++;
+ INSTR_WALUSAGE_INCR(wal_buffers_full);
TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
/*
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 53adac9139bd4..38f8b379fa413 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -308,9 +308,7 @@ do_analyze_rel(Relation onerel, const VacuumParams params,
Oid save_userid;
int save_sec_context;
int save_nestlevel;
- WalUsage startwalusage = pgWalUsage;
- BufferUsage startbufferusage = pgBufferUsage;
- BufferUsage bufferusage;
+ QueryInstrumentation *instr = NULL;
PgStat_Counter startreadtime = 0;
PgStat_Counter startwritetime = 0;
@@ -361,6 +359,9 @@ do_analyze_rel(Relation onerel, const VacuumParams params,
}
pg_rusage_init(&ru0);
+
+ instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL);
+ InstrQueryStart(instr);
}
/* Used for instrumentation and stats report */
@@ -741,12 +742,13 @@ do_analyze_rel(Relation onerel, const VacuumParams params,
{
TimestampTz endtime = GetCurrentTimestamp();
+ instr = InstrQueryStopFinalize(instr);
+
if (verbose || params.log_analyze_min_duration == 0 ||
TimestampDifferenceExceeds(starttime, endtime,
params.log_analyze_min_duration))
{
long delay_in_ms;
- WalUsage walusage;
double read_rate = 0;
double write_rate = 0;
char *msgfmt;
@@ -754,18 +756,15 @@ do_analyze_rel(Relation onerel, const VacuumParams params,
int64 total_blks_hit;
int64 total_blks_read;
int64 total_blks_dirtied;
-
- memset(&bufferusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage);
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage);
-
- total_blks_hit = bufferusage.shared_blks_hit +
- bufferusage.local_blks_hit;
- total_blks_read = bufferusage.shared_blks_read +
- bufferusage.local_blks_read;
- total_blks_dirtied = bufferusage.shared_blks_dirtied +
- bufferusage.local_blks_dirtied;
+ BufferUsage bufusage = instr->instr.bufusage;
+ WalUsage walusage = instr->instr.walusage;
+
+ total_blks_hit = bufusage.shared_blks_hit +
+ bufusage.local_blks_hit;
+ total_blks_read = bufusage.shared_blks_read +
+ bufusage.local_blks_read;
+ total_blks_dirtied = bufusage.shared_blks_dirtied +
+ bufusage.local_blks_dirtied;
/*
* We do not expect an analyze to take > 25 days and it simplifies
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 93918a223b8d7..8a641f9d05f94 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -143,7 +143,7 @@ static void show_instrumentation_count(const char *qlabel, int which,
static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es);
static const char *explain_get_index_name(Oid indexId);
static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage);
-static void show_buffer_usage(ExplainState *es, const BufferUsage *usage);
+static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title);
static void show_wal_usage(ExplainState *es, const WalUsage *usage);
static void show_memory_counters(ExplainState *es,
const MemoryContextCounters *mem_counters);
@@ -322,14 +322,16 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
QueryEnvironment *queryEnv)
{
PlannedStmt *plan;
- instr_time planstart,
- planduration;
- BufferUsage bufusage_start,
- bufusage;
+ QueryInstrumentation *instr = NULL;
MemoryContextCounters mem_counters;
MemoryContext planner_ctx = NULL;
MemoryContext saved_ctx = NULL;
+ if (es->buffers)
+ instr = InstrQueryAlloc(INSTRUMENT_TIMER | INSTRUMENT_BUFFERS);
+ else
+ instr = InstrQueryAlloc(INSTRUMENT_TIMER);
+
if (es->memory)
{
/*
@@ -346,15 +348,12 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
saved_ctx = MemoryContextSwitchTo(planner_ctx);
}
- if (es->buffers)
- bufusage_start = pgBufferUsage;
- INSTR_TIME_SET_CURRENT(planstart);
+ InstrQueryStart(instr);
/* plan the query */
plan = pg_plan_query(query, queryString, cursorOptions, params, es);
- INSTR_TIME_SET_CURRENT(planduration);
- INSTR_TIME_SUBTRACT(planduration, planstart);
+ instr = InstrQueryStopFinalize(instr);
if (es->memory)
{
@@ -362,16 +361,9 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
MemoryContextMemConsumed(planner_ctx, &mem_counters);
}
- /* calc differences of buffer counters. */
- if (es->buffers)
- {
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
- }
-
/* run it (if needed) and produce output */
ExplainOnePlan(plan, into, es, queryString, params, queryEnv,
- &planduration, (es->buffers ? &bufusage : NULL),
+ &instr->instr.total, (es->buffers ? &instr->instr.bufusage : NULL),
es->memory ? &mem_counters : NULL);
}
@@ -611,7 +603,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
}
if (bufusage)
- show_buffer_usage(es, bufusage);
+ show_buffer_usage(es, bufusage, NULL);
if (mem_counters)
show_memory_counters(es, mem_counters);
@@ -1028,7 +1020,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics)
if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage))
{
es->indent++;
- show_buffer_usage(es, &metrics->bufferUsage);
+ show_buffer_usage(es, &metrics->bufferUsage, NULL);
es->indent--;
}
}
@@ -1042,7 +1034,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics)
BYTES_TO_KILOBYTES(metrics->bytesSent), es);
ExplainPropertyText("Format", format, es);
if (es->buffers)
- show_buffer_usage(es, &metrics->bufferUsage);
+ show_buffer_usage(es, &metrics->bufferUsage, NULL);
}
ExplainCloseGroup("Serialization", "Serialization", true, es);
@@ -1099,18 +1091,15 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es)
for (nt = 0; nt < rInfo->ri_TrigDesc->numtriggers; nt++)
{
Trigger *trig = rInfo->ri_TrigDesc->triggers + nt;
- Instrumentation *instr = rInfo->ri_TrigInstrument + nt;
+ TriggerInstrumentation *tginstr = rInfo->ri_TrigInstrument + nt;
char *relname;
char *conname = NULL;
- /* Must clean up instrumentation state */
- InstrEndLoop(instr);
-
/*
* We ignore triggers that were never invoked; they likely aren't
* relevant to the current query type.
*/
- if (instr->ntuples == 0)
+ if (tginstr->firings == 0)
continue;
ExplainOpenGroup("Trigger", NULL, true, es);
@@ -1135,11 +1124,11 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es)
if (show_relname)
appendStringInfo(es->str, " on %s", relname);
if (es->timing)
- appendStringInfo(es->str, ": time=%.3f calls=%.0f\n",
- INSTR_TIME_GET_MILLISEC(instr->total),
- instr->ntuples);
+ appendStringInfo(es->str, ": time=%.3f calls=%d\n",
+ INSTR_TIME_GET_MILLISEC(tginstr->instr.total),
+ tginstr->firings);
else
- appendStringInfo(es->str, ": calls=%.0f\n", instr->ntuples);
+ appendStringInfo(es->str, ": calls=%d\n", tginstr->firings);
}
else
{
@@ -1149,9 +1138,9 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es)
ExplainPropertyText("Relation", relname, es);
if (es->timing)
ExplainPropertyFloat("Time", "ms",
- INSTR_TIME_GET_MILLISEC(instr->total), 3,
+ INSTR_TIME_GET_MILLISEC(tginstr->instr.total), 3,
es);
- ExplainPropertyFloat("Calls", NULL, instr->ntuples, 0, es);
+ ExplainPropertyInteger("Calls", NULL, tginstr->firings, es);
}
if (conname)
@@ -1838,7 +1827,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
{
double nloops = planstate->instrument->nloops;
double startup_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->startup) / nloops;
- double total_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->total) / nloops;
+ double total_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->instr.total) / nloops;
double rows = planstate->instrument->ntuples / nloops;
if (es->format == EXPLAIN_FORMAT_TEXT)
@@ -1891,11 +1880,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
/* prepare per-worker general execution details */
if (es->workers_state && es->verbose)
{
- WorkerInstrumentation *w = planstate->worker_instrument;
+ WorkerNodeInstrumentation *w = planstate->worker_instrument;
for (int n = 0; n < w->num_workers; n++)
{
- Instrumentation *instrument = &w->instrument[n];
+ NodeInstrumentation *instrument = &w->instrument[n];
double nloops = instrument->nloops;
double startup_ms;
double total_ms;
@@ -1904,7 +1893,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (nloops <= 0)
continue;
startup_ms = INSTR_TIME_GET_MILLISEC(instrument->startup) / nloops;
- total_ms = INSTR_TIME_GET_MILLISEC(instrument->total) / nloops;
+ total_ms = INSTR_TIME_GET_MILLISEC(instrument->instr.total) / nloops;
rows = instrument->ntuples / nloops;
ExplainOpenWorker(n, es);
@@ -1973,6 +1962,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
show_indexsearches_info(planstate, es);
+
+ if (es->buffers && planstate->instrument)
+ show_buffer_usage(es, &((IndexScanState *) planstate)->iss_InstrumentTable->instr.bufusage, "Table");
break;
case T_IndexOnlyScan:
show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
@@ -2291,18 +2283,18 @@ ExplainNode(PlanState *planstate, List *ancestors,
/* Show buffer/WAL usage */
if (es->buffers && planstate->instrument)
- show_buffer_usage(es, &planstate->instrument->bufusage);
+ show_buffer_usage(es, &planstate->instrument->instr.bufusage, NULL);
if (es->wal && planstate->instrument)
- show_wal_usage(es, &planstate->instrument->walusage);
+ show_wal_usage(es, &planstate->instrument->instr.walusage);
/* Prepare per-worker buffer/WAL usage */
if (es->workers_state && (es->buffers || es->wal) && es->verbose)
{
- WorkerInstrumentation *w = planstate->worker_instrument;
+ WorkerNodeInstrumentation *w = planstate->worker_instrument;
for (int n = 0; n < w->num_workers; n++)
{
- Instrumentation *instrument = &w->instrument[n];
+ NodeInstrumentation *instrument = &w->instrument[n];
double nloops = instrument->nloops;
if (nloops <= 0)
@@ -2310,9 +2302,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
ExplainOpenWorker(n, es);
if (es->buffers)
- show_buffer_usage(es, &instrument->bufusage);
+ show_buffer_usage(es, &instrument->instr.bufusage, NULL);
if (es->wal)
- show_wal_usage(es, &instrument->walusage);
+ show_wal_usage(es, &instrument->instr.walusage);
ExplainCloseWorker(n, es);
}
}
@@ -4110,7 +4102,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage)
* Show buffer usage details. This better be sync with peek_buffer_usage.
*/
static void
-show_buffer_usage(ExplainState *es, const BufferUsage *usage)
+show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title)
{
if (es->format == EXPLAIN_FORMAT_TEXT)
{
@@ -4135,6 +4127,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
if (has_shared || has_local || has_temp)
{
ExplainIndentText(es);
+ if (title)
+ appendStringInfo(es->str, "%s ", title);
appendStringInfoString(es->str, "Buffers:");
if (has_shared)
@@ -4190,6 +4184,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
if (has_shared_timing || has_local_timing || has_temp_timing)
{
ExplainIndentText(es);
+ if (title)
+ appendStringInfo(es->str, "%s ", title);
appendStringInfoString(es->str, "I/O Timings:");
if (has_shared_timing)
@@ -4231,6 +4227,14 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
}
else
{
+ char *buffers_title = NULL;
+
+ if (title)
+ {
+ buffers_title = psprintf("%s Buffers", title);
+ ExplainOpenGroup(buffers_title, buffers_title, true, es);
+ }
+
ExplainPropertyInteger("Shared Hit Blocks", NULL,
usage->shared_blks_hit, es);
ExplainPropertyInteger("Shared Read Blocks", NULL,
@@ -4251,8 +4255,20 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
usage->temp_blks_read, es);
ExplainPropertyInteger("Temp Written Blocks", NULL,
usage->temp_blks_written, es);
+
+ if (buffers_title)
+ ExplainCloseGroup(buffers_title, buffers_title, true, es);
+
if (track_io_timing)
{
+ char *timings_title = NULL;
+
+ if (title)
+ {
+ timings_title = psprintf("%s I/O Timings", title);
+ ExplainOpenGroup(timings_title, timings_title, true, es);
+ }
+
ExplainPropertyFloat("Shared I/O Read Time", "ms",
INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time),
3, es);
@@ -4271,6 +4287,9 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
ExplainPropertyFloat("Temp I/O Write Time", "ms",
INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time),
3, es);
+
+ if (timings_title)
+ ExplainCloseGroup(timings_title, timings_title, true, es);
}
}
}
diff --git a/src/backend/commands/explain_dr.c b/src/backend/commands/explain_dr.c
index 3c96061cf32ab..c9695b03a6030 100644
--- a/src/backend/commands/explain_dr.c
+++ b/src/backend/commands/explain_dr.c
@@ -110,15 +110,20 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self)
MemoryContext oldcontext;
StringInfo buf = &myState->buf;
int natts = typeinfo->natts;
- instr_time start,
- end;
- BufferUsage instr_start;
+ QueryInstrumentation *instr = NULL;
/* only measure time, buffers if requested */
- if (myState->es->timing)
- INSTR_TIME_SET_CURRENT(start);
- if (myState->es->buffers)
- instr_start = pgBufferUsage;
+ if (myState->es->timing || myState->es->buffers)
+ {
+ InstrumentOption instrument_options = 0;
+
+ if (myState->es->timing)
+ instrument_options |= INSTRUMENT_TIMER;
+ if (myState->es->buffers)
+ instrument_options |= INSTRUMENT_BUFFERS;
+ instr = InstrQueryAlloc(instrument_options);
+ InstrQueryStart(instr);
+ }
/* Set or update my derived attribute info, if needed */
if (myState->attrinfo != typeinfo || myState->nattrs != natts)
@@ -186,18 +191,19 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self)
MemoryContextSwitchTo(oldcontext);
MemoryContextReset(myState->tmpcontext);
+ if (myState->es->timing || myState->es->buffers)
+ instr = InstrQueryStopFinalize(instr);
+
/* Update timing data */
if (myState->es->timing)
- {
- INSTR_TIME_SET_CURRENT(end);
- INSTR_TIME_ACCUM_DIFF(myState->metrics.timeSpent, end, start);
- }
+ INSTR_TIME_ADD(myState->metrics.timeSpent, instr->instr.total);
/* Update buffer metrics */
if (myState->es->buffers)
- BufferUsageAccumDiff(&myState->metrics.bufferUsage,
- &pgBufferUsage,
- &instr_start);
+ BufferUsageAdd(&myState->metrics.bufferUsage, &instr->instr.bufusage);
+
+ if (myState->es->timing || myState->es->buffers)
+ pfree(instr);
return true;
}
diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c
index 5b86a727587b2..d81f6b30e9cd3 100644
--- a/src/backend/commands/prepare.c
+++ b/src/backend/commands/prepare.c
@@ -578,13 +578,16 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
ListCell *p;
ParamListInfo paramLI = NULL;
EState *estate = NULL;
- instr_time planstart;
- instr_time planduration;
- BufferUsage bufusage_start,
- bufusage;
+ QueryInstrumentation *instr = NULL;
MemoryContextCounters mem_counters;
MemoryContext planner_ctx = NULL;
MemoryContext saved_ctx = NULL;
+ int instrument_options = INSTRUMENT_TIMER;
+
+ if (es->buffers)
+ instrument_options |= INSTRUMENT_BUFFERS;
+
+ instr = InstrQueryAlloc(instrument_options);
if (es->memory)
{
@@ -596,9 +599,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
saved_ctx = MemoryContextSwitchTo(planner_ctx);
}
- if (es->buffers)
- bufusage_start = pgBufferUsage;
- INSTR_TIME_SET_CURRENT(planstart);
+ InstrQueryStart(instr);
/* Look it up in the hash table */
entry = FetchPreparedStatement(execstmt->name, true);
@@ -633,8 +634,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
cplan = GetCachedPlan(entry->plansource, paramLI,
CurrentResourceOwner, pstate->p_queryEnv);
- INSTR_TIME_SET_CURRENT(planduration);
- INSTR_TIME_SUBTRACT(planduration, planstart);
+ instr = InstrQueryStopFinalize(instr);
if (es->memory)
{
@@ -642,13 +642,6 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
MemoryContextMemConsumed(planner_ctx, &mem_counters);
}
- /* calc differences of buffer counters. */
- if (es->buffers)
- {
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
- }
-
plan_list = cplan->stmt_list;
/* Explain each query */
@@ -658,7 +651,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
if (pstmt->commandType != CMD_UTILITY)
ExplainOnePlan(pstmt, into, es, query_string, paramLI, pstate->p_queryEnv,
- &planduration, (es->buffers ? &bufusage : NULL),
+ &instr->instr.total, (es->buffers ? &instr->instr.bufusage : NULL),
es->memory ? &mem_counters : NULL);
else
ExplainOneUtility(pstmt->utilityStmt, into, es, pstate, paramLI);
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index 98d402c0a3be7..c336007314121 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -90,7 +90,7 @@ static bool TriggerEnabled(EState *estate, ResultRelInfo *relinfo,
static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata,
int tgindx,
FmgrInfo *finfo,
- Instrumentation *instr,
+ TriggerInstrumentation *instr,
MemoryContext per_tuple_context);
static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
ResultRelInfo *src_partinfo,
@@ -2309,7 +2309,7 @@ static HeapTuple
ExecCallTriggerFunc(TriggerData *trigdata,
int tgindx,
FmgrInfo *finfo,
- Instrumentation *instr,
+ TriggerInstrumentation *instr,
MemoryContext per_tuple_context)
{
LOCAL_FCINFO(fcinfo, 0);
@@ -2344,7 +2344,7 @@ ExecCallTriggerFunc(TriggerData *trigdata,
* If doing EXPLAIN ANALYZE, start charging time to this trigger.
*/
if (instr)
- InstrStartNode(instr + tgindx);
+ InstrStartTrigger(instr + tgindx);
/*
* Do the function evaluation in the per-tuple memory context, so that
@@ -2389,10 +2389,10 @@ ExecCallTriggerFunc(TriggerData *trigdata,
/*
* If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count
- * one "tuple returned" (really the number of firings).
+ * the firing of the trigger.
*/
if (instr)
- InstrStopNode(instr + tgindx, 1);
+ InstrStopTrigger(instr + tgindx, 1);
return (HeapTuple) DatumGetPointer(result);
}
@@ -3936,7 +3936,7 @@ static void AfterTriggerExecute(EState *estate,
ResultRelInfo *dst_relInfo,
TriggerDesc *trigdesc,
FmgrInfo *finfo,
- Instrumentation *instr,
+ TriggerInstrumentation *instr,
MemoryContext per_tuple_context,
TupleTableSlot *trig_tuple_slot1,
TupleTableSlot *trig_tuple_slot2);
@@ -4330,7 +4330,7 @@ AfterTriggerExecute(EState *estate,
ResultRelInfo *src_relInfo,
ResultRelInfo *dst_relInfo,
TriggerDesc *trigdesc,
- FmgrInfo *finfo, Instrumentation *instr,
+ FmgrInfo *finfo, TriggerInstrumentation *instr,
MemoryContext per_tuple_context,
TupleTableSlot *trig_tuple_slot1,
TupleTableSlot *trig_tuple_slot2)
@@ -4371,7 +4371,7 @@ AfterTriggerExecute(EState *estate,
* to include time spent re-fetching tuples in the trigger cost.
*/
if (instr)
- InstrStartNode(instr + tgindx);
+ InstrStartTrigger(instr + tgindx);
/*
* Fetch the required tuple(s).
@@ -4588,10 +4588,10 @@ AfterTriggerExecute(EState *estate,
/*
* If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count
- * one "tuple returned" (really the number of firings).
+ * the firing of the trigger.
*/
if (instr)
- InstrStopNode(instr + tgindx, 1);
+ InstrStopTrigger(instr + tgindx, 1);
}
@@ -4707,7 +4707,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events,
Relation rel = NULL;
TriggerDesc *trigdesc = NULL;
FmgrInfo *finfo = NULL;
- Instrumentation *instr = NULL;
+ TriggerInstrumentation *instr = NULL;
TupleTableSlot *slot1 = NULL,
*slot2 = NULL;
diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c
index 279108ca89f0b..75074fe4efa60 100644
--- a/src/backend/commands/vacuumparallel.c
+++ b/src/backend/commands/vacuumparallel.c
@@ -308,8 +308,8 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes,
* PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE.
*
* If there are no extensions loaded that care, we could skip this. We
- * have no way of knowing whether anyone's looking at pgBufferUsage or
- * pgWalUsage, so do it unconditionally.
+ * have no way of knowing whether anyone's looking at instrumentation, so
+ * do it unconditionally.
*/
shm_toc_estimate_chunk(&pcxt->estimator,
mul_size(sizeof(BufferUsage), pcxt->nworkers));
@@ -995,6 +995,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
PVIndStats *indstats;
PVShared *shared;
TidStore *dead_items;
+ QueryInstrumentation *instr;
BufferUsage *buffer_usage;
WalUsage *wal_usage;
int nindexes;
@@ -1084,7 +1085,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
error_context_stack = &errcallback;
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/* Process indexes to perform vacuum/cleanup */
parallel_vacuum_process_safe_indexes(&pvs);
@@ -1092,7 +1093,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
/* Report buffer/WAL usage during parallel execution */
buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false);
wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
+ InstrEndParallelQuery(instr,
+ &buffer_usage[ParallelWorkerNumber],
&wal_usage[ParallelWorkerNumber]);
/* Report any remaining cost-based vacuum delay time */
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index bfd3ebc601ec6..82253317e96f5 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -76,6 +76,7 @@ ExecutorCheckPerms_hook_type ExecutorCheckPerms_hook = NULL;
/* decls for local routines only used within this module */
static void InitPlan(QueryDesc *queryDesc, int eflags);
static void CheckValidRowMarkRel(Relation rel, RowMarkType markType);
+static void ExecFinalizeTriggerInstrumentation(EState *estate);
static void ExecPostprocessPlan(EState *estate);
static void ExecEndPlan(PlanState *planstate, EState *estate);
static void ExecutePlan(QueryDesc *queryDesc,
@@ -329,9 +330,28 @@ standard_ExecutorRun(QueryDesc *queryDesc,
*/
oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
- /* Allow instrumentation of Executor overall runtime */
+ /*
+	 * Start up the required top-level instrumentation stack entry for
+	 * WAL/buffer tracking.
+ */
+ if (!queryDesc->totaltime && (estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL)))
+ queryDesc->totaltime = InstrQueryAlloc(estate->es_instrument);
+
if (queryDesc->totaltime)
- InstrStartNode(queryDesc->totaltime);
+ {
+ /* Allow instrumentation of Executor overall runtime */
+ InstrQueryStart(queryDesc->totaltime);
+
+ /*
+ * Remember all node entries for abort recovery. We do this once here
+ * after the first call to InstrQueryStart has pushed the parent
+ * entry.
+ */
+ if ((estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL)) &&
+ !queryDesc->already_executed)
+ ExecRememberNodeInstrumentation(queryDesc->planstate,
+ queryDesc->totaltime);
+ }
/*
* extract information from the query descriptor and the query feature.
@@ -383,7 +403,7 @@ standard_ExecutorRun(QueryDesc *queryDesc,
dest->rShutdown(dest);
if (queryDesc->totaltime)
- InstrStopNode(queryDesc->totaltime, estate->es_processed);
+ InstrQueryStop(queryDesc->totaltime);
MemoryContextSwitchTo(oldcontext);
}
@@ -433,7 +453,7 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
/* Allow instrumentation of Executor overall runtime */
if (queryDesc->totaltime)
- InstrStartNode(queryDesc->totaltime);
+ InstrQueryStart(queryDesc->totaltime);
/* Run ModifyTable nodes to completion */
ExecPostprocessPlan(estate);
@@ -442,8 +462,26 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS))
AfterTriggerEndQuery(estate);
+ /*
+ * Accumulate per-node and trigger statistics to their respective parent
+ * instrumentation stacks.
+ *
+ * We skip this in parallel workers because their per-node stats are
+ * reported individually via ExecParallelReportInstrumentation, and the
+ * leader's own ExecFinalizeNodeInstrumentation handles propagation. If
+ * we accumulated here, the leader would double-count: worker parent nodes
+ * would already include their children's stats, and then the leader's
+ * accumulation would add the children again.
+ */
+ if (queryDesc->totaltime && estate->es_instrument && !IsParallelWorker())
+ {
+ ExecFinalizeNodeInstrumentation(queryDesc->planstate);
+
+ ExecFinalizeTriggerInstrumentation(estate);
+ }
+
if (queryDesc->totaltime)
- InstrStopNode(queryDesc->totaltime, 0);
+ queryDesc->totaltime = InstrQueryStopFinalize(queryDesc->totaltime);
MemoryContextSwitchTo(oldcontext);
@@ -1270,7 +1308,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
resultRelInfo->ri_TrigWhenExprs = (ExprState **)
palloc0_array(ExprState *, n);
if (instrument_options)
- resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options, false);
+ resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(n, instrument_options);
}
else
{
@@ -1484,6 +1522,24 @@ ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo)
return resultRelInfo->ri_ancestorResultRels;
}
+static void
+ExecFinalizeTriggerInstrumentation(EState *estate)
+{
+ List *rels = NIL;
+
+ rels = list_concat(rels, estate->es_tuple_routing_result_relations);
+ rels = list_concat(rels, estate->es_opened_result_relations);
+ rels = list_concat(rels, estate->es_trig_target_relations);
+
+ foreach_node(ResultRelInfo, rInfo, rels)
+ {
+ TriggerInstrumentation *ti = rInfo->ri_TrigInstrument;
+
+ if (ti && (ti->instr.need_bufusage || ti->instr.need_walusage))
+ InstrAccum(instr_stack.current, &ti->instr);
+ }
+}
+
/* ----------------------------------------------------------------
* ExecPostprocessPlan
*
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index ac84af294c9f6..73534fa6c7e5d 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -87,7 +87,7 @@ typedef struct FixedParallelExecutorState
* instrument_options: Same meaning here as in instrument.c.
*
* instrument_offset: Offset, relative to the start of this structure,
- * of the first Instrumentation object. This will depend on the length of
+ * of the first NodeInstrumentation object. This will depend on the length of
* the plan_node_id array.
*
* num_workers: Number of workers.
@@ -104,11 +104,15 @@ struct SharedExecutorInstrumentation
int num_workers;
int num_plan_nodes;
int plan_node_id[FLEXIBLE_ARRAY_MEMBER];
- /* array of num_plan_nodes * num_workers Instrumentation objects follows */
+
+ /*
+ * array of num_plan_nodes * num_workers NodeInstrumentation objects
+ * follows
+ */
};
#define GetInstrumentationArray(sei) \
(StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \
- (Instrumentation *) (((char *) sei) + sei->instrument_offset))
+ (NodeInstrumentation *) (((char *) sei) + sei->instrument_offset))
/* Context object for ExecParallelEstimate. */
typedef struct ExecParallelEstimateContext
@@ -690,7 +694,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
*
* If EXPLAIN is not in use and there are no extensions loaded that care,
* we could skip this. But we have no way of knowing whether anyone's
- * looking at pgBufferUsage, so do it unconditionally.
+ * looking at instrumentation, so do it unconditionally.
*/
shm_toc_estimate_chunk(&pcxt->estimator,
mul_size(sizeof(BufferUsage), pcxt->nworkers));
@@ -725,7 +729,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
instrumentation_len = MAXALIGN(instrumentation_len);
instrument_offset = instrumentation_len;
instrumentation_len +=
- mul_size(sizeof(Instrumentation),
+ mul_size(sizeof(NodeInstrumentation),
mul_size(e.nnodes, nworkers));
shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len);
shm_toc_estimate_keys(&pcxt->estimator, 1);
@@ -811,7 +815,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
*/
if (estate->es_instrument)
{
- Instrumentation *instrument;
+ NodeInstrumentation *instrument;
int i;
instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len);
@@ -821,7 +825,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
instrumentation->num_plan_nodes = e.nnodes;
instrument = GetInstrumentationArray(instrumentation);
for (i = 0; i < nworkers * e.nnodes; ++i)
- InstrInit(&instrument[i], estate->es_instrument);
+ InstrInitNode(&instrument[i], estate->es_instrument);
shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION,
instrumentation);
pei->instrumentation = instrumentation;
@@ -1053,7 +1057,7 @@ static bool
ExecParallelRetrieveInstrumentation(PlanState *planstate,
SharedExecutorInstrumentation *instrumentation)
{
- Instrumentation *instrument;
+ NodeInstrumentation *instrument;
int i;
int n;
int ibytes;
@@ -1071,8 +1075,22 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
instrument = GetInstrumentationArray(instrumentation);
instrument += i * instrumentation->num_workers;
for (n = 0; n < instrumentation->num_workers; ++n)
+ {
InstrAggNode(planstate->instrument, &instrument[n]);
+ /*
+ * Also add worker WAL usage to the global pgWalUsage counter.
+ *
+ * When per-node instrumentation is active, parallel workers skip
+ * ExecFinalizeNodeInstrumentation (to avoid double-counting in
+ * EXPLAIN), so per-node WAL activity is not rolled up into the
+ * query-level stats that InstrAccumParallelQuery receives. Without
+ * this, pgWalUsage would under-report WAL generated by parallel
+ * workers when instrumentation is active.
+ */
+ WalUsageAdd(&pgWalUsage, &instrument[n].instr.walusage);
+ }
+
/*
* Also store the per-worker detail.
*
@@ -1081,9 +1099,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
* Switch into per-query memory context.
*/
oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt);
- ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation));
+ ibytes = mul_size(instrumentation->num_workers, sizeof(NodeInstrumentation));
planstate->worker_instrument =
- palloc(ibytes + offsetof(WorkerInstrumentation, instrument));
+ palloc(ibytes + offsetof(WorkerNodeInstrumentation, instrument));
MemoryContextSwitchTo(oldcontext);
planstate->worker_instrument->num_workers = instrumentation->num_workers;
@@ -1313,7 +1331,7 @@ ExecParallelReportInstrumentation(PlanState *planstate,
{
int i;
int plan_node_id = planstate->plan->plan_node_id;
- Instrumentation *instrument;
+ NodeInstrumentation *instrument;
InstrEndLoop(planstate->instrument);
@@ -1452,6 +1470,7 @@ void
ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
{
FixedParallelExecutorState *fpes;
+ QueryInstrumentation *instr;
BufferUsage *buffer_usage;
WalUsage *wal_usage;
DestReceiver *receiver;
@@ -1512,7 +1531,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
* leader, which also doesn't count buffer accesses and WAL activity that
* occur during executor startup.
*/
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/*
* Run the plan. If we specified a tuple bound, be careful not to demand
@@ -1528,7 +1547,8 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
/* Report buffer/WAL usage during parallel execution. */
buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
+ InstrEndParallelQuery(instr,
+ &buffer_usage[ParallelWorkerNumber],
&wal_usage[ParallelWorkerNumber]);
/* Report instrumentation data if any instrumentation options are set. */
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index 7e40b85251718..5e476939edfb5 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -120,8 +120,9 @@
#include "nodes/nodeFuncs.h"
static TupleTableSlot *ExecProcNodeFirst(PlanState *node);
-static TupleTableSlot *ExecProcNodeInstr(PlanState *node);
static bool ExecShutdownNode_walker(PlanState *node, void *context);
+static bool ExecRememberNodeInstrumentation_walker(PlanState *node, void *context);
+static bool ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context);
/* ------------------------------------------------------------------------
@@ -413,8 +414,23 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
/* Set up instrumentation for this node if requested */
if (estate->es_instrument)
- result->instrument = InstrAlloc(1, estate->es_instrument,
- result->async_capable);
+ {
+ result->instrument = InstrAllocNode(estate->es_instrument,
+ result->async_capable);
+
+ /* IndexScan tracks table access separately from index access. */
+ if (IsA(result, IndexScanState) && (estate->es_instrument & INSTRUMENT_BUFFERS) != 0)
+ {
+ IndexScanState *iss = castNode(IndexScanState, result);
+
+ /*
+ * We intentionally don't collect timing here (even if enabled),
+ * since we don't need it, and IndexNext calls InstrPushStack /
+ * InstrPopStack (instead of InstrNode*) to reduce overhead.
+ */
+ iss->iss_InstrumentTable = InstrAllocNode(INSTRUMENT_BUFFERS, false);
+ }
+ }
return result;
}
@@ -462,7 +478,7 @@ ExecProcNodeFirst(PlanState *node)
* have ExecProcNode() directly call the relevant function from now on.
*/
if (node->instrument)
- node->ExecProcNode = ExecProcNodeInstr;
+ node->ExecProcNode = InstrNodeSetupExecProcNode(node->instrument);
else
node->ExecProcNode = node->ExecProcNodeReal;
@@ -470,25 +486,6 @@ ExecProcNodeFirst(PlanState *node)
}
-/*
- * ExecProcNode wrapper that performs instrumentation calls. By keeping
- * this a separate function, we avoid overhead in the normal case where
- * no instrumentation is wanted.
- */
-static TupleTableSlot *
-ExecProcNodeInstr(PlanState *node)
-{
- TupleTableSlot *result;
-
- InstrStartNode(node->instrument);
-
- result = node->ExecProcNodeReal(node);
-
- InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0);
-
- return result;
-}
-
/* ----------------------------------------------------------------
* MultiExecProcNode
@@ -787,10 +784,10 @@ ExecShutdownNode_walker(PlanState *node, void *context)
* at least once already. We don't expect much CPU consumption during
* node shutdown, but in the case of Gather or Gather Merge, we may shut
* down workers at this stage. If so, their buffer usage will get
- * propagated into pgBufferUsage at this point, and we want to make sure
- * that it gets associated with the Gather node. We skip this if the node
- * has never been executed, so as to avoid incorrectly making it appear
- * that it has.
+ * propagated into the current instrumentation stack entry at this point,
+ * and we want to make sure that it gets associated with the Gather node.
+ * We skip this if the node has never been executed, so as to avoid
+ * incorrectly making it appear that it has.
*/
if (node->instrument && node->instrument->running)
InstrStartNode(node->instrument);
@@ -828,6 +825,100 @@ ExecShutdownNode_walker(PlanState *node, void *context)
return false;
}
+/*
+ * ExecRememberNodeInstrumentation
+ *
+ * Register all per-node instrumentation entries as unfinalized children of
+ * the executor's instrumentation. This is needed for abort recovery: if the
+ * executor aborts, we need to walk each per-node entry to recover buffer/WAL
+ * data from nodes that never got finalized, so that aggregate totals
+ * remain accurate even when the query errors out.
+ */
+void
+ExecRememberNodeInstrumentation(PlanState *node, QueryInstrumentation *parent)
+{
+ (void) ExecRememberNodeInstrumentation_walker(node, parent);
+}
+
+static bool
+ExecRememberNodeInstrumentation_walker(PlanState *node, void *context)
+{
+ QueryInstrumentation *parent = (QueryInstrumentation *) context;
+
+ Assert(parent != NULL);
+
+ if (node == NULL)
+ return false;
+
+ if (node->instrument)
+ {
+ InstrQueryRememberNode(parent, node->instrument);
+
+ /* IndexScan has a separate entry to track table access */
+ if (IsA(node, IndexScanState))
+ {
+ IndexScanState *iss = castNode(IndexScanState, node);
+
+ if (iss->iss_InstrumentTable)
+ InstrQueryRememberNode(parent, iss->iss_InstrumentTable);
+ }
+ }
+
+ return planstate_tree_walker(node, ExecRememberNodeInstrumentation_walker, context);
+}
+
+/*
+ * ExecFinalizeNodeInstrumentation
+ *
+ * Accumulate instrumentation stats from all execution nodes to their respective
+ * parents (or the original parent instrumentation).
+ *
+ * This must run after the cleanup done by ExecShutdownNode, and not rely on any
+ * resources cleaned up by it. We also expect shutdown actions to have occurred,
+ * e.g. parallel worker instrumentation to have been added to the leader.
+ */
+void
+ExecFinalizeNodeInstrumentation(PlanState *node)
+{
+ (void) ExecFinalizeNodeInstrumentation_walker(node, instr_stack.current);
+}
+
+static bool
+ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context)
+{
+ Instrumentation *parent = (Instrumentation *) context;
+
+ Assert(parent != NULL);
+
+ if (node == NULL)
+ return false;
+
+ /*
+ * Recurse into children first (bottom-up accumulation), passing our
+ * instrumentation as the parent context. This ensures children can
+ * accumulate to us even if they were never executed by the leader (e.g.
+ * nodes beneath Gather that only workers ran).
+ */
+ planstate_tree_walker(node, ExecFinalizeNodeInstrumentation_walker,
+ node->instrument ? &node->instrument->instr : parent);
+
+ if (!node->instrument)
+ return false;
+
+ /* IndexScan has a separate entry to track table access */
+ if (IsA(node, IndexScanState))
+ {
+ IndexScanState *iss = castNode(IndexScanState, node);
+
+ if (iss->iss_InstrumentTable)
+ iss->iss_InstrumentTable = InstrFinalizeNode(iss->iss_InstrumentTable, &node->instrument->instr);
+ }
+
+ node->instrument = InstrFinalizeNode(node->instrument, parent);
+
+ return false;
+}
+
/*
* ExecSetTupleBound
*
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index a40610bc2522f..2727e7b5ce4dc 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -16,124 +16,419 @@
#include
#include "executor/instrument.h"
+#include "utils/memutils.h"
+#include "utils/resowner.h"
-BufferUsage pgBufferUsage;
-static BufferUsage save_pgBufferUsage;
WalUsage pgWalUsage;
-static WalUsage save_pgWalUsage;
+Instrumentation instr_top;
+InstrStackState instr_stack = {0, 0, NULL, &instr_top};
-static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
-static void WalUsageAdd(WalUsage *dst, WalUsage *add);
+void
+InstrStackGrow(void)
+{
+ if (instr_stack.entries == NULL)
+ {
+ instr_stack.stack_space = 10; /* Allocate sufficient initial space
+ * for typical activity */
+ instr_stack.entries = MemoryContextAlloc(TopMemoryContext,
+ sizeof(Instrumentation *) * instr_stack.stack_space);
+ }
+ else
+ {
+ instr_stack.stack_space *= 2;
+ instr_stack.entries = repalloc_array(instr_stack.entries, Instrumentation *, instr_stack.stack_space);
+ }
+}
+
+/* General purpose instrumentation handling */
+void
+InstrInitOptions(Instrumentation *instr, int instrument_options)
+{
+ instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0;
+ instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0;
+ instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
+}
+
+static inline void
+InstrStartTimer(Instrumentation *instr)
+{
+ Assert(INSTR_TIME_IS_ZERO(instr->starttime));
+ INSTR_TIME_SET_CURRENT(instr->starttime);
+}
-/* Allocate new instrumentation structure(s) */
-Instrumentation *
-InstrAlloc(int n, int instrument_options, bool async_mode)
+static inline void
+InstrStopTimer(Instrumentation *instr)
{
- Instrumentation *instr;
+ instr_time endtime;
- /* initialize all fields to zeroes, then modify as needed */
- instr = palloc0(n * sizeof(Instrumentation));
- if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL))
- {
- bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0;
- bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0;
- bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
- int i;
+ Assert(!INSTR_TIME_IS_ZERO(instr->starttime));
+
+ INSTR_TIME_SET_CURRENT(endtime);
+ INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime);
+
+ INSTR_TIME_SET_ZERO(instr->starttime);
+}
+
+void
+InstrStart(Instrumentation *instr)
+{
+ if (instr->need_timer)
+ InstrStartTimer(instr);
- for (i = 0; i < n; i++)
+ if (instr->need_bufusage || instr->need_walusage)
+ InstrPushStack(instr);
+}
+
+void
+InstrStop(Instrumentation *instr)
+{
+ if (instr->need_timer)
+ InstrStopTimer(instr);
+
+ if (instr->need_bufusage || instr->need_walusage)
+ InstrPopStack(instr);
+}
+
+/*
+ * Stops instrumentation, finalizes the stack entry and accumulates to its parent.
+ *
+ * Note that this intentionally allows passing a stack that is not the current
+ * top, as can happen with PG_FINALLY, or resource owners, which don't have a
+ * guaranteed cleanup order.
+ *
+ * We are careful here to achieve two goals:
+ *
+ * 1) Reset the stack to the parent of whichever of the released stack entries
+ * has the lowest index
+ * 2) Accumulate all instrumentation to the currently active instrumentation,
+ * so that callers get a complete picture of activity, even after an abort
+ */
+void
+InstrStopFinalize(Instrumentation *instr)
+{
+ int idx = -1;
+
+ for (int i = instr_stack.stack_size - 1; i >= 0; i--)
+ {
+ if (instr_stack.entries[i] == instr)
{
- instr[i].need_bufusage = need_buffers;
- instr[i].need_walusage = need_wal;
- instr[i].need_timer = need_timer;
- instr[i].async_mode = async_mode;
+ idx = i;
+ break;
}
}
+ if (idx >= 0)
+ {
+ while (instr_stack.stack_size > idx + 1)
+ instr_stack.stack_size--;
+
+ InstrPopStack(instr);
+ }
+
+ if (instr->need_timer)
+ InstrStopTimer(instr);
+
+ InstrAccum(instr_stack.current, instr);
+}
+
+
+/* Query instrumentation handling */
+
+/*
+ * Use ResourceOwner mechanism to correctly reset instr_stack on abort.
+ */
+static void ResOwnerReleaseInstrumentation(Datum res);
+static const ResourceOwnerDesc instrumentation_resowner_desc =
+{
+ .name = "instrumentation",
+ .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
+ .release_priority = RELEASE_PRIO_INSTRUMENTATION,
+ .ReleaseResource = ResOwnerReleaseInstrumentation,
+ .DebugPrint = NULL, /* default message is fine */
+};
+
+static inline void
+ResourceOwnerRememberInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr)
+{
+ ResourceOwnerRemember(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc);
+}
+
+static inline void
+ResourceOwnerForgetInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr)
+{
+ ResourceOwnerForget(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc);
+}
+
+static void
+ResOwnerReleaseInstrumentation(Datum res)
+{
+ QueryInstrumentation *qinstr = (QueryInstrumentation *) DatumGetPointer(res);
+ dlist_mutable_iter iter;
+
+ /* Accumulate data from all unfinalized child node entries. */
+ dlist_foreach_modify(iter, &qinstr->unfinalized_children)
+ {
+ NodeInstrumentation *child = dlist_container(NodeInstrumentation, unfinalized_node, iter.cur);
+
+ InstrAccum(&qinstr->instr, &child->instr);
+
+ /*
+ * Free NodeInstrumentation now, since InstrFinalizeNode won't be
+ * called
+ */
+ pfree(child);
+ }
+
+ /* Ensure the stack is reset as expected, and we accumulate to the parent */
+ InstrStopFinalize(&qinstr->instr);
+
+ /* Free QueryInstrumentation now, since InstrStop won't be called */
+ pfree(qinstr);
+}
+
+QueryInstrumentation *
+InstrQueryAlloc(int instrument_options)
+{
+ QueryInstrumentation *instr;
+
+ /*
+ * If needed, allocate in TopMemoryContext so that the Instrumentation
+	 * survives transaction abort -- ResourceOwner release needs to access
+	 * it.
+ */
+ if ((instrument_options & INSTRUMENT_BUFFERS) != 0 || (instrument_options & INSTRUMENT_WAL) != 0)
+ instr = MemoryContextAllocZero(TopMemoryContext, sizeof(QueryInstrumentation));
+ else
+ instr = palloc0(sizeof(QueryInstrumentation));
+
+ InstrInitOptions(&instr->instr, instrument_options);
+ dlist_init(&instr->unfinalized_children);
+
return instr;
}
-/* Initialize a pre-allocated instrumentation structure. */
void
-InstrInit(Instrumentation *instr, int instrument_options)
+InstrQueryStart(QueryInstrumentation *qinstr)
{
- memset(instr, 0, sizeof(Instrumentation));
- instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0;
- instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0;
- instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
+ InstrStart(&qinstr->instr);
+
+ if (qinstr->instr.need_bufusage || qinstr->instr.need_walusage)
+ {
+ Assert(CurrentResourceOwner != NULL);
+ qinstr->owner = CurrentResourceOwner;
+
+ ResourceOwnerEnlarge(qinstr->owner);
+ ResourceOwnerRememberInstrumentation(qinstr->owner, qinstr);
+ }
}
-/* Entry to a plan node */
void
-InstrStartNode(Instrumentation *instr)
+InstrQueryStop(QueryInstrumentation *qinstr)
{
- if (instr->need_timer)
+ InstrStop(&qinstr->instr);
+
+ if (qinstr->instr.need_bufusage || qinstr->instr.need_walusage)
{
- if (!INSTR_TIME_IS_ZERO(instr->starttime))
- elog(ERROR, "InstrStartNode called twice in a row");
- else
- INSTR_TIME_SET_CURRENT(instr->starttime);
+ Assert(qinstr->owner != NULL);
+ ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr);
+ qinstr->owner = NULL;
}
+}
+
+QueryInstrumentation *
+InstrQueryStopFinalize(QueryInstrumentation *qinstr)
+{
+ QueryInstrumentation *copy;
+
+ InstrStopFinalize(&qinstr->instr);
- /* save buffer usage totals at node entry, if needed */
- if (instr->need_bufusage)
- instr->bufusage_start = pgBufferUsage;
+ if (!qinstr->instr.need_bufusage && !qinstr->instr.need_walusage)
+ return qinstr;
- if (instr->need_walusage)
- instr->walusage_start = pgWalUsage;
+ Assert(qinstr->owner != NULL);
+ ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr);
+ qinstr->owner = NULL;
+
+ /*
+ * Copy to the current memory context so the caller doesn't need to
+ * explicitly free the TopMemoryContext allocation.
+ */
+ copy = palloc(sizeof(QueryInstrumentation));
+ memcpy(copy, qinstr, sizeof(QueryInstrumentation));
+ pfree(qinstr);
+ return copy;
}
-/* Exit from a plan node */
+/*
+ * Register a child NodeInstrumentation entry for abort processing.
+ *
+ * On abort, ResOwnerReleaseInstrumentation will walk the parent's list to
+ * recover buffer/WAL data from entries that were never finalized, in order for
+ * aggregate totals to be accurate despite the query erroring out.
+ */
void
-InstrStopNode(Instrumentation *instr, double nTuples)
+InstrQueryRememberNode(QueryInstrumentation *parent, NodeInstrumentation *child)
+{
+ if (child->instr.need_bufusage || child->instr.need_walusage)
+ dlist_push_head(&parent->unfinalized_children, &child->unfinalized_node);
+}
+
+/* start instrumentation during parallel executor startup */
+QueryInstrumentation *
+InstrStartParallelQuery(void)
+{
+ QueryInstrumentation *qinstr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL);
+
+ InstrQueryStart(qinstr);
+ return qinstr;
+}
+
+/* report usage after parallel executor shutdown */
+void
+InstrEndParallelQuery(QueryInstrumentation *qinstr, BufferUsage *bufusage, WalUsage *walusage)
+{
+ qinstr = InstrQueryStopFinalize(qinstr);
+ memcpy(bufusage, &qinstr->instr.bufusage, sizeof(BufferUsage));
+ memcpy(walusage, &qinstr->instr.walusage, sizeof(WalUsage));
+}
+
+/*
+ * Accumulate work done by parallel workers in the leader's stats.
+ *
+ * Note that what gets added here effectively depends on whether per-node
+ * instrumentation is active. If it's active the parallel worker intentionally
+ * skips ExecFinalizeNodeInstrumentation on executor shutdown, because it would
+ * cause double counting. Instead, this only accumulates any extra activity
+ * outside of nodes.
+ *
+ * Otherwise this is responsible for making sure that the complete query
+ * activity is accumulated.
+ */
+void
+InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
+{
+ BufferUsageAdd(&instr_stack.current->bufusage, bufusage);
+ WalUsageAdd(&instr_stack.current->walusage, walusage);
+
+ WalUsageAdd(&pgWalUsage, walusage);
+}
+
+/* Node instrumentation handling */
+
+/* Allocate new node instrumentation structure */
+NodeInstrumentation *
+InstrAllocNode(int instrument_options, bool async_mode)
+{
+ NodeInstrumentation *instr;
+
+ /*
+ * If needed, allocate in a context that supports stack-based
+ * instrumentation abort handling. We can utilize TopTransactionContext
+ * instead of TopMemoryContext here because nodes don't get used for
+ * utility commands that restart transactions, which would require a
+ * context that survives longer (EXPLAIN ANALYZE is fine).
+ */
+ if ((instrument_options & INSTRUMENT_BUFFERS) != 0 || (instrument_options & INSTRUMENT_WAL) != 0)
+ instr = MemoryContextAlloc(TopTransactionContext, sizeof(NodeInstrumentation));
+ else
+ instr = palloc(sizeof(NodeInstrumentation));
+
+ InstrInitNode(instr, instrument_options);
+ instr->async_mode = async_mode;
+
+ return instr;
+}
+
+/* Initialize a pre-allocated instrumentation structure. */
+void
+InstrInitNode(NodeInstrumentation *instr, int instrument_options)
+{
+ memset(instr, 0, sizeof(NodeInstrumentation));
+ InstrInitOptions(&instr->instr, instrument_options);
+}
+
+/* Entry to a plan node. If you modify this, check InstrNodeSetupExecProcNode. */
+void
+InstrStartNode(NodeInstrumentation *instr)
+{
+ InstrStart(&instr->instr);
+}
+
+/*
+ * Updates the node instrumentation time counter.
+ *
+ * Note this is different from InstrStop because total is only updated in
+ * InstrEndLoop. We need the separate counter variable because we need to
+ * calculate start-up time for the first tuple in each cycle, and then
+ * accumulate it together.
+ */
+static inline void
+InstrStopNodeTimer(NodeInstrumentation *instr)
{
- double save_tuplecount = instr->tuplecount;
instr_time endtime;
+ Assert(!INSTR_TIME_IS_ZERO(instr->instr.starttime));
+
+ INSTR_TIME_SET_CURRENT(endtime);
+ INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->instr.starttime);
+ INSTR_TIME_SET_ZERO(instr->instr.starttime);
+
+ /*
+ * Is this the first tuple of this cycle?
+ *
+ * In async mode, if the plan node hadn't emitted any tuples before, this
+	 * might be the first tuple.
+ */
+ if (!instr->running || (instr->async_mode && instr->tuplecount < 1.0))
+ instr->firsttuple = instr->counter;
+}
+
+/* Exit from a plan node. If you modify this, check InstrNodeSetupExecProcNode. */
+void
+InstrStopNode(NodeInstrumentation *instr, double nTuples)
+{
+ if (instr->instr.need_timer)
+ InstrStopNodeTimer(instr);
+
+	/* Only pop the stack; accumulation runs in InstrFinalizeNode */
+ if (instr->instr.need_bufusage || instr->instr.need_walusage)
+ InstrPopStack(&instr->instr);
+
+ instr->running = true;
+
/* count the returned tuples */
instr->tuplecount += nTuples;
+}
- /* let's update the time only if the timer was requested */
- if (instr->need_timer)
- {
- if (INSTR_TIME_IS_ZERO(instr->starttime))
- elog(ERROR, "InstrStopNode called without start");
+NodeInstrumentation *
+InstrFinalizeNode(NodeInstrumentation *instr, Instrumentation *parent)
+{
+ NodeInstrumentation *dst;
- INSTR_TIME_SET_CURRENT(endtime);
- INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime);
+	/* If we didn't use stack-based instrumentation, nothing to be done */
+ if (!instr->instr.need_bufusage && !instr->instr.need_walusage)
+ return instr;
- INSTR_TIME_SET_ZERO(instr->starttime);
- }
+ /* Copy into per-query memory context */
+ dst = palloc(sizeof(NodeInstrumentation));
+ memcpy(dst, instr, sizeof(NodeInstrumentation));
- /* Add delta of buffer usage since entry to node's totals */
- if (instr->need_bufusage)
- BufferUsageAccumDiff(&instr->bufusage,
- &pgBufferUsage, &instr->bufusage_start);
+ /* Accumulate node's buffer/WAL usage to the parent */
+ InstrAccum(parent, &dst->instr);
- if (instr->need_walusage)
- WalUsageAccumDiff(&instr->walusage,
- &pgWalUsage, &instr->walusage_start);
+ /* Unregister from query's unfinalized list before freeing */
+ if (instr->instr.need_bufusage || instr->instr.need_walusage)
+ dlist_delete(&instr->unfinalized_node);
- /* Is this the first tuple of this cycle? */
- if (!instr->running)
- {
- instr->running = true;
- instr->firsttuple = instr->counter;
- }
- else
- {
- /*
- * In async mode, if the plan node hadn't emitted any tuples before,
- * this might be the first tuple
- */
- if (instr->async_mode && save_tuplecount < 1.0)
- instr->firsttuple = instr->counter;
- }
+ pfree(instr);
+
+ return dst;
}
/* Update tuple count */
void
-InstrUpdateTupleCount(Instrumentation *instr, double nTuples)
+InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples)
{
/* count the returned tuples */
instr->tuplecount += nTuples;
@@ -141,24 +436,24 @@ InstrUpdateTupleCount(Instrumentation *instr, double nTuples)
/* Finish a run cycle for a plan node */
void
-InstrEndLoop(Instrumentation *instr)
+InstrEndLoop(NodeInstrumentation *instr)
{
/* Skip if nothing has happened, or already shut down */
if (!instr->running)
return;
- if (!INSTR_TIME_IS_ZERO(instr->starttime))
- elog(ERROR, "InstrEndLoop called on running node");
+	/* Ensure InstrStopNode was called */
+ Assert(INSTR_TIME_IS_ZERO(instr->instr.starttime));
/* Accumulate per-cycle statistics into totals */
INSTR_TIME_ADD(instr->startup, instr->firsttuple);
- INSTR_TIME_ADD(instr->total, instr->counter);
+ INSTR_TIME_ADD(instr->instr.total, instr->counter);
instr->ntuples += instr->tuplecount;
instr->nloops += 1;
/* Reset for next cycle (if any) */
instr->running = false;
- INSTR_TIME_SET_ZERO(instr->starttime);
+ INSTR_TIME_SET_ZERO(instr->instr.starttime);
INSTR_TIME_SET_ZERO(instr->counter);
INSTR_TIME_SET_ZERO(instr->firsttuple);
instr->tuplecount = 0;
@@ -166,7 +461,7 @@ InstrEndLoop(Instrumentation *instr)
/* aggregate instrumentation information */
void
-InstrAggNode(Instrumentation *dst, Instrumentation *add)
+InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add)
{
if (!dst->running && add->running)
{
@@ -181,7 +476,7 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add)
dst->tuplecount += add->tuplecount;
INSTR_TIME_ADD(dst->startup, add->startup);
- INSTR_TIME_ADD(dst->total, add->total);
+ INSTR_TIME_ADD(dst->instr.total, add->instr.total);
dst->ntuples += add->ntuples;
dst->ntuples2 += add->ntuples2;
dst->nloops += add->nloops;
@@ -189,41 +484,175 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add)
dst->nfiltered2 += add->nfiltered2;
/* Add delta of buffer usage since entry to node's totals */
- if (dst->need_bufusage)
- BufferUsageAdd(&dst->bufusage, &add->bufusage);
+ if (dst->instr.need_bufusage)
+ BufferUsageAdd(&dst->instr.bufusage, &add->instr.bufusage);
+
+ if (dst->instr.need_walusage)
+ WalUsageAdd(&dst->instr.walusage, &add->instr.walusage);
+}
+
+/*
+ * Specialized handling of instrumented ExecProcNode
+ *
+ * These functions are equivalent to running ExecProcNodeReal wrapped in
+ * InstrStartNode and InstrStopNode, but avoid the conditionals in the hot path
+ * by checking the instrumentation options when the ExecProcNode pointer gets
+ * first set, and then using a special-purpose function for each. This results
+ * in a more optimized set of compiled instructions.
+ */
+
+#include "executor/tuptable.h"
+#include "nodes/execnodes.h"
+
+/* Simplified pop: restore saved state instead of re-deriving from array */
+static inline void
+InstrPopStackTo(Instrumentation *prev)
+{
+ Assert(instr_stack.stack_size > 0);
+ instr_stack.stack_size--;
+ instr_stack.current = prev;
+}
+
+static TupleTableSlot *
+ExecProcNodeInstrFull(PlanState *node)
+{
+ NodeInstrumentation *instr = node->instrument;
+ Instrumentation *prev = instr_stack.current;
+ TupleTableSlot *result;
+
+ InstrPushStack(&instr->instr);
+ InstrStartTimer(&instr->instr);
+
+ result = node->ExecProcNodeReal(node);
+
+ InstrStopNodeTimer(instr);
+ InstrPopStackTo(prev);
+
+ instr->running = true;
+ if (!TupIsNull(result))
+ instr->tuplecount += 1.0;
- if (dst->need_walusage)
- WalUsageAdd(&dst->walusage, &add->walusage);
+ return result;
+}
+
+static TupleTableSlot *
+ExecProcNodeInstrRowsBuffersWalOnly(PlanState *node)
+{
+ NodeInstrumentation *instr = node->instrument;
+ Instrumentation *prev = instr_stack.current;
+ TupleTableSlot *result;
+
+ InstrPushStack(&instr->instr);
+
+ result = node->ExecProcNodeReal(node);
+
+ InstrPopStackTo(prev);
+
+ instr->running = true;
+ if (!TupIsNull(result))
+ instr->tuplecount += 1.0;
+
+ return result;
+}
+
+static TupleTableSlot *
+ExecProcNodeInstrRowsTimerOnly(PlanState *node)
+{
+ NodeInstrumentation *instr = node->instrument;
+ TupleTableSlot *result;
+
+ InstrStartTimer(&instr->instr);
+
+ result = node->ExecProcNodeReal(node);
+
+ InstrStopNodeTimer(instr);
+
+ instr->running = true;
+ if (!TupIsNull(result))
+ instr->tuplecount += 1.0;
+
+ return result;
+}
+
+static TupleTableSlot *
+ExecProcNodeInstrRowsOnly(PlanState *node)
+{
+ NodeInstrumentation *instr = node->instrument;
+ TupleTableSlot *result;
+
+ result = node->ExecProcNodeReal(node);
+
+ instr->running = true;
+ if (!TupIsNull(result))
+ instr->tuplecount += 1.0;
+
+ return result;
+}
+
+/*
+ * Returns an ExecProcNode wrapper that performs instrumentation calls,
+ * tailored to the instrumentation options enabled for the node.
+ */
+ExecProcNodeMtd
+InstrNodeSetupExecProcNode(NodeInstrumentation *instr)
+{
+ bool need_timer = instr->instr.need_timer;
+ bool need_buf = (instr->instr.need_bufusage ||
+ instr->instr.need_walusage);
+
+ if (need_timer && need_buf)
+ return ExecProcNodeInstrFull;
+ else if (need_buf)
+ return ExecProcNodeInstrRowsBuffersWalOnly;
+ else if (need_timer)
+ return ExecProcNodeInstrRowsTimerOnly;
+ else
+ return ExecProcNodeInstrRowsOnly;
+}
+
+/* Trigger instrumentation handling */
+TriggerInstrumentation *
+InstrAllocTrigger(int n, int instrument_options)
+{
+ TriggerInstrumentation *tginstr = palloc0(n * sizeof(TriggerInstrumentation));
+ int i;
+
+ for (i = 0; i < n; i++)
+ InstrInitOptions(&tginstr[i].instr, instrument_options);
+
+ return tginstr;
}
-/* note current values during parallel executor startup */
void
-InstrStartParallelQuery(void)
+InstrStartTrigger(TriggerInstrumentation *tginstr)
{
- save_pgBufferUsage = pgBufferUsage;
- save_pgWalUsage = pgWalUsage;
+ InstrStart(&tginstr->instr);
}
-/* report usage after parallel executor shutdown */
void
-InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
+InstrStopTrigger(TriggerInstrumentation *tginstr, int firings)
{
- memset(bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage);
- memset(walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage);
+ /*
+ * This trigger may be called again, so we don't finalize instrumentation
+ * here. Accumulation to the parent happens at ExecutorFinish through
+ * ExecFinalizeTriggerInstrumentation.
+ */
+ InstrStop(&tginstr->instr);
+ tginstr->firings += firings;
}
-/* accumulate work done by workers in leader's stats */
void
-InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
+InstrAccum(Instrumentation *dst, Instrumentation *add)
{
- BufferUsageAdd(&pgBufferUsage, bufusage);
- WalUsageAdd(&pgWalUsage, walusage);
+ Assert(dst != NULL);
+ Assert(add != NULL);
+
+ BufferUsageAdd(&dst->bufusage, &add->bufusage);
+ WalUsageAdd(&dst->walusage, &add->walusage);
}
/* dst += add */
-static void
+void
BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
{
dst->shared_blks_hit += add->shared_blks_hit;
@@ -244,39 +673,9 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time);
}
-/* dst += add - sub */
+/* dst += add */
void
-BufferUsageAccumDiff(BufferUsage *dst,
- const BufferUsage *add,
- const BufferUsage *sub)
-{
- dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit;
- dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read;
- dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied;
- dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written;
- dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit;
- dst->local_blks_read += add->local_blks_read - sub->local_blks_read;
- dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied;
- dst->local_blks_written += add->local_blks_written - sub->local_blks_written;
- dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read;
- dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written;
- INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time,
- add->shared_blk_read_time, sub->shared_blk_read_time);
- INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time,
- add->shared_blk_write_time, sub->shared_blk_write_time);
- INSTR_TIME_ACCUM_DIFF(dst->local_blk_read_time,
- add->local_blk_read_time, sub->local_blk_read_time);
- INSTR_TIME_ACCUM_DIFF(dst->local_blk_write_time,
- add->local_blk_write_time, sub->local_blk_write_time);
- INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time,
- add->temp_blk_read_time, sub->temp_blk_read_time);
- INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time,
- add->temp_blk_write_time, sub->temp_blk_write_time);
-}
-
-/* helper functions for WAL usage accumulation */
-static void
-WalUsageAdd(WalUsage *dst, WalUsage *add)
+WalUsageAdd(WalUsage *dst, const WalUsage *add)
{
dst->wal_bytes += add->wal_bytes;
dst->wal_records += add->wal_records;
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 4513b1f7a9017..3bd3539216882 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -83,7 +83,9 @@ IndexNext(IndexScanState *node)
ExprContext *econtext;
ScanDirection direction;
IndexScanDesc scandesc;
+ ItemPointer tid;
TupleTableSlot *slot;
+ bool found;
/*
* extract necessary information from index scan node
@@ -128,8 +130,24 @@ IndexNext(IndexScanState *node)
/*
* ok, now that we have what we need, fetch the next tuple.
*/
- while (index_getnext_slot(scandesc, direction, slot))
+ while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
{
+ if (node->iss_InstrumentTable)
+ InstrPushStack(&node->iss_InstrumentTable->instr);
+
+ for (;;)
+ {
+ found = index_fetch_heap(scandesc, slot);
+ if (found || !scandesc->xs_heap_continue)
+ break;
+ }
+
+ if (node->iss_InstrumentTable)
+ InstrPopStack(&node->iss_InstrumentTable->instr);
+
+ if (unlikely(!found))
+ continue;
+
CHECK_FOR_INTERRUPTS();
/*
@@ -257,36 +275,67 @@ IndexNextWithReorder(IndexScanState *node)
}
/*
- * Fetch next tuple from the index.
+ * Fetch next valid tuple from the index.
*/
-next_indextuple:
- if (!index_getnext_slot(scandesc, ForwardScanDirection, slot))
+ for (;;)
{
+ ItemPointer tid;
+ bool found;
+
+ /* Time to fetch the next TID from the index */
+ tid = index_getnext_tid(scandesc, ForwardScanDirection);
+
+ /* If we're out of index entries, we're done */
+ if (tid == NULL)
+ {
+ /*
+ * No more tuples from the index. But we still need to drain
+ * any remaining tuples from the queue before we're done.
+ */
+ node->iss_ReachedEnd = true;
+ break;
+ }
+
+ Assert(ItemPointerEquals(tid, &scandesc->xs_heaptid));
+
+ if (node->iss_InstrumentTable)
+ InstrPushStack(&node->iss_InstrumentTable->instr);
+
+ for (;;)
+ {
+ found = index_fetch_heap(scandesc, slot);
+ if (found || !scandesc->xs_heap_continue)
+ break;
+ }
+
+ if (node->iss_InstrumentTable)
+ InstrPopStack(&node->iss_InstrumentTable->instr);
+
/*
- * No more tuples from the index. But we still need to drain any
- * remaining tuples from the queue before we're done.
+ * If the index was lossy, we have to recheck the index quals and
+ * ORDER BY expressions using the fetched tuple.
*/
- node->iss_ReachedEnd = true;
- continue;
- }
-
- /*
- * If the index was lossy, we have to recheck the index quals and
- * ORDER BY expressions using the fetched tuple.
- */
- if (scandesc->xs_recheck)
- {
- econtext->ecxt_scantuple = slot;
- if (!ExecQualAndReset(node->indexqualorig, econtext))
+ if (found && scandesc->xs_recheck)
{
- /* Fails recheck, so drop it and loop back for another */
- InstrCountFiltered2(node, 1);
- /* allow this loop to be cancellable */
- CHECK_FOR_INTERRUPTS();
- goto next_indextuple;
+ econtext->ecxt_scantuple = slot;
+ if (!ExecQualAndReset(node->indexqualorig, econtext))
+ {
+ /* Fails recheck, so drop it and loop back for another */
+ InstrCountFiltered2(node, 1);
+ /* allow this loop to be cancellable */
+ CHECK_FOR_INTERRUPTS();
+ continue;
+ }
}
+
+ if (found)
+ break;
}
+	/* No more index entries; re-run to clear the reorder queue */
+ if (node->iss_ReachedEnd)
+ continue;
+
if (scandesc->xs_recheckorderby)
{
econtext->ecxt_scantuple = slot;
@@ -812,6 +861,11 @@ ExecEndIndexScan(IndexScanState *node)
* which will have a new IndexOnlyScanState and zeroed stats.
*/
winstrument->nsearches += node->iss_Instrument.nsearches;
+ if (node->iss_InstrumentTable)
+ {
+ BufferUsageAdd(&winstrument->worker_table_bufusage, &node->iss_InstrumentTable->instr.bufusage);
+ WalUsageAdd(&winstrument->worker_table_walusage, &node->iss_InstrumentTable->instr.walusage);
+ }
}
/*
@@ -1819,4 +1873,14 @@ ExecIndexScanRetrieveInstrumentation(IndexScanState *node)
SharedInfo->num_workers * sizeof(IndexScanInstrumentation);
node->iss_SharedInfo = palloc(size);
memcpy(node->iss_SharedInfo, SharedInfo, size);
+
+ /* Aggregate workers' table buffer/WAL usage into leader's entry */
+ if (node->iss_InstrumentTable)
+ for (int i = 0; i < node->iss_SharedInfo->num_workers; i++)
+ {
+ BufferUsageAdd(&node->iss_InstrumentTable->instr.bufusage,
+ &node->iss_SharedInfo->winstrument[i].worker_table_bufusage);
+ WalUsageAdd(&node->iss_InstrumentTable->instr.walusage,
+ &node->iss_SharedInfo->winstrument[i].worker_table_walusage);
+ }
}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 00bc609529a5b..41a0baa344991 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -835,7 +835,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
{
PinLocalBuffer(bufHdr, true);
- pgBufferUsage.local_blks_hit++;
+ INSTR_BUFUSAGE_INCR(local_blks_hit);
return true;
}
@@ -856,7 +856,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
{
if (BufferTagsEqual(&tag, &bufHdr->tag))
{
- pgBufferUsage.shared_blks_hit++;
+ INSTR_BUFUSAGE_INCR(shared_blks_hit);
return true;
}
UnpinBuffer(bufHdr);
@@ -1257,21 +1257,21 @@ PinBufferForBlock(Relation rel,
{
bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
if (*foundPtr)
- pgBufferUsage.local_blks_hit++;
+ INSTR_BUFUSAGE_INCR(local_blks_hit);
}
else
{
bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
strategy, foundPtr, io_context);
if (*foundPtr)
- pgBufferUsage.shared_blks_hit++;
+ INSTR_BUFUSAGE_INCR(shared_blks_hit);
}
if (rel)
{
/*
- * While pgBufferUsage's "read" counter isn't bumped unless we reach
- * WaitReadBuffers() (so, not for hits, and not for buffers that are
- * zeroed instead), the per-relation stats always count them.
+ * While the current buffer usage "read" counter isn't bumped unless
+ * we reach WaitReadBuffers() (so, not for hits, and not for buffers
+ * that are zeroed instead), the per-relation stats always count them.
*/
pgstat_count_buffer_read(rel);
if (*foundPtr)
@@ -1998,9 +1998,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
true);
if (persistence == RELPERSISTENCE_TEMP)
- pgBufferUsage.local_blks_hit += 1;
+ INSTR_BUFUSAGE_INCR(local_blks_hit);
else
- pgBufferUsage.shared_blks_hit += 1;
+ INSTR_BUFUSAGE_INCR(shared_blks_hit);
if (operation->rel)
pgstat_count_buffer_hit(operation->rel);
@@ -2068,9 +2068,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
io_start, 1, io_buffers_len * BLCKSZ);
if (persistence == RELPERSISTENCE_TEMP)
- pgBufferUsage.local_blks_read += io_buffers_len;
+ INSTR_BUFUSAGE_ADD(local_blks_read, io_buffers_len);
else
- pgBufferUsage.shared_blks_read += io_buffers_len;
+ INSTR_BUFUSAGE_ADD(shared_blks_read, io_buffers_len);
/*
* Track vacuum cost when issuing IO, not after waiting for it.
@@ -2959,7 +2959,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
}
- pgBufferUsage.shared_blks_written += extend_by;
+ INSTR_BUFUSAGE_ADD(shared_blks_written, extend_by);
*extended_by = extend_by;
@@ -3105,7 +3105,7 @@ MarkBufferDirty(Buffer buffer)
*/
if (!(old_buf_state & BM_DIRTY))
{
- pgBufferUsage.shared_blks_dirtied++;
+ INSTR_BUFUSAGE_INCR(shared_blks_dirtied);
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
}
@@ -4520,7 +4520,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
IOOP_WRITE, io_start, 1, BLCKSZ);
- pgBufferUsage.shared_blks_written++;
+ INSTR_BUFUSAGE_INCR(shared_blks_written);
/*
* Mark the buffer as clean and end the BM_IO_IN_PROGRESS state.
@@ -5663,7 +5663,7 @@ MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate,
UnlockBufHdr(bufHdr);
}
- pgBufferUsage.shared_blks_dirtied++;
+ INSTR_BUFUSAGE_INCR(shared_blks_dirtied);
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
}
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 404c6bccbdd7e..8845b0aeed62d 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -217,7 +217,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
/* Mark not-dirty */
TerminateLocalBufferIO(bufHdr, true, 0, false);
- pgBufferUsage.local_blks_written++;
+ INSTR_BUFUSAGE_INCR(local_blks_written);
}
static Buffer
@@ -478,7 +478,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr,
*extended_by = extend_by;
- pgBufferUsage.local_blks_written += extend_by;
+ INSTR_BUFUSAGE_ADD(local_blks_written, extend_by);
return first_block;
}
@@ -509,7 +509,7 @@ MarkLocalBufferDirty(Buffer buffer)
buf_state = pg_atomic_read_u64(&bufHdr->state);
if (!(buf_state & BM_DIRTY))
- pgBufferUsage.local_blks_dirtied++;
+ INSTR_BUFUSAGE_INCR(local_blks_dirtied);
buf_state |= BM_DIRTY;
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index c4afe4d368a34..8b501dfcadd02 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -475,13 +475,13 @@ BufFileLoadBuffer(BufFile *file)
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
- INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_read_time, io_time, io_start);
+ INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_read_time, io_time, io_start);
}
/* we choose not to advance curOffset here */
if (file->nbytes > 0)
- pgBufferUsage.temp_blks_read++;
+ INSTR_BUFUSAGE_INCR(temp_blks_read);
}
/*
@@ -549,13 +549,13 @@ BufFileDumpBuffer(BufFile *file)
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
- INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_write_time, io_time, io_start);
+ INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_write_time, io_time, io_start);
}
file->curOffset += bytestowrite;
wpos += bytestowrite;
- pgBufferUsage.temp_blks_written++;
+ INSTR_BUFUSAGE_INCR(temp_blks_written);
}
file->dirty = false;
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index 28de24538dced..60400f0c81f42 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -114,9 +114,9 @@ pgstat_prepare_io_time(bool track_io_guc)
* pg_stat_database only counts block read and write times, these are done for
* IOOP_READ, IOOP_WRITE and IOOP_EXTEND.
*
- * pgBufferUsage is used for EXPLAIN. pgBufferUsage has write and read stats
- * for shared, local and temporary blocks. pg_stat_io does not track the
- * activity of temporary blocks, so these are ignored here.
+ * Executor instrumentation is used for EXPLAIN. Buffer usage tracked there has
+ * write and read stats for shared, local and temporary blocks. pg_stat_io
+ * does not track the activity of temporary blocks, so these are ignored here.
*/
void
pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
@@ -135,17 +135,17 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
{
pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
if (io_object == IOOBJECT_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(shared_blk_write_time, io_time);
else if (io_object == IOOBJECT_TEMP_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(local_blk_write_time, io_time);
}
else if (io_op == IOOP_READ)
{
pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
if (io_object == IOOBJECT_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(shared_blk_read_time, io_time);
else if (io_object == IOOBJECT_TEMP_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(local_blk_read_time, io_time);
}
}
diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h
index d3a572428449d..340029a203422 100644
--- a/src/include/executor/execdesc.h
+++ b/src/include/executor/execdesc.h
@@ -51,8 +51,8 @@ typedef struct QueryDesc
/* This field is set by ExecutePlan */
bool already_executed; /* true if previously executed */
- /* This is always set NULL by the core system, but plugins can change it */
- struct Instrumentation *totaltime; /* total time spent in ExecutorRun */
+	/* This field is set by ExecutorRun or by plugins */
+ struct QueryInstrumentation *totaltime; /* total time spent in ExecutorRun */
} QueryDesc;
/* in pquery.c */
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 82c442d23f811..b902cfcbe6e13 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -300,6 +300,8 @@ extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function);
extern Node *MultiExecProcNode(PlanState *node);
extern void ExecEndNode(PlanState *node);
extern void ExecShutdownNode(PlanState *node);
+extern void ExecRememberNodeInstrumentation(PlanState *node, QueryInstrumentation *parent);
+extern void ExecFinalizeNodeInstrumentation(PlanState *node);
extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node);
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 9759f3ea5d8d9..bce58acb11bd8 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -13,6 +13,7 @@
#ifndef INSTRUMENT_H
#define INSTRUMENT_H
+#include "lib/ilist.h"
#include "portability/instr_time.h"
@@ -67,56 +68,260 @@ typedef enum InstrumentOption
INSTRUMENT_ALL = PG_INT32_MAX
} InstrumentOption;
+/*
+ * Instrumentation base class for capturing time and WAL/buffer usage
+ *
+ * If used directly:
+ * - Allocate on the stack and zero initialize the struct
+ * - Call InstrInitOptions to set instrumentation options
+ * - Call InstrStart before the activity you want to measure
+ * - Call InstrStop / InstrStopFinalize after the activity to capture totals
+ *
+ * InstrStart/InstrStop may be called multiple times. The last stop call must
+ * be to InstrStopFinalize to ensure parent stack entries get the accumulated
+ * totals. If there is risk of transaction aborts you must call
+ * InstrStopFinalize in a PG_TRY/PG_FINALLY block to avoid corrupting the
+ * instrumentation stack.
+ *
+ * In a query context use QueryInstrumentation instead, which handles aborts
+ * using the resource owner logic.
+ */
typedef struct Instrumentation
{
- /* Parameters set at node creation: */
+ /* Parameters set at creation: */
bool need_timer; /* true if we need timer data */
bool need_bufusage; /* true if we need buffer usage data */
bool need_walusage; /* true if we need WAL usage data */
+ /* Internal state keeping: */
+ instr_time starttime; /* start time of last InstrStart */
+ /* Accumulated statistics: */
+ instr_time total; /* total runtime */
+ BufferUsage bufusage; /* total buffer usage */
+ WalUsage walusage; /* total WAL usage */
+} Instrumentation;
+
+/*
+ * Query-related instrumentation tracking.
+ *
+ * Usage:
+ * - Allocate on the heap using InstrQueryAlloc (required for abort handling)
+ * - Call InstrQueryStart before the activity you want to measure
+ * - Call InstrQueryStop / InstrQueryStopFinalize afterwards to capture totals
+ *
+ * InstrQueryStart/InstrQueryStop may be called multiple times. The last stop
+ * call must be to InstrQueryStopFinalize to ensure parent stack entries get
+ * the accumulated totals.
+ *
+ * Uses the resource owner mechanism for handling aborts; as such, the caller
+ * *must* not exit out of the top level transaction after having called
+ * InstrQueryStart, without first calling InstrQueryStop. In the case of a
+ * transaction abort, logic equivalent to InstrQueryStop will be called
+ * automatically.
+ */
+struct ResourceOwnerData;
+typedef struct QueryInstrumentation
+{
+ Instrumentation instr;
+
+ /* Resource owner used for cleanup for aborts between InstrStart/InstrStop */
+ struct ResourceOwnerData *owner;
+
+ /*
+ * NodeInstrumentation child entries that need to be cleaned up on abort,
+ * since they are not registered as a resource owner themselves.
+ */
+ dlist_head unfinalized_children; /* head of unfinalized children list */
+} QueryInstrumentation;
+
+/*
+ * Specialized instrumentation for per-node execution statistics
+ *
+ * Relies on an outer QueryInstrumentation having been set up to handle the
+ * stack used for WAL/buffer usage statistics, and relies on it for managing
+ * aborts. Solely intended for the executor and anyone reporting about its
+ * activities (e.g. EXPLAIN ANALYZE).
+ */
+typedef struct NodeInstrumentation
+{
+ Instrumentation instr;
+ /* Parameters set at node creation: */
bool async_mode; /* true if node is in async mode */
/* Info about current plan cycle: */
bool running; /* true if we've completed first tuple */
- instr_time starttime; /* start time of current iteration of node */
instr_time counter; /* accumulated runtime for this node */
instr_time firsttuple; /* time for first tuple of this cycle */
double tuplecount; /* # of tuples emitted so far this cycle */
- BufferUsage bufusage_start; /* buffer usage at start */
- WalUsage walusage_start; /* WAL usage at start */
/* Accumulated statistics across all completed cycles: */
instr_time startup; /* total startup time */
- instr_time total; /* total time */
double ntuples; /* total tuples produced */
double ntuples2; /* secondary node-specific tuple counter */
double nloops; /* # of run cycles for this node */
double nfiltered1; /* # of tuples removed by scanqual or joinqual */
double nfiltered2; /* # of tuples removed by "other" quals */
- BufferUsage bufusage; /* total buffer usage */
- WalUsage walusage; /* total WAL usage */
-} Instrumentation;
-typedef struct WorkerInstrumentation
+ /* Abort handling */
+ dlist_node unfinalized_node; /* node in parent's unfinalized list */
+} NodeInstrumentation;
+
+/*
+ * Care must be taken with any pointers contained within this struct, as this
+ * gets copied across processes during parallel query execution.
+ */
+typedef struct WorkerNodeInstrumentation
{
int num_workers; /* # of structures that follow */
- Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER];
-} WorkerInstrumentation;
+ NodeInstrumentation instrument[FLEXIBLE_ARRAY_MEMBER];
+} WorkerNodeInstrumentation;
+
+typedef struct TriggerInstrumentation
+{
+ Instrumentation instr;
+ int firings; /* number of times the instrumented trigger
+ * was fired */
+} TriggerInstrumentation;
+
+/*
+ * Dynamic array-based stack for tracking current WAL/buffer usage context.
+ *
+ * When the stack is empty, 'current' points to instr_top which accumulates
+ * session-level totals.
+ */
+typedef struct InstrStackState
+{
+ int stack_space; /* allocated capacity of entries array */
+ int stack_size; /* current number of entries */
+
+ Instrumentation **entries; /* dynamic array of pointers */
+ Instrumentation *current; /* top of stack, or &instr_top when empty */
+} InstrStackState;
-extern PGDLLIMPORT BufferUsage pgBufferUsage;
extern PGDLLIMPORT WalUsage pgWalUsage;
-extern Instrumentation *InstrAlloc(int n, int instrument_options,
- bool async_mode);
-extern void InstrInit(Instrumentation *instr, int instrument_options);
-extern void InstrStartNode(Instrumentation *instr);
-extern void InstrStopNode(Instrumentation *instr, double nTuples);
-extern void InstrUpdateTupleCount(Instrumentation *instr, double nTuples);
-extern void InstrEndLoop(Instrumentation *instr);
-extern void InstrAggNode(Instrumentation *dst, Instrumentation *add);
-extern void InstrStartParallelQuery(void);
-extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage);
+/*
+ * The top instrumentation represents a running total of the current backend's
+ * WAL/buffer usage information. This is not updated immediately, but
+ * rather when the current stack entry gets accumulated, which typically
+ * happens at query end.
+ *
+ * Care must be taken when utilizing this in the parallel worker context:
+ * Parallel workers will report back their instrumentation to the caller,
+ * and this gets added to the caller's stack. If this were to be used in the
+ * shared memory stats infrastructure it would need to be skipped on parallel
+ * workers to avoid double counting.
+ */
+extern PGDLLIMPORT Instrumentation instr_top;
+
+/*
+ * The instrumentation stack state. The 'current' field points to the
+ * currently active stack entry that is getting updated as activity happens,
+ * and will be accumulated to parent stacks when it gets finalized by
+ * InstrStop (for non-executor use cases), ExecFinalizeNodeInstrumentation
+ * (executor finish) or ResOwnerReleaseInstrumentation on abort.
+ */
+extern PGDLLIMPORT InstrStackState instr_stack;
+
+extern void InstrStackGrow(void);
+
+/*
+ * Pushes the stack so that all WAL/buffer usage updates go to the passed in
+ * instrumentation entry.
+ *
+ * Any caller using this directly must manage the passed in entry and call
+ * InstrPopStack on its own again, typically by using a PG_FINALLY block to
+ * ensure the stack gets reset via InstrPopStack on abort. Use InstrStart
+ * instead when you want automatic handling of abort cases using the resource
+ * owner infrastructure.
+ */
+static inline void
+InstrPushStack(Instrumentation *instr)
+{
+ if (unlikely(instr_stack.stack_size == instr_stack.stack_space))
+ InstrStackGrow();
+
+ instr_stack.entries[instr_stack.stack_size++] = instr;
+ instr_stack.current = instr;
+}
+
+/*
+ * Pops the stack entry back to the previous one that was effective at
+ * InstrPushStack.
+ *
+ * Callers must ensure that no intermediate stack entries are skipped, to
+ * handle aborts correctly. If you're thinking of calling this in a PG_FINALLY
+ * block, consider instead using InstrStart + InstrStopFinalize which can skip
+ * intermediate stack entries.
+ */
+static inline void
+InstrPopStack(Instrumentation *instr)
+{
+ Assert(instr_stack.stack_size > 0);
+ Assert(instr_stack.entries[instr_stack.stack_size - 1] == instr);
+ instr_stack.stack_size--;
+ instr_stack.current = instr_stack.stack_size > 0
+ ? instr_stack.entries[instr_stack.stack_size - 1]
+ : &instr_top;
+}
+
+extern void InstrInitOptions(Instrumentation *instr, int instrument_options);
+extern void InstrStart(Instrumentation *instr);
+extern void InstrStop(Instrumentation *instr);
+extern void InstrStopFinalize(Instrumentation *instr);
+extern void InstrAccum(Instrumentation *dst, Instrumentation *add);
+
+extern QueryInstrumentation *InstrQueryAlloc(int instrument_options);
+extern void InstrQueryStart(QueryInstrumentation *instr);
+extern void InstrQueryStop(QueryInstrumentation *instr);
+extern QueryInstrumentation *InstrQueryStopFinalize(QueryInstrumentation *instr);
+extern void InstrQueryRememberNode(QueryInstrumentation *parent, NodeInstrumentation *instr);
+
+pg_nodiscard extern QueryInstrumentation *InstrStartParallelQuery(void);
+extern void InstrEndParallelQuery(QueryInstrumentation *qinstr, BufferUsage *bufusage, WalUsage *walusage);
extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage);
-extern void BufferUsageAccumDiff(BufferUsage *dst,
- const BufferUsage *add, const BufferUsage *sub);
+
+extern NodeInstrumentation *InstrAllocNode(int instrument_options,
+ bool async_mode);
+extern void InstrInitNode(NodeInstrumentation *instr, int instrument_options);
+extern void InstrStartNode(NodeInstrumentation *instr);
+extern void InstrStopNode(NodeInstrumentation *instr, double nTuples);
+extern NodeInstrumentation *InstrFinalizeNode(NodeInstrumentation *instr, Instrumentation *parent);
+extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples);
+extern void InstrEndLoop(NodeInstrumentation *instr);
+extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add);
+
+typedef struct TupleTableSlot TupleTableSlot;
+typedef struct PlanState PlanState;
+typedef TupleTableSlot *(*ExecProcNodeMtd) (PlanState *pstate);
+extern ExecProcNodeMtd InstrNodeSetupExecProcNode(NodeInstrumentation *instr);
+
+extern TriggerInstrumentation *InstrAllocTrigger(int n, int instrument_options);
+extern void InstrStartTrigger(TriggerInstrumentation *tginstr);
+extern void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings);
+
+extern void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
+extern void WalUsageAdd(WalUsage *dst, const WalUsage *add);
extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add,
const WalUsage *sub);
+#define INSTR_BUFUSAGE_INCR(fld) do { \
+ instr_stack.current->bufusage.fld++; \
+ } while(0)
+#define INSTR_BUFUSAGE_ADD(fld,val) do { \
+ instr_stack.current->bufusage.fld += val; \
+ } while(0)
+#define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \
+ INSTR_TIME_ADD(instr_stack.current->bufusage.fld, val); \
+ } while (0)
+#define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \
+ INSTR_TIME_ACCUM_DIFF(instr_stack.current->bufusage.fld, endval, startval); \
+ } while (0)
+
+#define INSTR_WALUSAGE_INCR(fld) do { \
+ pgWalUsage.fld++; \
+ instr_stack.current->walusage.fld++; \
+ } while(0)
+#define INSTR_WALUSAGE_ADD(fld,val) do { \
+ pgWalUsage.fld += val; \
+ instr_stack.current->walusage.fld += val; \
+ } while(0)
+
#endif /* INSTRUMENT_H */
diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h
index 8847d7f94fa57..170b6143ef602 100644
--- a/src/include/executor/instrument_node.h
+++ b/src/include/executor/instrument_node.h
@@ -18,6 +18,8 @@
#ifndef INSTRUMENT_NODE_H
#define INSTRUMENT_NODE_H
+#include "executor/instrument.h"
+
/* ---------------------
* Instrumentation information for aggregate function execution
@@ -48,6 +50,10 @@ typedef struct IndexScanInstrumentation
{
/* Index search count (incremented with pgstat_count_index_scan call) */
uint64 nsearches;
+
+ /* Used for passing iss_InstrumentTableStack data from parallel workers */
+ BufferUsage worker_table_bufusage;
+ WalUsage worker_table_walusage;
} IndexScanInstrumentation;
/*
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 63c067d5aae61..c778641c13dec 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -524,7 +524,7 @@ typedef struct ResultRelInfo
ExprState **ri_TrigWhenExprs;
/* optional runtime measurements for triggers */
- Instrumentation *ri_TrigInstrument;
+ TriggerInstrumentation *ri_TrigInstrument;
/* On-demand created slots for triggers / returning processing */
TupleTableSlot *ri_ReturningSlot; /* for trigger output tuples */
@@ -1175,8 +1175,10 @@ typedef struct PlanState
ExecProcNodeMtd ExecProcNodeReal; /* actual function, if above is a
* wrapper */
- Instrumentation *instrument; /* Optional runtime stats for this node */
- WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */
+ NodeInstrumentation *instrument; /* Optional runtime stats for this
+ * node */
+ WorkerNodeInstrumentation *worker_instrument; /* per-worker
+ * instrumentation */
/* Per-worker JIT instrumentation */
struct SharedJitInstrumentation *worker_jit_instrument;
@@ -1726,6 +1728,13 @@ typedef struct IndexScanState
IndexScanInstrumentation iss_Instrument;
SharedIndexScanInstrumentation *iss_SharedInfo;
+ /*
+ * Instrumentation utilized for tracking table access. This is separate
+ * from iss_Instrument since it needs to be allocated in the right context
+ * and IndexScanInstrumentation shouldn't contain pointers.
+ */
+ NodeInstrumentation *iss_InstrumentTable;
+
/* These are needed for re-checking ORDER BY expr ordering */
pairingheap *iss_ReorderQueue;
bool iss_ReachedEnd;
diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h
index eb6033b4fdb65..5463bc921f06e 100644
--- a/src/include/utils/resowner.h
+++ b/src/include/utils/resowner.h
@@ -75,6 +75,7 @@ typedef uint32 ResourceReleasePriority;
#define RELEASE_PRIO_SNAPSHOT_REFS 500
#define RELEASE_PRIO_FILES 600
#define RELEASE_PRIO_WAITEVENTSETS 700
+#define RELEASE_PRIO_INSTRUMENTATION 800
/* 0 is considered invalid */
#define RELEASE_PRIO_FIRST 1
diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out
index 7c1f26b182cb0..e28e754369314 100644
--- a/src/test/regress/expected/explain.out
+++ b/src/test/regress/expected/explain.out
@@ -822,3 +822,200 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove
(9 rows)
reset work_mem;
+-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL
+-- EXCEPTION blocks, even after subtransaction rollback.
+CREATE TEMP TABLE explain_exc_tab (a int, b char(20));
+INSERT INTO explain_exc_tab VALUES (0, 'zzz');
+CREATE FUNCTION explain_exc_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT explain_exc_func()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+SELECT check_explain_exception_buffers() AS exception_buffers_visible;
+ exception_buffers_visible
+---------------------------
+ t
+(1 row)
+
+-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation)
+CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT check_explain_exception_buffers()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible;
+ exception_buffers_nested_visible
+----------------------------------
+ t
+(1 row)
+
+DROP FUNCTION check_explain_exception_buffers_nested;
+DROP FUNCTION check_explain_exception_buffers;
+DROP FUNCTION explain_exc_func;
+DROP TABLE explain_exc_tab;
+-- Cursor instrumentation test.
+-- Verify that buffer usage is correctly tracked through cursor execution paths.
+-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun
+-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down
+-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor
+-- scan should be correctly accumulated.
+CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1;
+CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$
+DECLARE
+ plan_json json;
+ node json;
+ direct_buf int;
+ noscroll_buf int;
+ scroll_buf int;
+BEGIN
+ -- Direct scan: get leaf Seq Scan node buffers as baseline
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT * FROM cursor_buf_test' INTO plan_json;
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+ direct_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_noscroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ noscroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Scrollable cursor path: ExecShutdownNode is skipped
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_scroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ scroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Both cursor paths should report buffer counts about as high as
+ -- the direct scan (same data plus minor catalog overhead), and not
+ -- double-counted (< 2x the direct scan)
+ RETURN QUERY SELECT
+ (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2),
+ (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2);
+END;
+$$ LANGUAGE plpgsql;
+SELECT * FROM check_cursor_explain_buffers();
+ noscroll_ok | scroll_ok
+-------------+-----------
+ t | t
+(1 row)
+
+DROP FUNCTION check_cursor_explain_buffers;
+DROP FUNCTION cursor_noscroll_scan;
+DROP FUNCTION cursor_scroll_scan;
+DROP TABLE cursor_buf_test;
+-- Parallel query buffer double-counting test.
+--
+-- Compares serial Seq Scan buffers vs parallel Seq Scan buffers.
+-- They scan the same table so the buffer count should be similar.
+-- Double-counting would make the parallel count ~2x larger.
+CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$
+DECLARE
+ plan_json json;
+ serial_buffers int;
+ parallel_buffers int;
+ node json;
+BEGIN
+ -- Serial --
+ SET LOCAL max_parallel_workers_per_gather = 0;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ serial_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ -- Parallel --
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ parallel_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1));
+END;
+$$ LANGUAGE plpgsql;
+SELECT * FROM check_parallel_explain_buffers();
+ ratio
+-------
+ 1
+(1 row)
+
+DROP FUNCTION check_parallel_explain_buffers;
diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql
index ebdab42604beb..cf5c6335a1922 100644
--- a/src/test/regress/sql/explain.sql
+++ b/src/test/regress/sql/explain.sql
@@ -188,3 +188,197 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove
-- Test tuplestore storage usage in Window aggregate (memory and disk case, final result is disk)
select explain_filter('explain (analyze,buffers off,costs off) select sum(n) over(partition by m) from (SELECT n < 3 as m, n from generate_series(1,2500) a(n))');
reset work_mem;
+
+-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL
+-- EXCEPTION blocks, even after subtransaction rollback.
+CREATE TEMP TABLE explain_exc_tab (a int, b char(20));
+INSERT INTO explain_exc_tab VALUES (0, 'zzz');
+
+CREATE FUNCTION explain_exc_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT explain_exc_func()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT check_explain_exception_buffers() AS exception_buffers_visible;
+
+-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation)
+CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT check_explain_exception_buffers()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible;
+
+DROP FUNCTION check_explain_exception_buffers_nested;
+DROP FUNCTION check_explain_exception_buffers;
+DROP FUNCTION explain_exc_func;
+DROP TABLE explain_exc_tab;
+
+-- Cursor instrumentation test.
+-- Verify that buffer usage is correctly tracked through cursor execution paths.
+-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun
+-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down
+-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor
+-- scan should be correctly accumulated.
+
+CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1;
+
+CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$
+DECLARE
+ plan_json json;
+ node json;
+ direct_buf int;
+ noscroll_buf int;
+ scroll_buf int;
+BEGIN
+ -- Direct scan: get leaf Seq Scan node buffers as baseline
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT * FROM cursor_buf_test' INTO plan_json;
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+ direct_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_noscroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ noscroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Scrollable cursor path: ExecShutdownNode is skipped
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_scroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ scroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Both cursor paths should report buffer counts about as high as
+ -- the direct scan (same data plus minor catalog overhead), and not
+ -- double-counted (< 2x the direct scan)
+ RETURN QUERY SELECT
+ (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2),
+ (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2);
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT * FROM check_cursor_explain_buffers();
+
+DROP FUNCTION check_cursor_explain_buffers;
+DROP FUNCTION cursor_noscroll_scan;
+DROP FUNCTION cursor_scroll_scan;
+DROP TABLE cursor_buf_test;
+
+-- Parallel query buffer double-counting test.
+--
+-- Compares serial Seq Scan buffers vs parallel Seq Scan buffers.
+-- They scan the same table so the buffer count should be similar.
+-- Double-counting would make the parallel count ~2x larger.
+CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$
+DECLARE
+ plan_json json;
+ serial_buffers int;
+ parallel_buffers int;
+ node json;
+BEGIN
+ -- Serial --
+ SET LOCAL max_parallel_workers_per_gather = 0;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ serial_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ -- Parallel --
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ parallel_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1));
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT * FROM check_parallel_explain_buffers();
+
+DROP FUNCTION check_parallel_explain_buffers;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index ec8513d90b5fe..1c9be944c5a24 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1320,6 +1320,7 @@ InjectionPointSharedState
InjectionPointsCtl
InlineCodeBlock
InsertStmt
+InstrStackState
Instrumentation
Int128AggState
Int8TransTypeData
@@ -1786,6 +1787,7 @@ NextSampleBlock_function
NextSampleTuple_function
NextValueExpr
Node
+NodeInstrumentation
NodeTag
NonEmptyRange
NoneCompressorState
@@ -2430,6 +2432,7 @@ QueryCompletion
QueryDesc
QueryEnvironment
QueryInfo
+QueryInstrumentation
QueryItem
QueryItemType
QueryMode
@@ -3163,6 +3166,7 @@ TriggerDesc
TriggerEvent
TriggerFlags
TriggerInfo
+TriggerInstrumentation
TriggerTransition
TruncateStmt
TsmRoutine
@@ -3383,9 +3387,9 @@ WorkTableScan
WorkTableScanState
WorkerInfo
WorkerInfoData
-WorkerInstrumentation
WorkerJobDumpPtrType
WorkerJobRestorePtrType
+WorkerNodeInstrumentation
Working_State
WriteBufPtrType
WriteBytePtrType