diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c
index e856cd35a6f0f..4be81489ff4fb 100644
--- a/contrib/auto_explain/auto_explain.c
+++ b/contrib/auto_explain/auto_explain.c
@@ -305,19 +305,9 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
if (auto_explain_enabled())
{
- /*
- * Set up to track total elapsed time in ExecutorRun. Make sure the
- * space is allocated in the per-query context so it will go away at
- * ExecutorEnd.
- */
+ /* Set up to track total elapsed time in ExecutorRun. */
if (queryDesc->totaltime == NULL)
- {
- MemoryContext oldcxt;
-
- oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
- queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
- MemoryContextSwitchTo(oldcxt);
- }
+ queryDesc->totaltime = InstrQueryAlloc(INSTRUMENT_ALL);
}
}
@@ -381,14 +371,8 @@ explain_ExecutorEnd(QueryDesc *queryDesc)
*/
oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
- /*
- * Make sure stats accumulation is done. (Note: it's okay if several
- * levels of hook all do this.)
- */
- InstrEndLoop(queryDesc->totaltime);
-
/* Log plan if duration is exceeded. */
- msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total);
+ msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->instr.total);
if (msec >= auto_explain_log_min_duration)
{
ExplainState *es = NewExplainState();
diff --git a/contrib/pg_stat_statements/expected/utility.out b/contrib/pg_stat_statements/expected/utility.out
index e4d6564ea5b5a..cba487f6be582 100644
--- a/contrib/pg_stat_statements/expected/utility.out
+++ b/contrib/pg_stat_statements/expected/utility.out
@@ -289,6 +289,76 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
1 | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t
(3 rows)
+-- Buffer stats should flow through EXPLAIN ANALYZE
+CREATE TEMP TABLE flow_through_test (a int, b char(200));
+INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$
+DECLARE
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test';
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
+SELECT run_explain_buffers_test();
+ run_explain_buffers_test
+--------------------------
+
+(1 row)
+
+-- EXPLAIN entries should have non-zero buffer stats
+SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats
+FROM pg_stat_statements
+WHERE query LIKE 'SELECT run_explain_buffers_test%'
+ORDER BY query COLLATE "C";
+ query | has_buffer_stats
+-----------------------------------+------------------
+ SELECT run_explain_buffers_test() | t
+(1 row)
+
+DROP FUNCTION run_explain_buffers_test;
+DROP TABLE flow_through_test;
+-- Validate buffer/WAL counting during abort
+SET pg_stat_statements.track = 'all';
+CREATE TEMP TABLE pgss_call_tab (a int, b char(20));
+CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20));
+INSERT INTO pgss_call_tab VALUES (0, 'zzz');
+CREATE PROCEDURE pgss_call_rollback_proc() AS $$
+DECLARE
+ v int;
+BEGIN
+ EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
+CALL pgss_call_rollback_proc();
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_call_rollback_proc%'
+ORDER BY query COLLATE "C";
+ query | calls | local_hitread | wal_bytes_generated | wal_records_generated
+--------------------------------+-------+---------------+---------------------+-----------------------
+ CALL pgss_call_rollback_proc() | 1 | t | t | t
+(1 row)
+
+DROP TABLE pgss_call_tab2;
+DROP TABLE pgss_call_tab;
+DROP PROCEDURE pgss_call_rollback_proc;
+SET pg_stat_statements.track = 'top';
-- CALL
CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$
DECLARE
diff --git a/contrib/pg_stat_statements/expected/wal.out b/contrib/pg_stat_statements/expected/wal.out
index 977e382d84894..611213daef6c2 100644
--- a/contrib/pg_stat_statements/expected/wal.out
+++ b/contrib/pg_stat_statements/expected/wal.out
@@ -28,3 +28,51 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t;
t
(1 row)
+--
+-- Validate buffer/WAL counting with caught exception in PL/pgSQL
+--
+CREATE TEMP TABLE pgss_error_tab (a int, b char(20));
+INSERT INTO pgss_error_tab VALUES (0, 'zzz');
+CREATE FUNCTION pgss_error_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
+SELECT pgss_error_func();
+ pgss_error_func
+-----------------
+
+(1 row)
+
+-- Buffer/WAL usage from the wCTE INSERT should survive the exception
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_error_func%'
+ORDER BY query COLLATE "C";
+ query | calls | local_hitread | wal_bytes_generated | wal_records_generated
+--------------------------+-------+---------------+---------------------+-----------------------
+ SELECT pgss_error_func() | 1 | t | t | t
+(1 row)
+
+DROP TABLE pgss_error_tab;
+DROP FUNCTION pgss_error_func;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 5494d41dca161..78f1518c94084 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -911,22 +911,11 @@ pgss_planner(Query *parse,
&& pgss_track_planning && query_string
&& parse->queryId != INT64CONST(0))
{
- instr_time start;
- instr_time duration;
- BufferUsage bufusage_start,
- bufusage;
- WalUsage walusage_start,
- walusage;
+ Instrumentation instr = {0};
- /* We need to track buffer usage as the planner can access them. */
- bufusage_start = pgBufferUsage;
-
- /*
- * Similarly the planner could write some WAL records in some cases
- * (e.g. setting a hint bit with those being WAL-logged)
- */
- walusage_start = pgWalUsage;
- INSTR_TIME_SET_CURRENT(start);
+ /* Track time and buffer/WAL usage as the planner can access them. */
+ InstrInitOptions(&instr, INSTRUMENT_ALL);
+ InstrStart(&instr);
nesting_level++;
PG_TRY();
@@ -940,30 +929,20 @@ pgss_planner(Query *parse,
}
PG_FINALLY();
{
+ InstrStopFinalize(&instr);
nesting_level--;
}
PG_END_TRY();
- INSTR_TIME_SET_CURRENT(duration);
- INSTR_TIME_SUBTRACT(duration, start);
-
- /* calc differences of buffer counters. */
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
-
- /* calc differences of WAL counters. */
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
-
pgss_store(query_string,
parse->queryId,
parse->stmt_location,
parse->stmt_len,
PGSS_PLAN,
- INSTR_TIME_GET_MILLISEC(duration),
+ INSTR_TIME_GET_MILLISEC(instr.total),
0,
- &bufusage,
- &walusage,
+ &instr.bufusage,
+ &instr.walusage,
NULL,
NULL,
0,
@@ -1015,19 +994,9 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
*/
if (pgss_enabled(nesting_level) && queryDesc->plannedstmt->queryId != INT64CONST(0))
{
- /*
- * Set up to track total elapsed time in ExecutorRun. Make sure the
- * space is allocated in the per-query context so it will go away at
- * ExecutorEnd.
- */
+ /* Set up to track total elapsed time in ExecutorRun. */
if (queryDesc->totaltime == NULL)
- {
- MemoryContext oldcxt;
-
- oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
- queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
- MemoryContextSwitchTo(oldcxt);
- }
+ queryDesc->totaltime = InstrQueryAlloc(INSTRUMENT_ALL);
}
}
@@ -1084,21 +1053,15 @@ pgss_ExecutorEnd(QueryDesc *queryDesc)
if (queryId != INT64CONST(0) && queryDesc->totaltime &&
pgss_enabled(nesting_level))
{
- /*
- * Make sure stats accumulation is done. (Note: it's okay if several
- * levels of hook all do this.)
- */
- InstrEndLoop(queryDesc->totaltime);
-
pgss_store(queryDesc->sourceText,
queryId,
queryDesc->plannedstmt->stmt_location,
queryDesc->plannedstmt->stmt_len,
PGSS_EXEC,
- INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total),
+ INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->instr.total),
queryDesc->estate->es_total_processed,
- &queryDesc->totaltime->bufusage,
- &queryDesc->totaltime->walusage,
+ &queryDesc->totaltime->instr.bufusage,
+ &queryDesc->totaltime->instr.walusage,
queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL,
NULL,
queryDesc->estate->es_parallel_workers_to_launch,
@@ -1162,17 +1125,11 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
!IsA(parsetree, ExecuteStmt) &&
!IsA(parsetree, PrepareStmt))
{
- instr_time start;
- instr_time duration;
uint64 rows;
- BufferUsage bufusage_start,
- bufusage;
- WalUsage walusage_start,
- walusage;
+ Instrumentation instr = {0};
- bufusage_start = pgBufferUsage;
- walusage_start = pgWalUsage;
- INSTR_TIME_SET_CURRENT(start);
+ InstrInitOptions(&instr, INSTRUMENT_ALL);
+ InstrStart(&instr);
nesting_level++;
PG_TRY();
@@ -1188,6 +1145,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
}
PG_FINALLY();
{
+ InstrStopFinalize(&instr);
nesting_level--;
}
PG_END_TRY();
@@ -1202,9 +1160,6 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
* former value, which'd otherwise be a good idea.
*/
- INSTR_TIME_SET_CURRENT(duration);
- INSTR_TIME_SUBTRACT(duration, start);
-
/*
* Track the total number of rows retrieved or affected by the utility
* statements of COPY, FETCH, CREATE TABLE AS, CREATE MATERIALIZED
@@ -1216,23 +1171,15 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ?
qc->nprocessed : 0;
- /* calc differences of buffer counters. */
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
-
- /* calc differences of WAL counters. */
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
-
pgss_store(queryString,
saved_queryId,
saved_stmt_location,
saved_stmt_len,
PGSS_EXEC,
- INSTR_TIME_GET_MILLISEC(duration),
+ INSTR_TIME_GET_MILLISEC(instr.total),
rows,
- &bufusage,
- &walusage,
+ &instr.bufusage,
+ &instr.walusage,
NULL,
NULL,
0,
diff --git a/contrib/pg_stat_statements/sql/utility.sql b/contrib/pg_stat_statements/sql/utility.sql
index dd97203c21025..7540e49c73caf 100644
--- a/contrib/pg_stat_statements/sql/utility.sql
+++ b/contrib/pg_stat_statements/sql/utility.sql
@@ -152,6 +152,62 @@ EXPLAIN (costs off) SELECT a FROM generate_series(1,10) AS tab(a) WHERE a = 7;
SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
+-- Buffer stats should flow through EXPLAIN ANALYZE
+CREATE TEMP TABLE flow_through_test (a int, b char(200));
+INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+
+CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$
+DECLARE
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test';
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+
+SELECT run_explain_buffers_test();
+
+-- EXPLAIN entries should have non-zero buffer stats
+SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats
+FROM pg_stat_statements
+WHERE query LIKE 'SELECT run_explain_buffers_test%'
+ORDER BY query COLLATE "C";
+
+DROP FUNCTION run_explain_buffers_test;
+DROP TABLE flow_through_test;
+
+-- Validate buffer/WAL counting during abort
+SET pg_stat_statements.track = 'all';
+CREATE TEMP TABLE pgss_call_tab (a int, b char(20));
+CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20));
+INSERT INTO pgss_call_tab VALUES (0, 'zzz');
+
+CREATE PROCEDURE pgss_call_rollback_proc() AS $$
+DECLARE
+ v int;
+BEGIN
+ EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+CALL pgss_call_rollback_proc();
+
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_call_rollback_proc%'
+ORDER BY query COLLATE "C";
+
+DROP TABLE pgss_call_tab2;
+DROP TABLE pgss_call_tab;
+DROP PROCEDURE pgss_call_rollback_proc;
+SET pg_stat_statements.track = 'top';
+
-- CALL
CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$
DECLARE
diff --git a/contrib/pg_stat_statements/sql/wal.sql b/contrib/pg_stat_statements/sql/wal.sql
index 1dc1552a81ebc..467e321b2062e 100644
--- a/contrib/pg_stat_statements/sql/wal.sql
+++ b/contrib/pg_stat_statements/sql/wal.sql
@@ -18,3 +18,36 @@ wal_records > 0 as wal_records_generated,
wal_records >= rows as wal_records_ge_rows
FROM pg_stat_statements ORDER BY query COLLATE "C";
SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+
+--
+-- Validate buffer/WAL counting with caught exception in PL/pgSQL
+--
+CREATE TEMP TABLE pgss_error_tab (a int, b char(20));
+INSERT INTO pgss_error_tab VALUES (0, 'zzz');
+
+CREATE FUNCTION pgss_error_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+SELECT pgss_error_func();
+
+-- Buffer/WAL usage from the wCTE INSERT should survive the exception
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_error_func%'
+ORDER BY query COLLATE "C";
+
+DROP TABLE pgss_error_tab;
+DROP FUNCTION pgss_error_func;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c
index 192f8011160a9..06673017bcfaf 100644
--- a/contrib/postgres_fdw/connection.c
+++ b/contrib/postgres_fdw/connection.c
@@ -60,6 +60,7 @@ typedef struct ConnCacheEntry
/* Remaining fields are invalid when conn is NULL: */
int xact_depth; /* 0 = no xact open, 1 = main xact open, 2 =
* one level of subxact open, etc */
+ bool xact_read_only; /* xact r/o state */
bool have_prep_stmt; /* have we prepared any stmts in this xact? */
bool have_error; /* have any subxacts aborted in this xact? */
bool changing_xact_state; /* xact state change in process */
@@ -86,6 +87,12 @@ static unsigned int prep_stmt_number = 0;
/* tracks whether any work is needed in callback functions */
static bool xact_got_connection = false;
+/*
+ * tracks the topmost read-only local transaction's nesting level determined
+ * by GetTopReadOnlyTransactionNestLevel()
+ */
+static int read_only_level = 0;
+
/* custom wait event values, retrieved from shared memory */
static uint32 pgfdw_we_cleanup_result = 0;
static uint32 pgfdw_we_connect = 0;
@@ -378,6 +385,7 @@ make_new_connection(ConnCacheEntry *entry, UserMapping *user)
/* Reset all transient state fields, to be sure all are clean */
entry->xact_depth = 0;
+ entry->xact_read_only = false;
entry->have_prep_stmt = false;
entry->have_error = false;
entry->changing_xact_state = false;
@@ -871,29 +879,106 @@ do_sql_command_end(PGconn *conn, const char *sql, bool consume_input)
* those scans. A disadvantage is that we can't provide sane emulation of
* READ COMMITTED behavior --- it would be nice if we had some other way to
* control which remote queries share a snapshot.
+ *
+ * Note also that we always start the remote transaction with the same
+ * read/write and deferrable properties as the local transaction, and start
+ * the remote subtransaction with the same read/write property as the local
+ * subtransaction.
*/
static void
begin_remote_xact(ConnCacheEntry *entry)
{
int curlevel = GetCurrentTransactionNestLevel();
- /* Start main transaction if we haven't yet */
+ /*
+ * If the current local (sub)transaction is read-only, set the topmost
+ * read-only local transaction's nesting level if we haven't yet.
+ *
+ * Note: once it's set, it's retained until the topmost read-only local
+ * transaction is committed/aborted (see pgfdw_xact_callback and
+ * pgfdw_subxact_callback).
+ */
+ if (XactReadOnly)
+ {
+ if (read_only_level == 0)
+ read_only_level = GetTopReadOnlyTransactionNestLevel();
+ Assert(read_only_level > 0);
+ }
+ else
+ Assert(read_only_level == 0);
+
+ /*
+ * Start main transaction if we haven't yet; otherwise, change the current
+ * remote (sub)transaction's read/write mode if needed.
+ */
if (entry->xact_depth <= 0)
{
- const char *sql;
+ /*
+ * This is the case when we haven't yet started a main transaction.
+ */
+ StringInfoData sql;
+ bool ro = (read_only_level == 1);
elog(DEBUG3, "starting remote transaction on connection %p",
entry->conn);
+ initStringInfo(&sql);
+ appendStringInfoString(&sql, "START TRANSACTION ISOLATION LEVEL ");
if (IsolationIsSerializable())
- sql = "START TRANSACTION ISOLATION LEVEL SERIALIZABLE";
+ appendStringInfoString(&sql, "SERIALIZABLE");
else
- sql = "START TRANSACTION ISOLATION LEVEL REPEATABLE READ";
+ appendStringInfoString(&sql, "REPEATABLE READ");
+ if (ro)
+ appendStringInfoString(&sql, " READ ONLY");
+ if (XactDeferrable)
+ appendStringInfoString(&sql, " DEFERRABLE");
entry->changing_xact_state = true;
- do_sql_command(entry->conn, sql);
+ do_sql_command(entry->conn, sql.data);
entry->xact_depth = 1;
+ if (ro)
+ {
+ Assert(!entry->xact_read_only);
+ entry->xact_read_only = true;
+ }
entry->changing_xact_state = false;
}
+ else if (!entry->xact_read_only)
+ {
+ /*
+ * The remote (sub)transaction has been opened in read-write mode.
+ */
+ Assert(read_only_level == 0 ||
+ entry->xact_depth <= read_only_level);
+
+ /*
+ * If its nesting depth matches read_only_level, it means that the
+ * local read-write (sub)transaction that started it has changed to
+ * read-only after that; in which case change it to read-only as well.
+ * Otherwise, the local (sub)transaction is still read-write, so there
+ * is no need to do anything.
+ */
+ if (entry->xact_depth == read_only_level)
+ {
+ entry->changing_xact_state = true;
+ do_sql_command(entry->conn, "SET transaction_read_only = on");
+ entry->xact_read_only = true;
+ entry->changing_xact_state = false;
+ }
+ }
+ else
+ {
+ /*
+ * The remote (sub)transaction has been opened in read-only mode.
+ */
+ Assert(read_only_level > 0 &&
+ entry->xact_depth >= read_only_level);
+
+ /*
+ * The local read-only (sub)transaction that started it is guaranteed
+ * to be still read-only (see check_transaction_read_only), so there
+ * is no need to do anything.
+ */
+ }
/*
* If we're in a subtransaction, stack up savepoints to match our level.
@@ -902,12 +987,21 @@ begin_remote_xact(ConnCacheEntry *entry)
*/
while (entry->xact_depth < curlevel)
{
- char sql[64];
+ StringInfoData sql;
+ bool ro = (entry->xact_depth + 1 == read_only_level);
- snprintf(sql, sizeof(sql), "SAVEPOINT s%d", entry->xact_depth + 1);
+ initStringInfo(&sql);
+ appendStringInfo(&sql, "SAVEPOINT s%d", entry->xact_depth + 1);
+ if (ro)
+ appendStringInfoString(&sql, "; SET transaction_read_only = on");
entry->changing_xact_state = true;
- do_sql_command(entry->conn, sql);
+ do_sql_command(entry->conn, sql.data);
entry->xact_depth++;
+ if (ro)
+ {
+ Assert(!entry->xact_read_only);
+ entry->xact_read_only = true;
+ }
entry->changing_xact_state = false;
}
}
@@ -1212,6 +1306,9 @@ pgfdw_xact_callback(XactEvent event, void *arg)
/* Also reset cursor numbering for next transaction */
cursor_number = 0;
+
+ /* Likewise for read_only_level */
+ read_only_level = 0;
}
/*
@@ -1310,6 +1407,10 @@ pgfdw_subxact_callback(SubXactEvent event, SubTransactionId mySubid,
false);
}
}
+
+ /* If in read_only_level, reset it */
+ if (curlevel == read_only_level)
+ read_only_level = 0;
}
/*
@@ -1412,6 +1513,9 @@ pgfdw_reset_xact_state(ConnCacheEntry *entry, bool toplevel)
/* Reset state to show we're out of a transaction */
entry->xact_depth = 0;
+ /* Reset xact r/o state */
+ entry->xact_read_only = false;
+
/*
* If the connection isn't in a good idle state, it is marked as
* invalid or keep_connections option of its server is disabled, then
@@ -1432,6 +1536,10 @@ pgfdw_reset_xact_state(ConnCacheEntry *entry, bool toplevel)
{
/* Reset state to show we're out of a subtransaction */
entry->xact_depth--;
+
+ /* If in read_only_level, reset xact r/o state */
+ if (entry->xact_depth + 1 == read_only_level)
+ entry->xact_read_only = false;
}
}
diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index ac34a1acacb62..cd22553236f05 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -12575,6 +12575,142 @@ SELECT count(*) FROM remote_application_name
DROP FOREIGN TABLE remote_application_name;
DROP VIEW my_application_name;
-- ===================================================================
+-- test read-only and/or deferrable transactions
+-- ===================================================================
+CREATE TABLE loct (f1 int, f2 text);
+CREATE FUNCTION locf() RETURNS SETOF loct LANGUAGE SQL AS
+ 'UPDATE public.loct SET f2 = f2 || f2 RETURNING *';
+CREATE VIEW locv AS SELECT t.* FROM locf() t;
+CREATE FOREIGN TABLE remt (f1 int, f2 text)
+ SERVER loopback OPTIONS (table_name 'locv');
+CREATE FOREIGN TABLE remt2 (f1 int, f2 text)
+ SERVER loopback2 OPTIONS (table_name 'locv');
+INSERT INTO loct VALUES (1, 'foo'), (2, 'bar');
+START TRANSACTION READ ONLY;
+SAVEPOINT s;
+SELECT * FROM remt; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SELECT * FROM remt; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK;
+START TRANSACTION;
+SAVEPOINT s;
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK;
+START TRANSACTION;
+SAVEPOINT s;
+SELECT * FROM remt; -- should work
+ f1 | f2
+----+--------
+ 1 | foofoo
+ 2 | barbar
+(2 rows)
+
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SELECT * FROM remt; -- should work
+ f1 | f2
+----+--------
+ 1 | foofoo
+ 2 | barbar
+(2 rows)
+
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK;
+-- Exercise abort code paths in pgfdw_xact_callback/pgfdw_subxact_callback
+-- in situations where multiple connections are involved
+START TRANSACTION;
+SAVEPOINT s;
+SELECT * FROM remt; -- should work
+ f1 | f2
+----+--------
+ 1 | foofoo
+ 2 | barbar
+(2 rows)
+
+SET transaction_read_only = on;
+SELECT * FROM remt2; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SELECT * FROM remt; -- should work
+ f1 | f2
+----+--------
+ 1 | foofoo
+ 2 | barbar
+(2 rows)
+
+SET transaction_read_only = on;
+SELECT * FROM remt2; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK;
+DROP FOREIGN TABLE remt;
+CREATE FOREIGN TABLE remt (f1 int, f2 text)
+ SERVER loopback OPTIONS (table_name 'loct');
+START TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY;
+SELECT * FROM remt;
+ f1 | f2
+----+-----
+ 1 | foo
+ 2 | bar
+(2 rows)
+
+COMMIT;
+START TRANSACTION ISOLATION LEVEL SERIALIZABLE DEFERRABLE;
+SELECT * FROM remt;
+ f1 | f2
+----+-----
+ 1 | foo
+ 2 | bar
+(2 rows)
+
+COMMIT;
+START TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+SELECT * FROM remt;
+ f1 | f2
+----+-----
+ 1 | foo
+ 2 | bar
+(2 rows)
+
+COMMIT;
+-- Clean up
+DROP FOREIGN TABLE remt;
+DROP FOREIGN TABLE remt2;
+DROP VIEW locv;
+DROP FUNCTION locf();
+DROP TABLE loct;
+-- ===================================================================
-- test parallel commit and parallel abort
-- ===================================================================
ALTER SERVER loopback OPTIONS (ADD parallel_commit 'true');
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 41e47cc795ba8..cc8ec24c30eb0 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -2779,7 +2779,7 @@ postgresIterateDirectModify(ForeignScanState *node)
if (!resultRelInfo->ri_projectReturning)
{
TupleTableSlot *slot = node->ss.ss_ScanTupleSlot;
- Instrumentation *instr = node->ss.ps.instrument;
+ NodeInstrumentation *instr = node->ss.ps.instrument;
Assert(!dmstate->has_returning);
diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql
index 0e218b29a29ed..59963e298b846 100644
--- a/contrib/postgres_fdw/sql/postgres_fdw.sql
+++ b/contrib/postgres_fdw/sql/postgres_fdw.sql
@@ -4328,6 +4328,86 @@ SELECT count(*) FROM remote_application_name
DROP FOREIGN TABLE remote_application_name;
DROP VIEW my_application_name;
+-- ===================================================================
+-- test read-only and/or deferrable transactions
+-- ===================================================================
+CREATE TABLE loct (f1 int, f2 text);
+CREATE FUNCTION locf() RETURNS SETOF loct LANGUAGE SQL AS
+ 'UPDATE public.loct SET f2 = f2 || f2 RETURNING *';
+CREATE VIEW locv AS SELECT t.* FROM locf() t;
+CREATE FOREIGN TABLE remt (f1 int, f2 text)
+ SERVER loopback OPTIONS (table_name 'locv');
+CREATE FOREIGN TABLE remt2 (f1 int, f2 text)
+ SERVER loopback2 OPTIONS (table_name 'locv');
+INSERT INTO loct VALUES (1, 'foo'), (2, 'bar');
+
+START TRANSACTION READ ONLY;
+SAVEPOINT s;
+SELECT * FROM remt; -- should fail
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SELECT * FROM remt; -- should fail
+ROLLBACK;
+
+START TRANSACTION;
+SAVEPOINT s;
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ROLLBACK;
+
+START TRANSACTION;
+SAVEPOINT s;
+SELECT * FROM remt; -- should work
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SELECT * FROM remt; -- should work
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ROLLBACK;
+
+-- Exercise abort code paths in pgfdw_xact_callback/pgfdw_subxact_callback
+-- in situations where multiple connections are involved
+START TRANSACTION;
+SAVEPOINT s;
+SELECT * FROM remt; -- should work
+SET transaction_read_only = on;
+SELECT * FROM remt2; -- should fail
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SELECT * FROM remt; -- should work
+SET transaction_read_only = on;
+SELECT * FROM remt2; -- should fail
+ROLLBACK;
+
+DROP FOREIGN TABLE remt;
+CREATE FOREIGN TABLE remt (f1 int, f2 text)
+ SERVER loopback OPTIONS (table_name 'loct');
+
+START TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY;
+SELECT * FROM remt;
+COMMIT;
+
+START TRANSACTION ISOLATION LEVEL SERIALIZABLE DEFERRABLE;
+SELECT * FROM remt;
+COMMIT;
+
+START TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+SELECT * FROM remt;
+COMMIT;
+
+-- Clean up
+DROP FOREIGN TABLE remt;
+DROP FOREIGN TABLE remt2;
+DROP VIEW locv;
+DROP FUNCTION locf();
+DROP TABLE loct;
+
-- ===================================================================
-- test parallel commit and parallel abort
-- ===================================================================
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml
index 604e8578a8dcd..d28f4f22535b9 100644
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -734,6 +734,7 @@ WHERE t1.unique1 < 10 AND t1.unique2 = t2.unique2;
-> Index Scan using tenk2_unique2 on tenk2 t2 (cost=0.29..7.90 rows=1 width=244) (actual time=0.003..0.003 rows=1.00 loops=10)
Index Cond: (unique2 = t1.unique2)
Index Searches: 10
+ Table Buffers: shared hit=10
Buffers: shared hit=24 read=6
Planning:
Buffers: shared hit=15 dirtied=9
@@ -1005,7 +1006,8 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)';
Index Cond: (f1 @> '((0.5,2))'::polygon)
Rows Removed by Index Recheck: 1
Index Searches: 1
- Buffers: shared hit=1
+ Table Buffers: shared hit=1
+ Buffers: shared hit=2
Planning Time: 0.039 ms
Execution Time: 0.098 ms
@@ -1014,7 +1016,9 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)';
then rejected by a recheck of the index condition. This happens because a
GiST index is lossy
for polygon containment tests: it actually
returns the rows with polygons that overlap the target, and then we have
- to do the exact containment test on those rows.
+ to do the exact containment test on those rows. The Table Buffers
+ counts indicate how many buffer accesses were performed on the table
+ rather than on the index. This number is included in the Buffers counts.
@@ -1203,13 +1207,14 @@ EXPLAIN ANALYZE SELECT * FROM tenk1 WHERE unique1 < 100 AND unique2 > 9000
QUERY PLAN
-------------------------------------------------------------------&zwsp;------------------------------------------------------------
Limit (cost=0.29..14.33 rows=2 width=244) (actual time=0.051..0.071 rows=2.00 loops=1)
- Buffers: shared hit=16
+ Buffers: shared hit=14
-> Index Scan using tenk1_unique2 on tenk1 (cost=0.29..70.50 rows=10 width=244) (actual time=0.051..0.070 rows=2.00 loops=1)
Index Cond: (unique2 > 9000)
Filter: (unique1 < 100)
Rows Removed by Filter: 287
Index Searches: 1
- Buffers: shared hit=16
+ Table Buffers: shared hit=11
+ Buffers: shared hit=14
Planning Time: 0.077 ms
Execution Time: 0.086 ms
diff --git a/doc/src/sgml/postgres-fdw.sgml b/doc/src/sgml/postgres-fdw.sgml
index de69ddcdebcc7..9185c76f93290 100644
--- a/doc/src/sgml/postgres-fdw.sgml
+++ b/doc/src/sgml/postgres-fdw.sgml
@@ -1103,6 +1103,23 @@ CREATE SUBSCRIPTION my_subscription SERVER subscription_server PUBLICATION testp
PostgreSQL release might modify these rules.
+
+ The remote transaction is opened in the same read/write mode as the local
+ transaction: if the local transaction is READ ONLY,
+ the remote transaction is opened in READ ONLY mode,
+ otherwise it is opened in READ WRITE mode.
+ (This rule is also applied to remote and local subtransactions.)
+ Note that this does not prevent login triggers executed on the remote
+ server from writing.
+
+
+
+ The remote transaction is also opened in the same deferrable mode as the
+ local transaction: if the local transaction is DEFERRABLE,
+ the remote transaction is opened in DEFERRABLE mode,
+ otherwise it is opened in NOT DEFERRABLE mode.
+
+
Note that it is currently not supported by
postgres_fdw to prepare the remote transaction for
diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
index 5b8b521802ee5..71070736acb98 100644
--- a/doc/src/sgml/ref/explain.sgml
+++ b/doc/src/sgml/ref/explain.sgml
@@ -509,6 +509,7 @@ EXPLAIN ANALYZE EXECUTE query(100, 200);
-> Index Scan using test_pkey on test (cost=0.29..10.27 rows=99 width=8) (actual time=0.009..0.025 rows=99.00 loops=1)
Index Cond: ((id > 100) AND (id < 200))
Index Searches: 1
+ Table Buffers: shared hit=1
Buffers: shared hit=4
Planning Time: 0.244 ms
Execution Time: 0.073 ms
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index bdb30752e098c..9e545b4ef0e66 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -51,8 +51,7 @@
#define PARALLEL_KEY_BRIN_SHARED UINT64CONST(0xB000000000000001)
#define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
-#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
-#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
+#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xB000000000000004)
/*
* Status for index builds performed in parallel. This is allocated in a
@@ -148,8 +147,7 @@ typedef struct BrinLeader
BrinShared *brinshared;
Sharedsort *sharedsort;
Snapshot snapshot;
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ Instrumentation *instr;
} BrinLeader;
/*
@@ -2387,8 +2385,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
BrinShared *brinshared;
Sharedsort *sharedsort;
BrinLeader *brinleader = palloc0_object(BrinLeader);
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ Instrumentation *instr;
bool leaderparticipates = true;
int querylen;
@@ -2430,18 +2427,14 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
shm_toc_estimate_keys(&pcxt->estimator, 2);
/*
- * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
- * and PARALLEL_KEY_BUFFER_USAGE.
+ * Estimate space for Instrumentation -- PARALLEL_KEY_INSTRUMENTATION.
*
* If there are no extensions loaded that care, we could skip this. We
- * have no way of knowing whether anyone's looking at pgWalUsage or
- * pgBufferUsage, so do it unconditionally.
+ * have no way of knowing whether anyone's looking at instrumentation, so
+ * do it unconditionally.
*/
shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_estimate_keys(&pcxt->estimator, 1);
- shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
shm_toc_estimate_keys(&pcxt->estimator, 1);
/* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
@@ -2514,15 +2507,12 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
}
/*
- * Allocate space for each worker's WalUsage and BufferUsage; no need to
+ * Allocate space for each worker's Instrumentation; no need to
* initialize.
*/
- walusage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
- bufferusage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
+ instr = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr);
/* Launch workers, saving status for leader/caller */
LaunchParallelWorkers(pcxt);
@@ -2533,8 +2523,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
brinleader->brinshared = brinshared;
brinleader->sharedsort = sharedsort;
brinleader->snapshot = snapshot;
- brinleader->walusage = walusage;
- brinleader->bufferusage = bufferusage;
+ brinleader->instr = instr;
/* If no workers were successfully launched, back out (do serial build) */
if (pcxt->nworkers_launched == 0)
@@ -2573,7 +2562,7 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
* or we might get incomplete data.)
*/
for (i = 0; i < brinleader->pcxt->nworkers_launched; i++)
- InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]);
+ InstrAccumParallelQuery(&brinleader->instr[i]);
/* Free last reference to MVCC snapshot, if one was used */
if (IsMVCCSnapshot(brinleader->snapshot))
@@ -2887,8 +2876,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Relation indexRel;
LOCKMODE heapLockmode;
LOCKMODE indexLockmode;
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ QueryInstrumentation *instr;
+ Instrumentation *worker_instr;
int sortmem;
/*
@@ -2936,7 +2925,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
tuplesort_attach_shared(sharedsort, seg);
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/*
* Might as well use reliable figure when doling out maintenance_work_mem
@@ -2949,10 +2938,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
heapRel, indexRel, sortmem, false);
/* Report WAL/buffer usage during parallel execution */
- bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
- walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
- &walusage[ParallelWorkerNumber]);
+ worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false);
+ InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]);
index_close(indexRel, indexLockmode);
table_close(heapRel, heapLockmode);
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index 9d83a4957757b..f3de62ce7f339 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -45,8 +45,7 @@
#define PARALLEL_KEY_GIN_SHARED UINT64CONST(0xB000000000000001)
#define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
-#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
-#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
+#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xB000000000000004)
/*
* Status for index builds performed in parallel. This is allocated in a
@@ -138,8 +137,7 @@ typedef struct GinLeader
GinBuildShared *ginshared;
Sharedsort *sharedsort;
Snapshot snapshot;
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ Instrumentation *instr;
} GinLeader;
typedef struct
@@ -945,8 +943,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
GinBuildShared *ginshared;
Sharedsort *sharedsort;
GinLeader *ginleader = palloc0_object(GinLeader);
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ Instrumentation *instr;
bool leaderparticipates = true;
int querylen;
@@ -987,18 +984,14 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
shm_toc_estimate_keys(&pcxt->estimator, 2);
/*
- * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
- * and PARALLEL_KEY_BUFFER_USAGE.
+ * Estimate space for Instrumentation -- PARALLEL_KEY_INSTRUMENTATION.
*
* If there are no extensions loaded that care, we could skip this. We
- * have no way of knowing whether anyone's looking at pgWalUsage or
- * pgBufferUsage, so do it unconditionally.
+ * have no way of knowing whether anyone's looking at instrumentation, so
+ * do it unconditionally.
*/
shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_estimate_keys(&pcxt->estimator, 1);
- shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
shm_toc_estimate_keys(&pcxt->estimator, 1);
/* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
@@ -1066,15 +1059,12 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
}
/*
- * Allocate space for each worker's WalUsage and BufferUsage; no need to
+ * Allocate space for each worker's Instrumentation; no need to
* initialize.
*/
- walusage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
- bufferusage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
+ instr = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr);
/* Launch workers, saving status for leader/caller */
LaunchParallelWorkers(pcxt);
@@ -1085,8 +1075,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
ginleader->ginshared = ginshared;
ginleader->sharedsort = sharedsort;
ginleader->snapshot = snapshot;
- ginleader->walusage = walusage;
- ginleader->bufferusage = bufferusage;
+ ginleader->instr = instr;
/* If no workers were successfully launched, back out (do serial build) */
if (pcxt->nworkers_launched == 0)
@@ -1125,7 +1114,7 @@ _gin_end_parallel(GinLeader *ginleader, GinBuildState *state)
* or we might get incomplete data.)
*/
for (i = 0; i < ginleader->pcxt->nworkers_launched; i++)
- InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]);
+ InstrAccumParallelQuery(&ginleader->instr[i]);
/* Free last reference to MVCC snapshot, if one was used */
if (IsMVCCSnapshot(ginleader->snapshot))
@@ -2118,8 +2107,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Relation indexRel;
LOCKMODE heapLockmode;
LOCKMODE indexLockmode;
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ QueryInstrumentation *instr;
+ Instrumentation *worker_instr;
int sortmem;
/*
@@ -2186,7 +2175,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
tuplesort_attach_shared(sharedsort, seg);
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/*
* Might as well use reliable figure when doling out maintenance_work_mem
@@ -2199,10 +2188,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
heapRel, indexRel, sortmem, false);
/* Report WAL/buffer usage during parallel execution */
- bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
- walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
- &walusage[ParallelWorkerNumber]);
+ worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false);
+ InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]);
index_close(indexRel, indexLockmode);
table_close(heapRel, heapLockmode);
diff --git a/src/backend/access/heap/heapam_indexscan.c b/src/backend/access/heap/heapam_indexscan.c
index bbd8a165ddc23..33d14f1de7d52 100644
--- a/src/backend/access/heap/heapam_indexscan.c
+++ b/src/backend/access/heap/heapam_indexscan.c
@@ -41,20 +41,13 @@ heapam_index_fetch_begin(Relation rel, uint32 flags)
void
heapam_index_fetch_reset(IndexFetchTableData *scan)
{
- IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
-
- if (BufferIsValid(hscan->xs_cbuf))
- {
- ReleaseBuffer(hscan->xs_cbuf);
- hscan->xs_cbuf = InvalidBuffer;
- hscan->xs_blk = InvalidBlockNumber;
- }
-
- if (BufferIsValid(hscan->xs_vmbuffer))
- {
- ReleaseBuffer(hscan->xs_vmbuffer);
- hscan->xs_vmbuffer = InvalidBuffer;
- }
+ /*
+ * Resets are a no-op.
+ *
+ * Deliberately avoid dropping pins now held in xs_cbuf and xs_vmbuffer.
+ * This saves cycles during certain tight nested loop joins (it can avoid
+ * repeated pinning and unpinning of the same buffer across rescans).
+ */
}
void
@@ -62,7 +55,13 @@ heapam_index_fetch_end(IndexFetchTableData *scan)
{
IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
- heapam_index_fetch_reset(scan);
+ /* drop pin if there's a pinned heap page */
+ if (BufferIsValid(hscan->xs_cbuf))
+ ReleaseBuffer(hscan->xs_cbuf);
+
+ /* drop pin if there's a pinned visibility map page */
+ if (BufferIsValid(hscan->xs_vmbuffer))
+ ReleaseBuffer(hscan->xs_vmbuffer);
pfree(hscan);
}
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 88c71cd85b60b..291d9d67bc295 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -637,8 +637,7 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params,
TimestampTz starttime = 0;
PgStat_Counter startreadtime = 0,
startwritetime = 0;
- WalUsage startwalusage = pgWalUsage;
- BufferUsage startbufferusage = pgBufferUsage;
+ QueryInstrumentation *instr = NULL;
ErrorContextCallback errcallback;
char **indnames = NULL;
Size dead_items_max_bytes = 0;
@@ -654,6 +653,8 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params,
startreadtime = pgStatBlockReadTime;
startwritetime = pgStatBlockWriteTime;
}
+ instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL);
+ InstrQueryStart(instr);
}
/* Used for instrumentation and stats report */
@@ -984,14 +985,14 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params,
{
TimestampTz endtime = GetCurrentTimestamp();
+ InstrQueryStopFinalize(instr);
+
if (verbose || params->log_vacuum_min_duration == 0 ||
TimestampDifferenceExceeds(starttime, endtime,
params->log_vacuum_min_duration))
{
long secs_dur;
int usecs_dur;
- WalUsage walusage;
- BufferUsage bufferusage;
StringInfoData buf;
char *msgfmt;
int32 diff;
@@ -1000,12 +1001,10 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params,
int64 total_blks_hit;
int64 total_blks_read;
int64 total_blks_dirtied;
+ BufferUsage bufferusage = instr->instr.bufusage;
+ WalUsage walusage = instr->instr.walusage;
TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur);
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage);
- memset(&bufferusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage);
total_blks_hit = bufferusage.shared_blks_hit +
bufferusage.local_blks_hit;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 44496ae0963e1..23288a4f99490 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -375,7 +375,7 @@ index_rescan(IndexScanDesc scan,
Assert(nkeys == scan->numberOfKeys);
Assert(norderbys == scan->numberOfOrderBys);
- /* Release resources (like buffer pins) from table accesses */
+ /* reset table AM state for rescan */
if (scan->xs_heapfetch)
table_index_fetch_reset(scan->xs_heapfetch);
@@ -452,7 +452,7 @@ index_restrpos(IndexScanDesc scan)
SCAN_CHECKS;
CHECK_SCAN_PROCEDURE(amrestrpos);
- /* release resources (like buffer pins) from table accesses */
+ /* reset table AM state for restoring the marked position */
if (scan->xs_heapfetch)
table_index_fetch_reset(scan->xs_heapfetch);
@@ -578,6 +578,7 @@ index_parallelrescan(IndexScanDesc scan)
{
SCAN_CHECKS;
+ /* reset table AM state for rescan */
if (scan->xs_heapfetch)
table_index_fetch_reset(scan->xs_heapfetch);
@@ -659,7 +660,7 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
/* If we're out of index entries, we're done */
if (!found)
{
- /* release resources (like buffer pins) from table accesses */
+ /* reset table AM state */
if (scan->xs_heapfetch)
table_index_fetch_reset(scan->xs_heapfetch);
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 756dfa3dcf47e..cb238f862a7c8 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -66,8 +66,7 @@
#define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
#define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
-#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005)
-#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006)
+#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xA000000000000005)
/*
* DISABLE_LEADER_PARTICIPATION disables the leader's participation in
@@ -195,8 +194,7 @@ typedef struct BTLeader
Sharedsort *sharedsort;
Sharedsort *sharedsort2;
Snapshot snapshot;
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ Instrumentation *instr;
} BTLeader;
/*
@@ -1408,8 +1406,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
Sharedsort *sharedsort2;
BTSpool *btspool = buildstate->spool;
BTLeader *btleader = palloc0_object(BTLeader);
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ Instrumentation *instr;
bool leaderparticipates = true;
int querylen;
@@ -1462,18 +1459,14 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
}
/*
- * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
- * and PARALLEL_KEY_BUFFER_USAGE.
+ * Estimate space for Instrumentation -- PARALLEL_KEY_INSTRUMENTATION.
*
* If there are no extensions loaded that care, we could skip this. We
- * have no way of knowing whether anyone's looking at pgWalUsage or
- * pgBufferUsage, so do it unconditionally.
+ * have no way of knowing whether anyone's looking at instrumentation, so
+ * do it unconditionally.
*/
shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_estimate_keys(&pcxt->estimator, 1);
- shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
shm_toc_estimate_keys(&pcxt->estimator, 1);
/* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
@@ -1560,15 +1553,12 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
}
/*
- * Allocate space for each worker's WalUsage and BufferUsage; no need to
+ * Allocate space for each worker's Instrumentation; no need to
* initialize.
*/
- walusage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
- bufferusage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
+ instr = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr);
/* Launch workers, saving status for leader/caller */
LaunchParallelWorkers(pcxt);
@@ -1580,8 +1570,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
btleader->sharedsort = sharedsort;
btleader->sharedsort2 = sharedsort2;
btleader->snapshot = snapshot;
- btleader->walusage = walusage;
- btleader->bufferusage = bufferusage;
+ btleader->instr = instr;
/* If no workers were successfully launched, back out (do serial build) */
if (pcxt->nworkers_launched == 0)
@@ -1620,7 +1609,7 @@ _bt_end_parallel(BTLeader *btleader)
* or we might get incomplete data.)
*/
for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
- InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
+ InstrAccumParallelQuery(&btleader->instr[i]);
/* Free last reference to MVCC snapshot, if one was used */
if (IsMVCCSnapshot(btleader->snapshot))
@@ -1753,8 +1742,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Relation indexRel;
LOCKMODE heapLockmode;
LOCKMODE indexLockmode;
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ QueryInstrumentation *instr;
+ Instrumentation *worker_instr;
int sortmem;
#ifdef BTREE_BUILD_STATS
@@ -1828,7 +1817,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
}
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/* Perform sorting of spool, and possibly a spool2 */
sortmem = maintenance_work_mem / btshared->scantuplesortstates;
@@ -1836,10 +1825,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
sharedsort2, sortmem, false);
/* Report WAL/buffer usage during parallel execution */
- bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
- walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
- &walusage[ParallelWorkerNumber]);
+ worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false);
+ InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]);
#ifdef BTREE_BUILD_STATS
if (log_btree_build_stats)
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index aafc53e016467..48bc90c967353 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -1046,6 +1046,34 @@ TransactionStartedDuringRecovery(void)
return CurrentTransactionState->startedInRecovery;
}
+/*
+ * GetTopReadOnlyTransactionNestLevel
+ *
+ * Note: this returns zero when not inside any transaction, or when neither
+ * the top-level transaction nor any subtransaction is read-only; one when the
+ * top-level transaction is read-only; two when the first-level subtransaction
+ * is read-only, etc.
+ *
+ * Note: subtransactions of the topmost read-only transaction are also
+ * read-only, because they inherit read-only mode from the transaction, and
+ * thus can't change to read-write mode (see check_transaction_read_only).
+ */
+int
+GetTopReadOnlyTransactionNestLevel(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (!XactReadOnly)
+ return 0;
+ while (s->nestingLevel > 1)
+ {
+ if (!s->prevXactReadOnly)
+ return s->nestingLevel;
+ s = s->parent;
+ }
+ return s->nestingLevel;
+}
+
/*
* EnterParallelMode
*/
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 9e8999bbb616f..71c9a26566236 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -1103,10 +1103,10 @@ XLogInsertRecord(XLogRecData *rdata,
/* Report WAL traffic to the instrumentation. */
if (inserted)
{
- pgWalUsage.wal_bytes += rechdr->xl_tot_len;
- pgWalUsage.wal_records++;
- pgWalUsage.wal_fpi += num_fpi;
- pgWalUsage.wal_fpi_bytes += fpi_bytes;
+ INSTR_WALUSAGE_ADD(wal_bytes, rechdr->xl_tot_len);
+ INSTR_WALUSAGE_INCR(wal_records);
+ INSTR_WALUSAGE_ADD(wal_fpi, num_fpi);
+ INSTR_WALUSAGE_ADD(wal_fpi_bytes, fpi_bytes);
/* Required for the flush of pending stats WAL data */
pgstat_report_fixed = true;
@@ -2085,7 +2085,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
WriteRqst.Flush = InvalidXLogRecPtr;
XLogWrite(WriteRqst, tli, false);
LWLockRelease(WALWriteLock);
- pgWalUsage.wal_buffers_full++;
+ INSTR_WALUSAGE_INCR(wal_buffers_full);
TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
/*
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index c52c0a6023ddf..ebd41176b9446 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -1184,7 +1184,7 @@ build_indices(void)
heap = table_open(ILHead->il_heap, NoLock);
ind = index_open(ILHead->il_ind, NoLock);
- index_build(heap, ind, ILHead->il_info, false, false);
+ index_build(heap, ind, ILHead->il_info, false, false, false);
index_close(ind, NoLock);
table_close(heap, NoLock);
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 5748aa9a1a9af..ae6b7cda3ddfe 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -3570,7 +3570,8 @@ RelationTruncateIndexes(Relation heapRelation)
/* Initialize the index and rebuild */
/* Note: we do not need to re-establish pkey setting */
- index_build(heapRelation, currentIndex, indexInfo, true, false);
+ index_build(heapRelation, currentIndex, indexInfo, true, false,
+ true);
/* We're done with this index */
index_close(currentIndex, NoLock);
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 1ccfa687f052b..9407c357f2716 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -715,6 +715,9 @@ UpdateIndexRelation(Oid indexoid,
* already exists.
* INDEX_CREATE_PARTITIONED:
* create a partitioned index (table must be partitioned)
+ * INDEX_CREATE_SUPPRESS_PROGRESS:
+ * don't report progress during the index build.
+ *
* constr_flags: flags passed to index_constraint_create
* (only if INDEX_CREATE_ADD_CONSTRAINT is set)
* allow_system_table_mods: allow table to be a system catalog
@@ -760,6 +763,7 @@ index_create(Relation heapRelation,
bool invalid = (flags & INDEX_CREATE_INVALID) != 0;
bool concurrent = (flags & INDEX_CREATE_CONCURRENT) != 0;
bool partitioned = (flags & INDEX_CREATE_PARTITIONED) != 0;
+ bool progress = (flags & INDEX_CREATE_SUPPRESS_PROGRESS) == 0;
char relkind;
TransactionId relfrozenxid;
MultiXactId relminmxid;
@@ -1276,7 +1280,8 @@ index_create(Relation heapRelation,
}
else
{
- index_build(heapRelation, indexRelation, indexInfo, false, true);
+ index_build(heapRelation, indexRelation, indexInfo, false, true,
+ progress);
}
/*
@@ -1289,22 +1294,23 @@ index_create(Relation heapRelation,
}
/*
- * index_concurrently_create_copy
+ * index_create_copy
*
- * Create concurrently an index based on the definition of the one provided by
- * caller. The index is inserted into catalogs and needs to be built later
- * on. This is called during concurrent reindex processing.
+ * Create an index based on the definition of the one provided by caller. The
+ * index is inserted into catalogs. 'flags' are passed directly to
+ * index_create.
*
* "tablespaceOid" is the tablespace to use for this index.
*/
Oid
-index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId,
- Oid tablespaceOid, const char *newName)
+index_create_copy(Relation heapRelation, uint16 flags,
+ Oid oldIndexId, Oid tablespaceOid, const char *newName)
{
Relation indexRelation;
IndexInfo *oldInfo,
*newInfo;
Oid newIndexId = InvalidOid;
+ bool concurrently = (flags & INDEX_CREATE_CONCURRENT) != 0;
HeapTuple indexTuple,
classTuple;
Datum indclassDatum,
@@ -1328,7 +1334,7 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId,
* Concurrent build of an index with exclusion constraints is not
* supported.
*/
- if (oldInfo->ii_ExclusionOps != NULL)
+ if (oldInfo->ii_ExclusionOps != NULL && concurrently)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("concurrent index creation for exclusion constraints is not supported")));
@@ -1384,9 +1390,7 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId,
}
/*
- * Build the index information for the new index. Note that rebuild of
- * indexes with exclusion constraints is not supported, hence there is no
- * need to fill all the ii_Exclusion* fields.
+ * Build the index information for the new index.
*/
newInfo = makeIndexInfo(oldInfo->ii_NumIndexAttrs,
oldInfo->ii_NumIndexKeyAttrs,
@@ -1395,11 +1399,24 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId,
indexPreds,
oldInfo->ii_Unique,
oldInfo->ii_NullsNotDistinct,
- false, /* not ready for inserts */
- true,
+ !concurrently, /* isready */
+ concurrently, /* concurrent */
indexRelation->rd_indam->amsummarizing,
oldInfo->ii_WithoutOverlaps);
+ /* fetch exclusion constraint info if any */
+ if (indexRelation->rd_index->indisexclusion)
+ {
+ /*
+ * XXX Beware: we're making newInfo point to oldInfo-owned memory. It
+ * would be more orthodox to palloc+memcpy, but we don't need that
+ * here at present.
+ */
+ newInfo->ii_ExclusionOps = oldInfo->ii_ExclusionOps;
+ newInfo->ii_ExclusionProcs = oldInfo->ii_ExclusionProcs;
+ newInfo->ii_ExclusionStrats = oldInfo->ii_ExclusionStrats;
+ }
+
/*
* Extract the list of column names and the column numbers for the new
* index information. All this information will be used for the index
@@ -1459,7 +1476,7 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId,
indcoloptions->values,
stattargets,
reloptionsDatum,
- INDEX_CREATE_SKIP_BUILD | INDEX_CREATE_CONCURRENT,
+ flags,
0,
true, /* allow table to be a system catalog? */
false, /* is_internal? */
@@ -1523,7 +1540,7 @@ index_concurrently_build(Oid heapRelationId,
indexInfo->ii_BrokenHotChain = false;
/* Now build the index */
- index_build(heapRel, indexRelation, indexInfo, false, true);
+ index_build(heapRel, indexRelation, indexInfo, false, true, true);
/* Roll back any GUC changes executed by index functions */
AtEOXact_GUC(false, save_nestlevel);
@@ -2994,6 +3011,7 @@ index_update_stats(Relation rel,
*
* isreindex indicates we are recreating a previously-existing index.
* parallel indicates if parallelism may be useful.
+ * progress indicates if the backend should update its progress info.
*
* Note: before Postgres 8.2, the passed-in heap and index Relations
* were automatically closed by this routine. This is no longer the case.
@@ -3004,7 +3022,8 @@ index_build(Relation heapRelation,
Relation indexRelation,
IndexInfo *indexInfo,
bool isreindex,
- bool parallel)
+ bool parallel,
+ bool progress)
{
IndexBuildResult *stats;
Oid save_userid;
@@ -3055,6 +3074,7 @@ index_build(Relation heapRelation,
RestrictSearchPath();
/* Set up initial progress report status */
+ if (progress)
{
const int progress_index[] = {
PROGRESS_CREATEIDX_PHASE,
@@ -3812,7 +3832,7 @@ reindex_index(const ReindexStmt *stmt, Oid indexId,
/* Initialize the index and rebuild */
/* Note: we do not need to re-establish pkey setting */
- index_build(heapRelation, iRel, indexInfo, true, true);
+ index_build(heapRelation, iRel, indexInfo, true, true, progress);
/* Re-allow use of target index */
ResetReindexProcessing();
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 49a5cdf579c16..10f8a2dc81cd5 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -309,9 +309,7 @@ do_analyze_rel(Relation onerel, const VacuumParams *params,
Oid save_userid;
int save_sec_context;
int save_nestlevel;
- WalUsage startwalusage = pgWalUsage;
- BufferUsage startbufferusage = pgBufferUsage;
- BufferUsage bufferusage;
+ QueryInstrumentation *instr = NULL;
PgStat_Counter startreadtime = 0;
PgStat_Counter startwritetime = 0;
@@ -362,6 +360,9 @@ do_analyze_rel(Relation onerel, const VacuumParams *params,
}
pg_rusage_init(&ru0);
+
+ instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL);
+ InstrQueryStart(instr);
}
/* Used for instrumentation and stats report */
@@ -742,12 +743,13 @@ do_analyze_rel(Relation onerel, const VacuumParams *params,
{
TimestampTz endtime = GetCurrentTimestamp();
+ InstrQueryStopFinalize(instr);
+
if (verbose || params->log_analyze_min_duration == 0 ||
TimestampDifferenceExceeds(starttime, endtime,
params->log_analyze_min_duration))
{
long delay_in_ms;
- WalUsage walusage;
double read_rate = 0;
double write_rate = 0;
char *msgfmt;
@@ -755,18 +757,15 @@ do_analyze_rel(Relation onerel, const VacuumParams *params,
int64 total_blks_hit;
int64 total_blks_read;
int64 total_blks_dirtied;
-
- memset(&bufferusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage);
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage);
-
- total_blks_hit = bufferusage.shared_blks_hit +
- bufferusage.local_blks_hit;
- total_blks_read = bufferusage.shared_blks_read +
- bufferusage.local_blks_read;
- total_blks_dirtied = bufferusage.shared_blks_dirtied +
- bufferusage.local_blks_dirtied;
+ BufferUsage bufusage = instr->instr.bufusage;
+ WalUsage walusage = instr->instr.walusage;
+
+ total_blks_hit = bufusage.shared_blks_hit +
+ bufusage.local_blks_hit;
+ total_blks_read = bufusage.shared_blks_read +
+ bufusage.local_blks_read;
+ total_blks_dirtied = bufusage.shared_blks_dirtied +
+ bufusage.local_blks_dirtied;
/*
* We do not expect an analyze to take > 25 days and it simplifies
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index e4b70166b0e50..42fc00cbd34db 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -144,7 +144,7 @@ static void show_instrumentation_count(const char *qlabel, int which,
static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es);
static const char *explain_get_index_name(Oid indexId);
static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage);
-static void show_buffer_usage(ExplainState *es, const BufferUsage *usage);
+static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title);
static void show_wal_usage(ExplainState *es, const WalUsage *usage);
static void show_memory_counters(ExplainState *es,
const MemoryContextCounters *mem_counters);
@@ -324,13 +324,16 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
QueryEnvironment *queryEnv)
{
PlannedStmt *plan;
- instr_time planstart,
- planduration;
- BufferUsage bufusage_start,
- bufusage;
+ QueryInstrumentation *plan_instr = NULL;
MemoryContextCounters mem_counters;
MemoryContext planner_ctx = NULL;
MemoryContext saved_ctx = NULL;
+ int instrument_options = INSTRUMENT_TIMER;
+
+ if (es->buffers)
+ instrument_options |= INSTRUMENT_BUFFERS;
+
+ plan_instr = InstrQueryAlloc(instrument_options);
if (es->memory)
{
@@ -348,15 +351,12 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
saved_ctx = MemoryContextSwitchTo(planner_ctx);
}
- if (es->buffers)
- bufusage_start = pgBufferUsage;
- INSTR_TIME_SET_CURRENT(planstart);
+ InstrQueryStart(plan_instr);
/* plan the query */
plan = pg_plan_query(query, queryString, cursorOptions, params, es);
- INSTR_TIME_SET_CURRENT(planduration);
- INSTR_TIME_SUBTRACT(planduration, planstart);
+ InstrQueryStopFinalize(plan_instr);
if (es->memory)
{
@@ -364,16 +364,9 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
MemoryContextMemConsumed(planner_ctx, &mem_counters);
}
- /* calc differences of buffer counters. */
- if (es->buffers)
- {
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
- }
-
/* run it (if needed) and produce output */
ExplainOnePlan(plan, into, es, queryString, params, queryEnv,
- &planduration, (es->buffers ? &bufusage : NULL),
+ &plan_instr->instr.total, (es->buffers ? &plan_instr->instr.bufusage : NULL),
es->memory ? &mem_counters : NULL);
}
@@ -590,7 +583,12 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
/* grab serialization metrics before we destroy the DestReceiver */
if (es->serialize != EXPLAIN_SERIALIZE_NONE)
- serializeMetrics = GetSerializationMetrics(dest);
+ {
+ SerializeMetrics *metrics = GetSerializationMetrics(dest);
+
+ if (metrics)
+ memcpy(&serializeMetrics, metrics, sizeof(SerializeMetrics));
+ }
/* call the DestReceiver's destroy method even during explain */
dest->rDestroy(dest);
@@ -613,7 +611,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
}
if (bufusage)
- show_buffer_usage(es, bufusage);
+ show_buffer_usage(es, bufusage, NULL);
if (mem_counters)
show_memory_counters(es, mem_counters);
@@ -1019,7 +1017,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics)
ExplainIndentText(es);
if (es->timing)
appendStringInfo(es->str, "Serialization: time=%.3f ms output=" UINT64_FORMAT "kB format=%s\n",
- 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->timeSpent),
+ 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->instr.total),
BYTES_TO_KILOBYTES(metrics->bytesSent),
format);
else
@@ -1027,10 +1025,10 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics)
BYTES_TO_KILOBYTES(metrics->bytesSent),
format);
- if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage))
+ if (es->buffers && peek_buffer_usage(es, &metrics->instr.bufusage))
{
es->indent++;
- show_buffer_usage(es, &metrics->bufferUsage);
+ show_buffer_usage(es, &metrics->instr.bufusage, NULL);
es->indent--;
}
}
@@ -1038,13 +1036,13 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics)
{
if (es->timing)
ExplainPropertyFloat("Time", "ms",
- 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->timeSpent),
+ 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->instr.total),
3, es);
ExplainPropertyUInteger("Output Volume", "kB",
BYTES_TO_KILOBYTES(metrics->bytesSent), es);
ExplainPropertyText("Format", format, es);
if (es->buffers)
- show_buffer_usage(es, &metrics->bufferUsage);
+ show_buffer_usage(es, &metrics->instr.bufusage, NULL);
}
ExplainCloseGroup("Serialization", "Serialization", true, es);
@@ -1101,18 +1099,15 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es)
for (nt = 0; nt < rInfo->ri_TrigDesc->numtriggers; nt++)
{
Trigger *trig = rInfo->ri_TrigDesc->triggers + nt;
- Instrumentation *instr = rInfo->ri_TrigInstrument + nt;
+ TriggerInstrumentation *tginstr = rInfo->ri_TrigInstrument + nt;
char *relname;
char *conname = NULL;
- /* Must clean up instrumentation state */
- InstrEndLoop(instr);
-
/*
* We ignore triggers that were never invoked; they likely aren't
* relevant to the current query type.
*/
- if (instr->ntuples == 0)
+ if (tginstr->firings == 0)
continue;
ExplainOpenGroup("Trigger", NULL, true, es);
@@ -1137,11 +1132,11 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es)
if (show_relname)
appendStringInfo(es->str, " on %s", relname);
if (es->timing)
- appendStringInfo(es->str, ": time=%.3f calls=%.0f\n",
- INSTR_TIME_GET_MILLISEC(instr->total),
- instr->ntuples);
+ appendStringInfo(es->str, ": time=%.3f calls=%d\n",
+ INSTR_TIME_GET_MILLISEC(tginstr->instr.total),
+ tginstr->firings);
else
- appendStringInfo(es->str, ": calls=%.0f\n", instr->ntuples);
+ appendStringInfo(es->str, ": calls=%d\n", tginstr->firings);
}
else
{
@@ -1151,9 +1146,9 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es)
ExplainPropertyText("Relation", relname, es);
if (es->timing)
ExplainPropertyFloat("Time", "ms",
- INSTR_TIME_GET_MILLISEC(instr->total), 3,
+ INSTR_TIME_GET_MILLISEC(tginstr->instr.total), 3,
es);
- ExplainPropertyFloat("Calls", NULL, instr->ntuples, 0, es);
+ ExplainPropertyInteger("Calls", NULL, tginstr->firings, es);
}
if (conname)
@@ -1840,7 +1835,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
{
double nloops = planstate->instrument->nloops;
double startup_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->startup) / nloops;
- double total_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->total) / nloops;
+ double total_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->instr.total) / nloops;
double rows = planstate->instrument->ntuples / nloops;
if (es->format == EXPLAIN_FORMAT_TEXT)
@@ -1893,11 +1888,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
/* prepare per-worker general execution details */
if (es->workers_state && es->verbose)
{
- WorkerInstrumentation *w = planstate->worker_instrument;
+ WorkerNodeInstrumentation *w = planstate->worker_instrument;
for (int n = 0; n < w->num_workers; n++)
{
- Instrumentation *instrument = &w->instrument[n];
+ NodeInstrumentation *instrument = &w->instrument[n];
double nloops = instrument->nloops;
double startup_ms;
double total_ms;
@@ -1906,7 +1901,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (nloops <= 0)
continue;
startup_ms = INSTR_TIME_GET_MILLISEC(instrument->startup) / nloops;
- total_ms = INSTR_TIME_GET_MILLISEC(instrument->total) / nloops;
+ total_ms = INSTR_TIME_GET_MILLISEC(instrument->instr.total) / nloops;
rows = instrument->ntuples / nloops;
ExplainOpenWorker(n, es);
@@ -1975,6 +1970,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
show_indexsearches_info(planstate, es);
+
+ if (es->buffers && planstate->instrument)
+ show_buffer_usage(es, &((IndexScanState *) planstate)->iss_Instrument->table_instr.bufusage, "Table");
break;
case T_IndexOnlyScan:
show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
@@ -1992,6 +1990,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
ExplainPropertyFloat("Heap Fetches", NULL,
planstate->instrument->ntuples2, 0, es);
show_indexsearches_info(planstate, es);
+
+ if (es->buffers && planstate->instrument)
+ show_buffer_usage(es, &((IndexOnlyScanState *) planstate)->ioss_Instrument->table_instr.bufusage, "Table");
break;
case T_BitmapIndexScan:
show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig,
@@ -2293,18 +2294,18 @@ ExplainNode(PlanState *planstate, List *ancestors,
/* Show buffer/WAL usage */
if (es->buffers && planstate->instrument)
- show_buffer_usage(es, &planstate->instrument->bufusage);
+ show_buffer_usage(es, &planstate->instrument->instr.bufusage, NULL);
if (es->wal && planstate->instrument)
- show_wal_usage(es, &planstate->instrument->walusage);
+ show_wal_usage(es, &planstate->instrument->instr.walusage);
/* Prepare per-worker buffer/WAL usage */
if (es->workers_state && (es->buffers || es->wal) && es->verbose)
{
- WorkerInstrumentation *w = planstate->worker_instrument;
+ WorkerNodeInstrumentation *w = planstate->worker_instrument;
for (int n = 0; n < w->num_workers; n++)
{
- Instrumentation *instrument = &w->instrument[n];
+ NodeInstrumentation *instrument = &w->instrument[n];
double nloops = instrument->nloops;
if (nloops <= 0)
@@ -2312,9 +2313,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
ExplainOpenWorker(n, es);
if (es->buffers)
- show_buffer_usage(es, &instrument->bufusage);
+ show_buffer_usage(es, &instrument->instr.bufusage, NULL);
if (es->wal)
- show_wal_usage(es, &instrument->walusage);
+ show_wal_usage(es, &instrument->instr.walusage);
ExplainCloseWorker(n, es);
}
}
@@ -3924,26 +3925,45 @@ show_indexsearches_info(PlanState *planstate, ExplainState *es)
static void
show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es)
{
+ uint64 exact_pages;
+ uint64 lossy_pages;
+
if (!es->analyze)
return;
+ /* Start with leader's stats */
+ exact_pages = planstate->stats.exact_pages;
+ lossy_pages = planstate->stats.lossy_pages;
+
+ /* Accumulate worker stats into node-level totals */
+ if (planstate->sinstrument != NULL)
+ {
+ for (int n = 0; n < planstate->sinstrument->num_workers; n++)
+ {
+ BitmapHeapScanInstrumentation *si = &planstate->sinstrument->sinstrument[n];
+
+ exact_pages += si->exact_pages;
+ lossy_pages += si->lossy_pages;
+ }
+ }
+
if (es->format != EXPLAIN_FORMAT_TEXT)
{
ExplainPropertyUInteger("Exact Heap Blocks", NULL,
- planstate->stats.exact_pages, es);
+ exact_pages, es);
ExplainPropertyUInteger("Lossy Heap Blocks", NULL,
- planstate->stats.lossy_pages, es);
+ lossy_pages, es);
}
else
{
- if (planstate->stats.exact_pages > 0 || planstate->stats.lossy_pages > 0)
+ if (exact_pages > 0 || lossy_pages > 0)
{
ExplainIndentText(es);
appendStringInfoString(es->str, "Heap Blocks:");
- if (planstate->stats.exact_pages > 0)
- appendStringInfo(es->str, " exact=" UINT64_FORMAT, planstate->stats.exact_pages);
- if (planstate->stats.lossy_pages > 0)
- appendStringInfo(es->str, " lossy=" UINT64_FORMAT, planstate->stats.lossy_pages);
+ if (exact_pages > 0)
+ appendStringInfo(es->str, " exact=" UINT64_FORMAT, exact_pages);
+ if (lossy_pages > 0)
+ appendStringInfo(es->str, " lossy=" UINT64_FORMAT, lossy_pages);
appendStringInfoChar(es->str, '\n');
}
}
@@ -4112,7 +4132,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage)
* Show buffer usage details. This better be sync with peek_buffer_usage.
*/
static void
-show_buffer_usage(ExplainState *es, const BufferUsage *usage)
+show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title)
{
if (es->format == EXPLAIN_FORMAT_TEXT)
{
@@ -4137,6 +4157,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
if (has_shared || has_local || has_temp)
{
ExplainIndentText(es);
+ if (title)
+ appendStringInfo(es->str, "%s ", title);
appendStringInfoString(es->str, "Buffers:");
if (has_shared)
@@ -4192,6 +4214,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
if (has_shared_timing || has_local_timing || has_temp_timing)
{
ExplainIndentText(es);
+ if (title)
+ appendStringInfo(es->str, "%s ", title);
appendStringInfoString(es->str, "I/O Timings:");
if (has_shared_timing)
@@ -4233,6 +4257,14 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
}
else
{
+ char *buffers_title = NULL;
+
+ if (title)
+ {
+ buffers_title = psprintf("%s Buffers", title);
+ ExplainOpenGroup(buffers_title, buffers_title, true, es);
+ }
+
ExplainPropertyInteger("Shared Hit Blocks", NULL,
usage->shared_blks_hit, es);
ExplainPropertyInteger("Shared Read Blocks", NULL,
@@ -4253,8 +4285,20 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
usage->temp_blks_read, es);
ExplainPropertyInteger("Temp Written Blocks", NULL,
usage->temp_blks_written, es);
+
+ if (buffers_title)
+ ExplainCloseGroup(buffers_title, buffers_title, true, es);
+
if (track_io_timing)
{
+ char *timings_title = NULL;
+
+ if (title)
+ {
+ timings_title = psprintf("%s I/O Timings", title);
+ ExplainOpenGroup(timings_title, timings_title, true, es);
+ }
+
ExplainPropertyFloat("Shared I/O Read Time", "ms",
INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time),
3, es);
@@ -4273,6 +4317,9 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
ExplainPropertyFloat("Temp I/O Write Time", "ms",
INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time),
3, es);
+
+ if (timings_title)
+ ExplainCloseGroup(timings_title, timings_title, true, es);
}
}
}
diff --git a/src/backend/commands/explain_dr.c b/src/backend/commands/explain_dr.c
index 3c96061cf32ab..9c1b30fb75b73 100644
--- a/src/backend/commands/explain_dr.c
+++ b/src/backend/commands/explain_dr.c
@@ -110,15 +110,11 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self)
MemoryContext oldcontext;
StringInfo buf = &myState->buf;
int natts = typeinfo->natts;
- instr_time start,
- end;
- BufferUsage instr_start;
+ Instrumentation *instr = &myState->metrics.instr;
/* only measure time, buffers if requested */
- if (myState->es->timing)
- INSTR_TIME_SET_CURRENT(start);
- if (myState->es->buffers)
- instr_start = pgBufferUsage;
+ if (instr->need_timer || instr->need_stack)
+ InstrStart(instr);
/* Set or update my derived attribute info, if needed */
if (myState->attrinfo != typeinfo || myState->nattrs != natts)
@@ -186,18 +182,9 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self)
MemoryContextSwitchTo(oldcontext);
MemoryContextReset(myState->tmpcontext);
- /* Update timing data */
- if (myState->es->timing)
- {
- INSTR_TIME_SET_CURRENT(end);
- INSTR_TIME_ACCUM_DIFF(myState->metrics.timeSpent, end, start);
- }
-
- /* Update buffer metrics */
- if (myState->es->buffers)
- BufferUsageAccumDiff(&myState->metrics.bufferUsage,
- &pgBufferUsage,
- &instr_start);
+ /* Stop per-tuple measurement */
+ if (instr->need_timer || instr->need_stack)
+ InstrStop(instr);
return true;
}
@@ -233,9 +220,17 @@ serializeAnalyzeStartup(DestReceiver *self, int operation, TupleDesc typeinfo)
/* The output buffer is re-used across rows, as in printtup.c */
initStringInfo(&receiver->buf);
- /* Initialize results counters */
+ /* Initialize metrics and per-tuple instrumentation */
memset(&receiver->metrics, 0, sizeof(SerializeMetrics));
- INSTR_TIME_SET_ZERO(receiver->metrics.timeSpent);
+ {
+ int instrument_options = 0;
+
+ if (receiver->es->timing)
+ instrument_options |= INSTRUMENT_TIMER;
+ if (receiver->es->buffers)
+ instrument_options |= INSTRUMENT_BUFFERS;
+ InstrInitOptions(&receiver->metrics.instr, instrument_options);
+ }
}
/*
@@ -246,6 +241,8 @@ serializeAnalyzeShutdown(DestReceiver *self)
{
SerializeDestReceiver *receiver = (SerializeDestReceiver *) self;
+ InstrFinalizeChild(&receiver->metrics.instr, instr_stack.current);
+
if (receiver->finfos)
pfree(receiver->finfos);
receiver->finfos = NULL;
@@ -290,22 +287,17 @@ CreateExplainSerializeDestReceiver(ExplainState *es)
}
/*
- * GetSerializationMetrics - collect metrics
+ * GetSerializationMetrics - get serialization metrics
*
- * We have to be careful here since the receiver could be an IntoRel
- * receiver if the subject statement is CREATE TABLE AS. In that
- * case, return all-zeroes stats.
+ * Returns a pointer to the SerializeMetrics inside the dest receiver,
+ * or NULL if the receiver is not a SerializeDestReceiver (e.g. an IntoRel
+ * receiver for CREATE TABLE AS).
*/
-SerializeMetrics
+SerializeMetrics *
GetSerializationMetrics(DestReceiver *dest)
{
- SerializeMetrics empty;
-
if (dest->mydest == DestExplainSerialize)
- return ((SerializeDestReceiver *) dest)->metrics;
-
- memset(&empty, 0, sizeof(SerializeMetrics));
- INSTR_TIME_SET_ZERO(empty.timeSpent);
+ return &((SerializeDestReceiver *) dest)->metrics;
- return empty;
+ return NULL;
}
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index 373e823479466..9ab74c8df0a1b 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -3989,10 +3989,13 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein
tablespaceid = indexRel->rd_rel->reltablespace;
/* Create new index definition based on given index */
- newIndexId = index_concurrently_create_copy(heapRel,
- idx->indexId,
- tablespaceid,
- concurrentName);
+ newIndexId = index_create_copy(heapRel,
+ INDEX_CREATE_CONCURRENT |
+ INDEX_CREATE_SKIP_BUILD |
+ INDEX_CREATE_SUPPRESS_PROGRESS,
+ idx->indexId,
+ tablespaceid,
+ concurrentName);
/*
* Now open the relation of the new index, a session-level lock is
diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c
index 876aad2100aeb..ee8113575882e 100644
--- a/src/backend/commands/prepare.c
+++ b/src/backend/commands/prepare.c
@@ -22,6 +22,7 @@
#include "catalog/pg_type.h"
#include "commands/createas.h"
#include "commands/explain.h"
+#include "executor/instrument.h"
#include "commands/explain_format.h"
#include "commands/explain_state.h"
#include "commands/prepare.h"
@@ -580,14 +581,17 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
ListCell *p;
ParamListInfo paramLI = NULL;
EState *estate = NULL;
- instr_time planstart;
- instr_time planduration;
- BufferUsage bufusage_start,
- bufusage;
+ QueryInstrumentation *plan_instr = NULL;
+ int instrument_options = INSTRUMENT_TIMER;
MemoryContextCounters mem_counters;
MemoryContext planner_ctx = NULL;
MemoryContext saved_ctx = NULL;
+ if (es->buffers)
+ instrument_options |= INSTRUMENT_BUFFERS;
+
+ plan_instr = InstrQueryAlloc(instrument_options);
+
if (es->memory)
{
/* See ExplainOneQuery about this */
@@ -598,9 +602,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
saved_ctx = MemoryContextSwitchTo(planner_ctx);
}
- if (es->buffers)
- bufusage_start = pgBufferUsage;
- INSTR_TIME_SET_CURRENT(planstart);
+ InstrQueryStart(plan_instr);
/* Look it up in the hash table */
entry = FetchPreparedStatement(execstmt->name, true);
@@ -635,8 +637,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
cplan = GetCachedPlan(entry->plansource, paramLI,
CurrentResourceOwner, pstate->p_queryEnv);
- INSTR_TIME_SET_CURRENT(planduration);
- INSTR_TIME_SUBTRACT(planduration, planstart);
+ InstrQueryStopFinalize(plan_instr);
if (es->memory)
{
@@ -644,13 +645,6 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
MemoryContextMemConsumed(planner_ctx, &mem_counters);
}
- /* calc differences of buffer counters. */
- if (es->buffers)
- {
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
- }
-
plan_list = cplan->stmt_list;
/* Explain each query */
@@ -660,7 +654,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
if (pstmt->commandType != CMD_UTILITY)
ExplainOnePlan(pstmt, into, es, query_string, paramLI, pstate->p_queryEnv,
- &planduration, (es->buffers ? &bufusage : NULL),
+ &plan_instr->instr.total, (es->buffers ? &plan_instr->instr.bufusage : NULL),
es->memory ? &mem_counters : NULL);
else
ExplainOneUtility(pstmt->utilityStmt, into, es, pstate, paramLI);
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 0ce2e81f9c2f2..f72c1ac521a3e 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -2139,7 +2139,7 @@ ExecuteTruncateGuts(List *explicit_rels,
rel,
0, /* dummy rangetable index */
NULL,
- 0);
+ NULL);
estate->es_opened_result_relations =
lappend(estate->es_opened_result_relations, resultRelInfo);
resultRelInfo++;
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index 90e94fb8a5a4b..b8b8840345bd2 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -92,7 +92,8 @@ static bool TriggerEnabled(EState *estate, ResultRelInfo *relinfo,
static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata,
int tgindx,
FmgrInfo *finfo,
- Instrumentation *instr,
+ TriggerInstrumentation *instr,
+ QueryInstrumentation *qinstr,
MemoryContext per_tuple_context);
static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
ResultRelInfo *src_partinfo,
@@ -2311,7 +2312,8 @@ static HeapTuple
ExecCallTriggerFunc(TriggerData *trigdata,
int tgindx,
FmgrInfo *finfo,
- Instrumentation *instr,
+ TriggerInstrumentation *instr,
+ QueryInstrumentation *qinstr,
MemoryContext per_tuple_context)
{
LOCAL_FCINFO(fcinfo, 0);
@@ -2346,7 +2348,7 @@ ExecCallTriggerFunc(TriggerData *trigdata,
* If doing EXPLAIN ANALYZE, start charging time to this trigger.
*/
if (instr)
- InstrStartNode(instr + tgindx);
+ InstrStartTrigger(qinstr, instr + tgindx);
/*
* Do the function evaluation in the per-tuple memory context, so that
@@ -2391,10 +2393,10 @@ ExecCallTriggerFunc(TriggerData *trigdata,
/*
* If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count
- * one "tuple returned" (really the number of firings).
+ * the firing of the trigger.
*/
if (instr)
- InstrStopNode(instr + tgindx, 1);
+ InstrStopTrigger(instr + tgindx, 1);
return (HeapTuple) DatumGetPointer(result);
}
@@ -2441,6 +2443,7 @@ ExecBSInsertTriggers(EState *estate, ResultRelInfo *relinfo)
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple)
@@ -2502,6 +2505,7 @@ ExecBRInsertTriggers(EState *estate, ResultRelInfo *relinfo,
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple == NULL)
{
@@ -2606,6 +2610,7 @@ ExecIRInsertTriggers(EState *estate, ResultRelInfo *relinfo,
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple == NULL)
{
@@ -2670,6 +2675,7 @@ ExecBSDeleteTriggers(EState *estate, ResultRelInfo *relinfo)
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple)
@@ -2780,6 +2786,7 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate,
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple == NULL)
{
@@ -2884,6 +2891,7 @@ ExecIRDeleteTriggers(EState *estate, ResultRelInfo *relinfo,
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (rettuple == NULL)
return false; /* Delete was suppressed */
@@ -2942,6 +2950,7 @@ ExecBSUpdateTriggers(EState *estate, ResultRelInfo *relinfo)
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple)
@@ -3094,6 +3103,7 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate,
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple == NULL)
@@ -3258,6 +3268,7 @@ ExecIRUpdateTriggers(EState *estate, ResultRelInfo *relinfo,
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple == NULL)
{
@@ -3316,6 +3327,7 @@ ExecBSTruncateTriggers(EState *estate, ResultRelInfo *relinfo)
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple)
@@ -3947,7 +3959,7 @@ static void AfterTriggerExecute(EState *estate,
ResultRelInfo *dst_relInfo,
TriggerDesc *trigdesc,
FmgrInfo *finfo,
- Instrumentation *instr,
+ TriggerInstrumentation *instr,
MemoryContext per_tuple_context,
TupleTableSlot *trig_tuple_slot1,
TupleTableSlot *trig_tuple_slot2);
@@ -4342,7 +4354,7 @@ AfterTriggerExecute(EState *estate,
ResultRelInfo *src_relInfo,
ResultRelInfo *dst_relInfo,
TriggerDesc *trigdesc,
- FmgrInfo *finfo, Instrumentation *instr,
+ FmgrInfo *finfo, TriggerInstrumentation *instr,
MemoryContext per_tuple_context,
TupleTableSlot *trig_tuple_slot1,
TupleTableSlot *trig_tuple_slot2)
@@ -4383,7 +4395,7 @@ AfterTriggerExecute(EState *estate,
* to include time spent re-fetching tuples in the trigger cost.
*/
if (instr)
- InstrStartNode(instr + tgindx);
+ InstrStartTrigger(estate->es_instrument, instr + tgindx);
/*
* Fetch the required tuple(s).
@@ -4571,6 +4583,7 @@ AfterTriggerExecute(EState *estate,
tgindx,
finfo,
NULL,
+ NULL,
per_tuple_context);
if (rettuple != NULL &&
rettuple != LocTriggerData.tg_trigtuple &&
@@ -4600,10 +4613,10 @@ AfterTriggerExecute(EState *estate,
/*
* If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count
- * one "tuple returned" (really the number of firings).
+ * the firing of the trigger.
*/
if (instr)
- InstrStopNode(instr + tgindx, 1);
+ InstrStopTrigger(instr + tgindx, 1);
}
@@ -4719,7 +4732,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events,
Relation rel = NULL;
TriggerDesc *trigdesc = NULL;
FmgrInfo *finfo = NULL;
- Instrumentation *instr = NULL;
+ TriggerInstrumentation *instr = NULL;
TupleTableSlot *slot1 = NULL,
*slot2 = NULL;
diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c
index 77834b96a21c1..b5fed54fb85c3 100644
--- a/src/backend/commands/vacuumparallel.c
+++ b/src/backend/commands/vacuumparallel.c
@@ -47,9 +47,8 @@
*/
#define PARALLEL_VACUUM_KEY_SHARED 1
#define PARALLEL_VACUUM_KEY_QUERY_TEXT 2
-#define PARALLEL_VACUUM_KEY_BUFFER_USAGE 3
-#define PARALLEL_VACUUM_KEY_WAL_USAGE 4
-#define PARALLEL_VACUUM_KEY_INDEX_STATS 5
+#define PARALLEL_VACUUM_KEY_INSTRUMENTATION 3
+#define PARALLEL_VACUUM_KEY_INDEX_STATS 4
/*
* Shared information among parallel workers. So this is allocated in the DSM
@@ -188,11 +187,8 @@ struct ParallelVacuumState
/* Shared dead items space among parallel vacuum workers */
TidStore *dead_items;
- /* Points to buffer usage area in DSM */
- BufferUsage *buffer_usage;
-
- /* Points to WAL usage area in DSM */
- WalUsage *wal_usage;
+ /* Points to instrumentation area in DSM */
+ Instrumentation *instr;
/*
* False if the index is totally unsuitable target for all parallel
@@ -250,8 +246,7 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes,
PVShared *shared;
TidStore *dead_items;
PVIndStats *indstats;
- BufferUsage *buffer_usage;
- WalUsage *wal_usage;
+ Instrumentation *instr;
bool *will_parallel_vacuum;
Size est_indstats_len;
Size est_shared_len;
@@ -304,18 +299,15 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes,
shm_toc_estimate_keys(&pcxt->estimator, 1);
/*
- * Estimate space for BufferUsage and WalUsage --
- * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE.
+ * Estimate space for Instrumentation --
+ * PARALLEL_VACUUM_KEY_INSTRUMENTATION.
*
* If there are no extensions loaded that care, we could skip this. We
- * have no way of knowing whether anyone's looking at pgBufferUsage or
- * pgWalUsage, so do it unconditionally.
+ * have no way of knowing whether anyone's looking at instrumentation, so
+ * do it unconditionally.
*/
shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
- shm_toc_estimate_keys(&pcxt->estimator, 1);
- shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
shm_toc_estimate_keys(&pcxt->estimator, 1);
/* Finally, estimate PARALLEL_VACUUM_KEY_QUERY_TEXT space */
@@ -396,17 +388,13 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes,
pvs->shared = shared;
/*
- * Allocate space for each worker's BufferUsage and WalUsage; no need to
- * initialize
+ * Allocate space for each worker's Instrumentation; no need to
+ * initialize.
*/
- buffer_usage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, buffer_usage);
- pvs->buffer_usage = buffer_usage;
- wal_usage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_WAL_USAGE, wal_usage);
- pvs->wal_usage = wal_usage;
+ instr = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_INSTRUMENTATION, instr);
+ pvs->instr = instr;
/* Store query string for workers */
if (debug_query_string)
@@ -749,7 +737,7 @@ parallel_vacuum_process_all_indexes(ParallelVacuumState *pvs, int num_index_scan
WaitForParallelWorkersToFinish(pvs->pcxt);
for (int i = 0; i < pvs->pcxt->nworkers_launched; i++)
- InstrAccumParallelQuery(&pvs->buffer_usage[i], &pvs->wal_usage[i]);
+ InstrAccumParallelQuery(&pvs->instr[i]);
}
/*
@@ -1006,8 +994,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
PVIndStats *indstats;
PVShared *shared;
TidStore *dead_items;
- BufferUsage *buffer_usage;
- WalUsage *wal_usage;
+ QueryInstrumentation *instr;
+ Instrumentation *worker_instr;
int nindexes;
char *sharedquery;
ErrorContextCallback errcallback;
@@ -1095,16 +1083,14 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
error_context_stack = &errcallback;
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/* Process indexes to perform vacuum/cleanup */
parallel_vacuum_process_safe_indexes(&pvs);
/* Report buffer/WAL usage during parallel execution */
- buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false);
- wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
- &wal_usage[ParallelWorkerNumber]);
+ worker_instr = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_INSTRUMENTATION, false);
+ InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]);
/* Report any remaining cost-based vacuum delay time */
if (track_cost_delay_timing)
diff --git a/src/backend/executor/README.instrument b/src/backend/executor/README.instrument
new file mode 100644
index 0000000000000..7df837dbc77e8
--- /dev/null
+++ b/src/backend/executor/README.instrument
@@ -0,0 +1,237 @@
+src/backend/executor/README.instrument
+
+Instrumentation
+===============
+
+The instrumentation subsystem measures time, buffer usage and WAL activity
+during query execution and other similar activities. It is used by
+EXPLAIN ANALYZE, pg_stat_statements, and other consumers that need
+activity and/or timing metrics over a section of code.
+
+The design has two central goals:
+
+* Make it cheap to measure activity in a section of code, even when
+ that section is called many times and the aggregate is what is used
+ (as is the case with per-node instrumentation in the executor)
+
+* Ensure nested instrumentation accurately measures activity/timing,
+ even when execution is aborted due to errors being thrown.
+
+The key data structures are defined in src/include/executor/instrument.h
+and the implementation lives in src/backend/executor/instrument.c.
+
+
+Instrumentation Options
+-----------------------
+
+Callers specify what to measure with a bitmask of InstrumentOption flags:
+
+ INSTRUMENT_ROWS -- row counts only (used with NodeInstrumentation)
+ INSTRUMENT_TIMER -- wall-clock timing and row counts
+ INSTRUMENT_BUFFERS -- buffer hit/read/dirtied/written counts and I/O time
+ INSTRUMENT_WAL -- WAL records, FPI, bytes
+
+INSTRUMENT_BUFFERS and INSTRUMENT_WAL utilize the instrumentation stack
+(described below) for efficient handling of counter values.
+
+
+Struct Hierarchy
+----------------
+
+There are the following instrumentation structs, each specialized for a
+different scope:
+
+Instrumentation Base struct. Holds timing and buffer/WAL counters.
+
+QueryInstrumentation Extends Instrumentation for query-level tracking. When
+ stack-based tracking is enabled, it owns a dedicated
+ MemoryContext and uses the ResourceOwner mechanism for
+ abort cleanup.
+
+NodeInstrumentation Extends Instrumentation for per-plan-node statistics
+ (startup time, tuple counts, loop counts, etc).
+
+TriggerInstrumentation Extends Instrumentation with a firing count.
+
+
+Stack-based instrumentation
+===========================
+
+For tracking WAL or buffer usage counters, the specialized stack-based
+instrumentation is used.
+
+A simple approach to measuring buffer/WAL activity in a code section could be
+to have a set of global counters, snapshot all the counters at the start, and
+diff them at the end. But, this is expensive in practice: BufferUsage alone
+has many fields, and the diff must be computed for every InstrStartNode /
+InstrStopNode cycle.
+
+An alternative is to write counter updates directly into the struct that
+should receive them, avoiding the diff. But that has two complexities: low-level
+code, such as the buffer manager, has no direct pointers to higher-level
+structs, such as plan nodes tracking buffer usage. And instrumentation is often
+nested: we might be interested in both the aggregate buffer usage of a query and
+the individual per-node details. Stack-based instrumentation solves both problems:
+
+At all times, there is a stack that tracks which Instrumentation is currently
+active. The stack is represented by instr_stack, a per-backend global
+that holds a dynamic array of Instrumentation pointers. The field
+instr_stack.current always points to the current stack entry that should
+be updated when activity occurs. When the stack array is empty, the
+current pointer points to instr_top.
+
+For example, if a backend has two portals open, the overall nesting of
+Instrumentation and their respective InstrStart/InstrStop calls creates a
+tree-like structure like this:
+
+ Session (instr_top)
+ |
+ +-- Query A (QueryInstrumentation)
+ | |
+ | +-- NestLoop (NodeInstrumentation)
+ | |
+ | +-- Seq Scan A (NodeInstrumentation)
+ | +-- Seq Scan B (NodeInstrumentation)
+ |
+ +-- Query B (QueryInstrumentation)
+ |
+ +-- Seq Scan C (NodeInstrumentation)
+
+While executing Seq Scan B, the stack looks like:
+
+ instr_top (implicit bottom, not in the entries array)
+ 0: Query A
+ 1: NestLoop
+ 2: Seq Scan B <-- instr_stack.current
+
+When no query is running, the stack is empty (stack_size == 0) and
+instr_stack.current points to instr_top.
+
+Any buffer or WAL counter update (via the INSTR_BUFUSAGE_* and
+INSTR_WALUSAGE_* macros in the buffer manager, WAL insertion code, etc.)
+writes directly into instr_stack.current. Each instrumentation node starts
+zeroed, so the values it accumulates while on top of the stack represent
+exactly the activity that occurred during that time.
+
+Every Instrumentation node (except for instr_top) has a target, or parent, it
+will be accumulated into, which is typically the Instrumentation that was the
+current stack entry when it was created.
+
+For example, when Seq Scan A gets finalized in regular execution via ExecutorFinish,
+its instrumentation data gets added to the immediate parent in
+the execution tree, the NestLoop, which will then get added to Query A's
+QueryInstrumentation, which then accumulates to the parent.
+
+While we can typically think of this as a tree, the NodeInstrumentation
+underneath a particular QueryInstrumentation could behave differently --
+for example, it could propagate directly to the QueryInstrumentation, in
+order to not show cumulative numbers in EXPLAIN ANALYZE.
+
+Note these relationships are partially implicit, especially when it comes
+to NodeInstrumentation. Each QueryInstrumentation maintains a list of its
+unfinalized child nodes. The parent of a QueryInstrumentation itself is
+determined by the stack (see below): when a query is finalized or cleaned
+up on abort, its counters are accumulated to whatever entry is then current
+on the stack, which is typically instr_top.
+
+
+Finalization and Abort Safety
+=============================
+
+Finalization is the process of rolling up a node's buffer/WAL counters to
+its parent. In normal execution, nodes are pushed onto the stack when they
+start and popped when they stop; at finalization time their accumulated
+counters are added to the parent.
+
+Due to the use of longjmp for error handling, functions can exit abruptly
+without executing their normal cleanup code. On abort, two things need
+to happen:
+
+1. The stack is reset to the level saved at the start of the aborting
+ (sub-)transaction level. This ensures that we don't later try to update
+ counters on a freed stack entry. We also need to ensure that the stack
+ entry that was current before a particular Instrumentation started, is
+ current again after it stops.
+
+2. Finalize all affected Instrumentation nodes, rolling up their counters
+ to the innermost surviving Instrumentation, so that data is not lost.
+
+For example, if Seq Scan B aborts while the stack is:
+
+ instr_top (implicit bottom)
+ 0: Query A
+ 1: NestLoop
+ 2: Seq Scan B
+
+The abort handler for Query A accumulates all unfinalized children (Seq
+Scan A, Seq Scan B, NestLoop) directly into Query A's counters, then
+unwinds the instrumentation stack and accumulates Query A's counters to
+instr_top.
+
+Note that on abort the children do not accumulate through each other (Seq
+Scan B -> NestLoop -> Query A); they all accumulate directly to their
+parent QueryInstrumentation. This means the order in which children are
+released does not matter -- this is important because ResourceOwner cleanup
+does not guarantee a particular release order. The per-node breakdown is lost,
+but the instrumentation active when the query was started (instr_top in the
+above example) survives the abort, and its counters include the activity.
+
+If multiple QueryInstrumentations are active on the stack (e.g. nested
+portals), the abort handler of each uses InstrStopFinalize() to accumulate
+the statistics to the parent entry of either the entry being released, or a
+previously released entry if it was higher up in the stack, so they compose
+correctly regardless of release order.
+
+There are two mechanisms for achieving abort safety:
+
+* Resource Owner (QueryInstrumentation): registers with the current
+ ResourceOwner at start. On transaction abort, the resource owner system
+ calls the release callback, which walks unfinalized child entries,
+ accumulates their data, unwinds the stack, and destroys the dedicated
+ memory context (freeing the QueryInstrumentation and all child
+ allocations as a unit). This is the recommended approach when the
+ instrumented code already has an appropriate resource owner (e.g. it
+ runs inside a portal). The query executor uses this path.
+
+* PG_FINALLY (base Instrumentation): when no suitable resource owner
+ exists, or when the caller wants to inspect the instrumentation data
+ even after an error, the base Instrumentation can be used with a
+ PG_TRY/PG_FINALLY block that calls InstrStopFinalize().
+
+Both mechanisms add overhead, so neither is suitable for high-frequency
+instrumentation like per-node measurements in the executor. Instead,
+plan node and trigger children rely on their parent QueryInstrumentation
+for abort safety: they are allocated in the parent's memory context and
+registered in its unfinalized-entries list, so the parent's abort handler
+recovers their data automatically. In normal execution, children are
+finalized explicitly by the caller.
+
+Parallel Query
+--------------
+
+Parallel workers get their own QueryInstrumentation so they can measure
+buffer and WAL activity independently, then copy the totals into dynamic
+shared memory at worker shutdown. The leader accumulates these into its
+own stack.
+
+When per-node instrumentation is active, parallel workers skip per-node
+finalization at shutdown to avoid double-counting; the per-node data is
+aggregated separately through InstrAggNode().
+
+
+Memory Handling
+===============
+
+Instrumentation objects that use the stack must survive until finalization
+runs, including the abort case. To ensure this, QueryInstrumentation
+creates a dedicated "Instrumentation" MemoryContext (instr_cxt) as a child
+of TopMemoryContext. All child instrumentation (nodes, triggers) should be
+allocated in this context.
+
+On successful completion, instr_cxt is reparented to CurrentMemoryContext
+so its lifetime is tied to the caller's context. On abort, the
+ResourceOwner cleanup frees it after accumulating the instrumentation data
+to the current stack entry after resetting the stack.
+
+When the stack is not needed (timer/rows only), Instrumentation allocations
+happen in CurrentMemoryContext instead of TopMemoryContext.
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 45e00c6af85de..d0cd34d286c28 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -78,6 +78,7 @@ ExecutorCheckPerms_hook_type ExecutorCheckPerms_hook = NULL;
/* decls for local routines only used within this module */
static void InitPlan(QueryDesc *queryDesc, int eflags);
static void CheckValidRowMarkRel(Relation rel, RowMarkType markType);
+static void ExecFinalizeTriggerInstrumentation(EState *estate);
static void ExecPostprocessPlan(EState *estate);
static void ExecEndPlan(PlanState *planstate, EState *estate);
static void ExecutePlan(QueryDesc *queryDesc,
@@ -247,9 +248,16 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags)
estate->es_snapshot = RegisterSnapshot(queryDesc->snapshot);
estate->es_crosscheck_snapshot = RegisterSnapshot(queryDesc->crosscheck_snapshot);
estate->es_top_eflags = eflags;
- estate->es_instrument = queryDesc->instrument_options;
estate->es_jit_flags = queryDesc->plannedstmt->jitFlags;
+ /*
+ * Set up per-node instrumentation if needed. We do this before InitPlan
+ * so that node and trigger instrumentation can be allocated within the
+ * query's dedicated instrumentation memory context.
+ */
+ if (!estate->es_instrument && queryDesc->instrument_options)
+ estate->es_instrument = InstrQueryAlloc(queryDesc->instrument_options);
+
/*
* Set up an AFTER-trigger statement context, unless told not to, or
* unless it's EXPLAIN-only mode (when ExecutorFinish won't be called).
@@ -331,9 +339,11 @@ standard_ExecutorRun(QueryDesc *queryDesc,
*/
oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
- /* Allow instrumentation of Executor overall runtime */
+ /* Start up instrumentation for this execution run */
if (queryDesc->totaltime)
- InstrStartNode(queryDesc->totaltime);
+ InstrQueryStart(queryDesc->totaltime);
+ if (estate->es_instrument)
+ InstrQueryStart(estate->es_instrument);
/*
* extract information from the query descriptor and the query feature.
@@ -384,8 +394,10 @@ standard_ExecutorRun(QueryDesc *queryDesc,
if (sendTuples)
dest->rShutdown(dest);
+ if (estate->es_instrument)
+ InstrQueryStop(estate->es_instrument);
if (queryDesc->totaltime)
- InstrStopNode(queryDesc->totaltime, estate->es_processed);
+ InstrQueryStop(queryDesc->totaltime);
MemoryContextSwitchTo(oldcontext);
}
@@ -435,7 +447,9 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
/* Allow instrumentation of Executor overall runtime */
if (queryDesc->totaltime)
- InstrStartNode(queryDesc->totaltime);
+ InstrQueryStart(queryDesc->totaltime);
+ if (estate->es_instrument)
+ InstrQueryStart(estate->es_instrument);
/* Run ModifyTable nodes to completion */
ExecPostprocessPlan(estate);
@@ -444,8 +458,32 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS))
AfterTriggerEndQuery(estate);
+ if (estate->es_instrument)
+ {
+ /*
+ * Accumulate per-node and trigger statistics to their respective
+ * parent instrumentation stacks.
+ *
+ * We skip this in parallel workers because their per-node stats are
+ * reported individually via ExecParallelReportInstrumentation, and
+ * the leader's own ExecFinalizeNodeInstrumentation handles
+ * propagation. If we accumulated here, the leader would
+ * double-count: worker parent nodes would already include their
+ * children's stats, and then the leader's accumulation would add the
+ * children again.
+ */
+ if (!IsParallelWorker())
+ {
+ ExecFinalizeNodeInstrumentation(queryDesc->planstate);
+
+ ExecFinalizeTriggerInstrumentation(estate);
+ }
+
+ InstrQueryStopFinalize(estate->es_instrument);
+ }
+
if (queryDesc->totaltime)
- InstrStopNode(queryDesc->totaltime, 0);
+ InstrQueryStopFinalize(queryDesc->totaltime);
MemoryContextSwitchTo(oldcontext);
@@ -1263,7 +1301,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
Relation resultRelationDesc,
Index resultRelationIndex,
ResultRelInfo *partition_root_rri,
- int instrument_options)
+ QueryInstrumentation *qinstr)
{
MemSet(resultRelInfo, 0, sizeof(ResultRelInfo));
resultRelInfo->type = T_ResultRelInfo;
@@ -1284,8 +1322,8 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
palloc0_array(FmgrInfo, n);
resultRelInfo->ri_TrigWhenExprs = (ExprState **)
palloc0_array(ExprState *, n);
- if (instrument_options)
- resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options, false);
+ if (qinstr)
+ resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(qinstr, n);
}
else
{
@@ -1358,6 +1396,10 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
* also provides a way for EXPLAIN ANALYZE to report the runtimes of such
* triggers.) So we make additional ResultRelInfo's as needed, and save them
* in es_trig_target_relations.
+ *
+ * Note: if new relation lists are searched here, they must also be added to
+ * ExecFinalizeTriggerInstrumentation so that trigger instrumentation data
+ * is properly accumulated.
*/
ResultRelInfo *
ExecGetTriggerResultRel(EState *estate, Oid relid,
@@ -1500,6 +1542,30 @@ ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo)
return resultRelInfo->ri_ancestorResultRels;
}
+static void
+ExecFinalizeTriggerInstrumentation(EState *estate)
+{
+ List *rels = NIL;
+
+ rels = list_concat(rels, estate->es_tuple_routing_result_relations);
+ rels = list_concat(rels, estate->es_opened_result_relations);
+ rels = list_concat(rels, estate->es_trig_target_relations);
+
+ foreach_node(ResultRelInfo, rInfo, rels)
+ {
+ TriggerInstrumentation *ti = rInfo->ri_TrigInstrument;
+
+ if (ti == NULL || rInfo->ri_TrigDesc == NULL)
+ continue;
+
+ for (int nt = 0; nt < rInfo->ri_TrigDesc->numtriggers; nt++)
+ {
+ if (ti[nt].instr.need_stack)
+ InstrAccumStack(&estate->es_instrument->instr, &ti[nt].instr);
+ }
+ }
+}
+
/* ----------------------------------------------------------------
* ExecPostprocessPlan
*
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index 755191b51ef66..2e57136edfd9f 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -60,13 +60,12 @@
#define PARALLEL_KEY_EXECUTOR_FIXED UINT64CONST(0xE000000000000001)
#define PARALLEL_KEY_PLANNEDSTMT UINT64CONST(0xE000000000000002)
#define PARALLEL_KEY_PARAMLISTINFO UINT64CONST(0xE000000000000003)
-#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xE000000000000004)
+#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xE000000000000004)
#define PARALLEL_KEY_TUPLE_QUEUE UINT64CONST(0xE000000000000005)
-#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xE000000000000006)
+#define PARALLEL_KEY_NODE_INSTRUMENTATION UINT64CONST(0xE000000000000006)
#define PARALLEL_KEY_DSA UINT64CONST(0xE000000000000007)
#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xE000000000000008)
#define PARALLEL_KEY_JIT_INSTRUMENTATION UINT64CONST(0xE000000000000009)
-#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xE00000000000000A)
#define PARALLEL_TUPLE_QUEUE_SIZE 65536
@@ -87,7 +86,7 @@ typedef struct FixedParallelExecutorState
* instrument_options: Same meaning here as in instrument.c.
*
* instrument_offset: Offset, relative to the start of this structure,
- * of the first Instrumentation object. This will depend on the length of
+ * of the first NodeInstrumentation object. This will depend on the length of
* the plan_node_id array.
*
* num_workers: Number of workers.
@@ -104,11 +103,15 @@ struct SharedExecutorInstrumentation
int num_workers;
int num_plan_nodes;
int plan_node_id[FLEXIBLE_ARRAY_MEMBER];
- /* array of num_plan_nodes * num_workers Instrumentation objects follows */
+
+ /*
+ * array of num_plan_nodes * num_workers NodeInstrumentation objects
+ * follows
+ */
};
#define GetInstrumentationArray(sei) \
(StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \
- (Instrumentation *) (((char *) sei) + sei->instrument_offset))
+ (NodeInstrumentation *) (((char *) sei) + sei->instrument_offset))
/* Context object for ExecParallelEstimate. */
typedef struct ExecParallelEstimateContext
@@ -627,8 +630,6 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
char *pstmt_data;
char *pstmt_space;
char *paramlistinfo_space;
- BufferUsage *bufusage_space;
- WalUsage *walusage_space;
SharedExecutorInstrumentation *instrumentation = NULL;
SharedJitInstrumentation *jit_instrumentation = NULL;
int pstmt_len;
@@ -692,21 +693,14 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
shm_toc_estimate_keys(&pcxt->estimator, 1);
/*
- * Estimate space for BufferUsage.
+ * Estimate space for Instrumentation.
*
* If EXPLAIN is not in use and there are no extensions loaded that care,
* we could skip this. But we have no way of knowing whether anyone's
- * looking at pgBufferUsage, so do it unconditionally.
- */
- shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
- shm_toc_estimate_keys(&pcxt->estimator, 1);
-
- /*
- * Same thing for WalUsage.
+ * looking at instrumentation, so do it unconditionally.
*/
shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
shm_toc_estimate_keys(&pcxt->estimator, 1);
/* Estimate space for tuple queues. */
@@ -731,7 +725,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
instrumentation_len = MAXALIGN(instrumentation_len);
instrument_offset = instrumentation_len;
instrumentation_len +=
- mul_size(sizeof(Instrumentation),
+ mul_size(sizeof(NodeInstrumentation),
mul_size(e.nnodes, nworkers));
shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len);
shm_toc_estimate_keys(&pcxt->estimator, 1);
@@ -792,17 +786,18 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
shm_toc_insert(pcxt->toc, PARALLEL_KEY_PARAMLISTINFO, paramlistinfo_space);
SerializeParamList(estate->es_param_list_info, ¶mlistinfo_space);
- /* Allocate space for each worker's BufferUsage; no need to initialize. */
- bufusage_space = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufusage_space);
- pei->buffer_usage = bufusage_space;
+ /*
+ * Allocate space for each worker's Instrumentation; no need to
+ * initialize.
+ */
+ {
+ Instrumentation *instr;
- /* Same for WalUsage. */
- walusage_space = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage_space);
- pei->wal_usage = walusage_space;
+ instr = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr);
+ pei->instrumentation = instr;
+ }
/* Set up the tuple queues that the workers will write into. */
pei->tqueue = ExecParallelSetupTupleQueues(pcxt, false);
@@ -817,20 +812,20 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
*/
if (estate->es_instrument)
{
- Instrumentation *instrument;
+ NodeInstrumentation *instrument;
int i;
instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len);
- instrumentation->instrument_options = estate->es_instrument;
+ instrumentation->instrument_options = estate->es_instrument->instrument_options;
instrumentation->instrument_offset = instrument_offset;
instrumentation->num_workers = nworkers;
instrumentation->num_plan_nodes = e.nnodes;
instrument = GetInstrumentationArray(instrumentation);
for (i = 0; i < nworkers * e.nnodes; ++i)
- InstrInit(&instrument[i], estate->es_instrument);
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION,
+ InstrInitNode(&instrument[i], estate->es_instrument->instrument_options);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_NODE_INSTRUMENTATION,
instrumentation);
- pei->instrumentation = instrumentation;
+ pei->node_instrumentation = instrumentation;
if (estate->es_jit_flags != PGJIT_NONE)
{
@@ -1059,7 +1054,7 @@ static bool
ExecParallelRetrieveInstrumentation(PlanState *planstate,
SharedExecutorInstrumentation *instrumentation)
{
- Instrumentation *instrument;
+ NodeInstrumentation *instrument;
int i;
int n;
int ibytes;
@@ -1077,19 +1072,33 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
instrument = GetInstrumentationArray(instrumentation);
instrument += i * instrumentation->num_workers;
for (n = 0; n < instrumentation->num_workers; ++n)
+ {
InstrAggNode(planstate->instrument, &instrument[n]);
+ /*
+ * Also add worker WAL usage to the global pgWalUsage counter.
+ *
+ * When per-node instrumentation is active, parallel workers skip
+ * ExecFinalizeNodeInstrumentation (to avoid double-counting in
+ * EXPLAIN), so per-node WAL activity is not rolled up into the
+ * query-level stats that InstrAccumParallelQuery receives. Without
+ * this, pgWalUsage would under-report WAL generated by parallel
+ * workers when instrumentation is active.
+ */
+ WalUsageAdd(&pgWalUsage, &instrument[n].instr.walusage);
+ }
+
/*
* Also store the per-worker detail.
*
- * Worker instrumentation should be allocated in the same context as the
- * regular instrumentation information, which is the per-query context.
- * Switch into per-query memory context.
+ * Ensure worker instrumentation is allocated in the per-query context. We
+ * don't need to place this in the instrumentation context since no more
+ * stack-based instrumentation work is being done.
*/
oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt);
- ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation));
+ ibytes = mul_size(instrumentation->num_workers, sizeof(NodeInstrumentation));
planstate->worker_instrument =
- palloc(ibytes + offsetof(WorkerInstrumentation, instrument));
+ palloc(ibytes + offsetof(WorkerNodeInstrumentation, instrument));
MemoryContextSwitchTo(oldcontext);
planstate->worker_instrument->num_workers = instrumentation->num_workers;
@@ -1218,7 +1227,7 @@ ExecParallelFinish(ParallelExecutorInfo *pei)
* finish, or we might get incomplete data.)
*/
for (i = 0; i < nworkers; i++)
- InstrAccumParallelQuery(&pei->buffer_usage[i], &pei->wal_usage[i]);
+ InstrAccumParallelQuery(&pei->instrumentation[i]);
pei->finished = true;
}
@@ -1232,10 +1241,14 @@ ExecParallelFinish(ParallelExecutorInfo *pei)
void
ExecParallelCleanup(ParallelExecutorInfo *pei)
{
- /* Accumulate instrumentation, if any. */
- if (pei->instrumentation)
+ /* Accumulate node instrumentation, if any. */
+ if (pei->node_instrumentation)
+ {
ExecParallelRetrieveInstrumentation(pei->planstate,
- pei->instrumentation);
+ pei->node_instrumentation);
+
+ ExecFinalizeWorkerInstrumentation(pei->planstate);
+ }
/* Accumulate JIT instrumentation, if any. */
if (pei->jit_instrumentation)
@@ -1319,7 +1332,7 @@ ExecParallelReportInstrumentation(PlanState *planstate,
{
int i;
int plan_node_id = planstate->plan->plan_node_id;
- Instrumentation *instrument;
+ NodeInstrumentation *instrument;
InstrEndLoop(planstate->instrument);
@@ -1458,8 +1471,7 @@ void
ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
{
FixedParallelExecutorState *fpes;
- BufferUsage *buffer_usage;
- WalUsage *wal_usage;
+ QueryInstrumentation *instr;
DestReceiver *receiver;
QueryDesc *queryDesc;
SharedExecutorInstrumentation *instrumentation;
@@ -1474,7 +1486,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
/* Set up DestReceiver, SharedExecutorInstrumentation, and QueryDesc. */
receiver = ExecParallelGetReceiver(seg, toc);
- instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, true);
+ instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_NODE_INSTRUMENTATION, true);
if (instrumentation != NULL)
instrument_options = instrumentation->instrument_options;
jit_instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_JIT_INSTRUMENTATION,
@@ -1518,7 +1530,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
* leader, which also doesn't count buffer accesses and WAL activity that
* occur during executor startup.
*/
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/*
* Run the plan. If we specified a tuple bound, be careful not to demand
@@ -1532,10 +1544,12 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
ExecutorFinish(queryDesc);
/* Report buffer/WAL usage during parallel execution. */
- buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
- wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
- &wal_usage[ParallelWorkerNumber]);
+ {
+ Instrumentation *worker_instr;
+
+ worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false);
+ InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]);
+ }
/* Report instrumentation data if any instrumentation options are set. */
if (instrumentation != NULL)
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index d96d4f9947b79..6f2909a1bc35a 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -1381,7 +1381,7 @@ ExecInitPartitionDispatchInfo(EState *estate,
{
ResultRelInfo *rri = makeNode(ResultRelInfo);
- InitResultRelInfo(rri, rel, 0, rootResultRelInfo, 0);
+ InitResultRelInfo(rri, rel, 0, rootResultRelInfo, NULL);
proute->nonleaf_partitions[dispatchidx] = rri;
}
else
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index d35976925ae76..a59de0ef22b29 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -121,8 +121,9 @@
#include "nodes/nodeFuncs.h"
static TupleTableSlot *ExecProcNodeFirst(PlanState *node);
-static TupleTableSlot *ExecProcNodeInstr(PlanState *node);
static bool ExecShutdownNode_walker(PlanState *node, void *context);
+static bool ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context);
+static bool ExecFinalizeWorkerInstrumentation_walker(PlanState *node, void *context);
/* ------------------------------------------------------------------------
@@ -414,8 +415,8 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
/* Set up instrumentation for this node if requested */
if (estate->es_instrument)
- result->instrument = InstrAlloc(1, estate->es_instrument,
- result->async_capable);
+ result->instrument = InstrAllocNode(estate->es_instrument,
+ result->async_capable);
return result;
}
@@ -463,7 +464,7 @@ ExecProcNodeFirst(PlanState *node)
* have ExecProcNode() directly call the relevant function from now on.
*/
if (node->instrument)
- node->ExecProcNode = ExecProcNodeInstr;
+ node->ExecProcNode = InstrNodeSetupExecProcNode(node->instrument);
else
node->ExecProcNode = node->ExecProcNodeReal;
@@ -471,25 +472,6 @@ ExecProcNodeFirst(PlanState *node)
}
-/*
- * ExecProcNode wrapper that performs instrumentation calls. By keeping
- * this a separate function, we avoid overhead in the normal case where
- * no instrumentation is wanted.
- */
-static TupleTableSlot *
-ExecProcNodeInstr(PlanState *node)
-{
- TupleTableSlot *result;
-
- InstrStartNode(node->instrument);
-
- result = node->ExecProcNodeReal(node);
-
- InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0);
-
- return result;
-}
-
/* ----------------------------------------------------------------
* MultiExecProcNode
@@ -788,10 +770,10 @@ ExecShutdownNode_walker(PlanState *node, void *context)
* at least once already. We don't expect much CPU consumption during
* node shutdown, but in the case of Gather or Gather Merge, we may shut
* down workers at this stage. If so, their buffer usage will get
- * propagated into pgBufferUsage at this point, and we want to make sure
- * that it gets associated with the Gather node. We skip this if the node
- * has never been executed, so as to avoid incorrectly making it appear
- * that it has.
+ * propagated into the current instrumentation stack entry at this point,
+ * and we want to make sure that it gets associated with the Gather node.
+ * We skip this if the node has never been executed, so as to avoid
+ * incorrectly making it appear that it has.
*/
if (node->instrument && node->instrument->running)
InstrStartNode(node->instrument);
@@ -829,6 +811,145 @@ ExecShutdownNode_walker(PlanState *node, void *context)
return false;
}
+/*
+ * ExecFinalizeNodeInstrumentation
+ *
+ * Accumulate instrumentation stats from all execution nodes to their respective
+ * parents (or the original parent instrumentation).
+ *
+ * This must run after the cleanup done by ExecShutdownNode, and not rely on any
+ * resources cleaned up by it. We also expect shutdown actions to have occurred,
+ * e.g. parallel worker instrumentation to have been added to the leader.
+ */
+void
+ExecFinalizeNodeInstrumentation(PlanState *node)
+{
+ (void) ExecFinalizeNodeInstrumentation_walker(node, instr_stack.current);
+}
+
+static bool
+ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context)
+{
+ Instrumentation *parent = (Instrumentation *) context;
+
+ Assert(parent != NULL);
+
+ if (node == NULL)
+ return false;
+
+ Assert(node->instrument != NULL);
+
+ /*
+	 * Recurse into children first (bottom-up accumulation), and accumulate
+	 * to this node's instrumentation as the parent context.
+ */
+ planstate_tree_walker(node, ExecFinalizeNodeInstrumentation_walker,
+ &node->instrument->instr);
+
+ /* IndexScan/IndexOnlyScan have a separate entry to track table access */
+ if (IsA(node, IndexScanState))
+ {
+ IndexScanState *iss = castNode(IndexScanState, node);
+
+ InstrFinalizeChild(&iss->iss_Instrument->table_instr, &node->instrument->instr);
+ }
+ else if (IsA(node, IndexOnlyScanState))
+ {
+ IndexOnlyScanState *ioss = castNode(IndexOnlyScanState, node);
+
+ InstrFinalizeChild(&ioss->ioss_Instrument->table_instr, &node->instrument->instr);
+ }
+
+ InstrFinalizeChild(&node->instrument->instr, parent);
+
+ return false;
+}
+
+/*
+ * ExecFinalizeWorkerInstrumentation
+ *
+ * Accumulate per-worker instrumentation stats from child nodes into their
+ * parents, mirroring what ExecFinalizeNodeInstrumentation does for the
+ * leader's own stats. Without this, per-worker buffer/WAL stats shown by
+ * EXPLAIN (ANALYZE, VERBOSE) would only reflect each node's own direct
+ * activity, not including children.
+ *
+ * This must run after ExecParallelRetrieveInstrumentation has populated
+ * worker_instrument for all nodes in the parallel subtree.
+ */
+void
+ExecFinalizeWorkerInstrumentation(PlanState *node)
+{
+ (void) ExecFinalizeWorkerInstrumentation_walker(node, NULL);
+}
+
+static bool
+ExecFinalizeWorkerInstrumentation_walker(PlanState *node, void *context)
+{
+ PlanState *parent = (PlanState *) context;
+ int num_workers;
+
+ if (node == NULL)
+ return false;
+
+ /*
+ * Recurse into children first (bottom-up accumulation), passing this node
+ * as parent context if it has worker_instrument, otherwise pass through
+ * the previous parent.
+ */
+ planstate_tree_walker(node, ExecFinalizeWorkerInstrumentation_walker,
+ node->worker_instrument ? (void *) node : context);
+
+ if (!node->worker_instrument)
+ return false;
+
+ num_workers = node->worker_instrument->num_workers;
+
+ /*
+ * Fold per-worker IndexScan/IndexOnlyScan table buffer stats into the
+ * per-worker node stats, matching what ExecFinalizeNodeInstrumentation
+ * does for the leader.
+ */
+ if (IsA(node, IndexScanState))
+ {
+ IndexScanState *iss = castNode(IndexScanState, node);
+
+ if (iss->iss_SharedInfo)
+ {
+ int nworkers = Min(num_workers, iss->iss_SharedInfo->num_workers);
+
+ for (int n = 0; n < nworkers; n++)
+ InstrAccumStack(&node->worker_instrument->instrument[n].instr,
+ &iss->iss_SharedInfo->winstrument[n].table_instr);
+ }
+ }
+ else if (IsA(node, IndexOnlyScanState))
+ {
+ IndexOnlyScanState *ioss = castNode(IndexOnlyScanState, node);
+
+ if (ioss->ioss_SharedInfo)
+ {
+ int nworkers = Min(num_workers, ioss->ioss_SharedInfo->num_workers);
+
+ for (int n = 0; n < nworkers; n++)
+ InstrAccumStack(&node->worker_instrument->instrument[n].instr,
+ &ioss->ioss_SharedInfo->winstrument[n].table_instr);
+ }
+ }
+
+ /* Accumulate this node's per-worker stats to parent's per-worker stats */
+ if (parent && parent->worker_instrument)
+ {
+ int parent_workers = parent->worker_instrument->num_workers;
+
+ for (int n = 0; n < Min(num_workers, parent_workers); n++)
+ InstrAccumStack(&parent->worker_instrument->instrument[n].instr,
+ &node->worker_instrument->instrument[n].instr);
+ }
+
+ return false;
+}
+
/*
* ExecSetTupleBound
*
diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c
index 1eb6b9f1f4068..700764daf45d3 100644
--- a/src/backend/executor/execUtils.c
+++ b/src/backend/executor/execUtils.c
@@ -150,7 +150,7 @@ CreateExecutorState(void)
estate->es_total_processed = 0;
estate->es_top_eflags = 0;
- estate->es_instrument = 0;
+ estate->es_instrument = NULL;
estate->es_finished = false;
estate->es_exprcontexts = NIL;
@@ -227,6 +227,15 @@ FreeExecutorState(EState *estate)
estate->es_partition_directory = NULL;
}
+ /*
+ * Make sure the instrumentation context gets freed. This usually gets
+ * re-parented under the per-query context in InstrQueryStopFinalize, but
+ * that won't happen during EXPLAIN (BUFFERS) since ExecutorFinish never
+ * gets called, so we would otherwise leak it in TopMemoryContext.
+ */
+ if (estate->es_instrument && estate->es_instrument->instr.need_stack)
+ MemoryContextDelete(estate->es_instrument->instr_cxt);
+
/*
* Free the per-query memory context, thereby releasing all working
* memory, including the EState node itself.
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index a40610bc2522f..3183f00d6930a 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -16,124 +16,438 @@
#include
#include "executor/instrument.h"
+#include "utils/memutils.h"
+#include "utils/resowner.h"
-BufferUsage pgBufferUsage;
-static BufferUsage save_pgBufferUsage;
WalUsage pgWalUsage;
-static WalUsage save_pgWalUsage;
+Instrumentation instr_top;
+InstrStackState instr_stack = {
+ .stack_space = 0,
+ .stack_size = 0,
+ .entries = NULL,
+ .current = &instr_top,
+};
-static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
-static void WalUsageAdd(WalUsage *dst, WalUsage *add);
-
-
-/* Allocate new instrumentation structure(s) */
-Instrumentation *
-InstrAlloc(int n, int instrument_options, bool async_mode)
+void
+InstrStackGrow(void)
{
- Instrumentation *instr;
+ int space = instr_stack.stack_space;
- /* initialize all fields to zeroes, then modify as needed */
- instr = palloc0(n * sizeof(Instrumentation));
- if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL))
- {
- bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0;
- bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0;
- bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
- int i;
+ Assert(instr_stack.stack_size >= instr_stack.stack_space);
- for (i = 0; i < n; i++)
- {
- instr[i].need_bufusage = need_buffers;
- instr[i].need_walusage = need_wal;
- instr[i].need_timer = need_timer;
- instr[i].async_mode = async_mode;
- }
+ if (instr_stack.entries == NULL)
+ {
+ space = 10; /* Allocate sufficient initial space for
+ * typical activity */
+ instr_stack.entries = MemoryContextAlloc(TopMemoryContext,
+ sizeof(Instrumentation *) * space);
+ }
+ else
+ {
+ space *= 2;
+ instr_stack.entries = repalloc_array(instr_stack.entries, Instrumentation *, space);
}
- return instr;
+ /* Update stack space after allocation succeeded to protect against OOMs */
+ instr_stack.stack_space = space;
+}
+
+/* General purpose instrumentation handling */
+static inline bool
+InstrNeedStack(int instrument_options)
+{
+ return (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL)) != 0;
}
-/* Initialize a pre-allocated instrumentation structure. */
void
-InstrInit(Instrumentation *instr, int instrument_options)
+InstrInitOptions(Instrumentation *instr, int instrument_options)
{
- memset(instr, 0, sizeof(Instrumentation));
- instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0;
- instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0;
+ instr->need_stack = InstrNeedStack(instrument_options);
instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
}
-/* Entry to a plan node */
+static inline void
+InstrStartTimer(Instrumentation *instr)
+{
+ Assert(INSTR_TIME_IS_ZERO(instr->starttime));
+
+ INSTR_TIME_SET_CURRENT(instr->starttime);
+}
+
+static inline void
+InstrStopTimer(Instrumentation *instr)
+{
+ instr_time endtime;
+
+ Assert(!INSTR_TIME_IS_ZERO(instr->starttime));
+
+ INSTR_TIME_SET_CURRENT(endtime);
+ INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime);
+
+ INSTR_TIME_SET_ZERO(instr->starttime);
+}
+
void
-InstrStartNode(Instrumentation *instr)
+InstrStart(Instrumentation *instr)
{
if (instr->need_timer)
- {
- if (!INSTR_TIME_IS_ZERO(instr->starttime))
- elog(ERROR, "InstrStartNode called twice in a row");
- else
- INSTR_TIME_SET_CURRENT(instr->starttime);
- }
+ InstrStartTimer(instr);
+
+ if (instr->need_stack)
+ InstrPushStack(instr);
+}
- /* save buffer usage totals at node entry, if needed */
- if (instr->need_bufusage)
- instr->bufusage_start = pgBufferUsage;
+void
+InstrStop(Instrumentation *instr)
+{
+ if (instr->need_timer)
+ InstrStopTimer(instr);
- if (instr->need_walusage)
- instr->walusage_start = pgWalUsage;
+ if (instr->need_stack)
+ InstrPopStack(instr);
}
-/* Exit from a plan node */
+/*
+ * Stops instrumentation, finalizes the stack entry and accumulates to its parent.
+ *
+ * Note that this intentionally allows passing a stack that is not the current
+ * top, as can happen with PG_FINALLY, or resource owners, which don't have a
+ * guaranteed cleanup order.
+ *
+ * We are careful here to achieve two goals:
+ *
+ * 1) Reset the stack to the parent of whichever of the released stack entries
+ * has the lowest index
+ * 2) Accumulate all instrumentation to the currently active instrumentation,
+ * so that callers get a complete picture of activity, even after an abort
+ */
void
-InstrStopNode(Instrumentation *instr, double nTuples)
+InstrStopFinalize(Instrumentation *instr)
{
- double save_tuplecount = instr->tuplecount;
- instr_time endtime;
+ if (instr->on_stack)
+ {
+ int idx = -1;
- /* count the returned tuples */
- instr->tuplecount += nTuples;
+ for (int i = instr_stack.stack_size - 1; i >= 0; i--)
+ {
+ if (instr_stack.entries[i] == instr)
+ {
+ idx = i;
+ break;
+ }
+ }
+
+ if (idx < 0)
+ elog(ERROR, "instrumentation entry not found on stack");
+
+ /* Clear on_stack for any intermediate entries we're skipping over */
+ for (int i = instr_stack.stack_size - 1; i > idx; i--)
+ instr_stack.entries[i]->on_stack = false;
+
+ while (instr_stack.stack_size > idx + 1)
+ instr_stack.stack_size--;
+
+ InstrPopStack(instr);
+ }
- /* let's update the time only if the timer was requested */
if (instr->need_timer)
- {
- if (INSTR_TIME_IS_ZERO(instr->starttime))
- elog(ERROR, "InstrStopNode called without start");
+ InstrStopTimer(instr);
+
+ InstrAccumStack(instr_stack.current, instr);
+}
- INSTR_TIME_SET_CURRENT(endtime);
- INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime);
+/*
+ * Finalize child instrumentation by accumulating buffer/WAL usage to the
+ * provided instrumentation, which may be the current entry, or one the caller
+ * treats as a parent and will add to the totals later.
+ *
+ * Also deletes the unfinalized entry to avoid double counting in an abort
+ * situation, e.g. during executor finish.
+ */
+void
+InstrFinalizeChild(Instrumentation *instr, Instrumentation *parent)
+{
+ if (instr->need_stack)
+ {
+ if (!dlist_node_is_detached(&instr->unfinalized_entry))
+ dlist_delete_thoroughly(&instr->unfinalized_entry);
- INSTR_TIME_SET_ZERO(instr->starttime);
+ InstrAccumStack(parent, instr);
}
+}
- /* Add delta of buffer usage since entry to node's totals */
- if (instr->need_bufusage)
- BufferUsageAccumDiff(&instr->bufusage,
- &pgBufferUsage, &instr->bufusage_start);
- if (instr->need_walusage)
- WalUsageAccumDiff(&instr->walusage,
- &pgWalUsage, &instr->walusage_start);
+/* Query instrumentation handling */
- /* Is this the first tuple of this cycle? */
- if (!instr->running)
+/*
+ * Use ResourceOwner mechanism to correctly reset instr_stack on abort.
+ */
+static void ResOwnerReleaseInstrumentation(Datum res);
+static const ResourceOwnerDesc instrumentation_resowner_desc =
+{
+ .name = "instrumentation",
+ .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
+ .release_priority = RELEASE_PRIO_INSTRUMENTATION,
+ .ReleaseResource = ResOwnerReleaseInstrumentation,
+ .DebugPrint = NULL, /* default message is fine */
+};
+
+static inline void
+ResourceOwnerRememberInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr)
+{
+ ResourceOwnerRemember(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc);
+}
+
+static inline void
+ResourceOwnerForgetInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr)
+{
+ ResourceOwnerForget(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc);
+}
+
+static void
+ResOwnerReleaseInstrumentation(Datum res)
+{
+ QueryInstrumentation *qinstr = (QueryInstrumentation *) DatumGetPointer(res);
+ MemoryContext instr_cxt = qinstr->instr_cxt;
+ dlist_mutable_iter iter;
+
+ /* Accumulate data from all unfinalized child entries (nodes, triggers) */
+ dlist_foreach_modify(iter, &qinstr->unfinalized_entries)
{
- instr->running = true;
- instr->firsttuple = instr->counter;
+ Instrumentation *child = dlist_container(Instrumentation, unfinalized_entry, iter.cur);
+
+ InstrAccumStack(&qinstr->instr, child);
}
+
+ /* Ensure the stack is reset as expected, and we accumulate to the parent */
+ InstrStopFinalize(&qinstr->instr);
+
+ /*
+ * Destroy the dedicated instrumentation context, which frees the
+ * QueryInstrumentation and all child allocations.
+ */
+ MemoryContextDelete(instr_cxt);
+}
+
+QueryInstrumentation *
+InstrQueryAlloc(int instrument_options)
+{
+ QueryInstrumentation *instr;
+ MemoryContext instr_cxt;
+
+ /*
+ * When the instrumentation stack is used, create a dedicated memory
+ * context for this query's instrumentation allocations. This context is a
+ * child of TopMemoryContext so it survives transaction abort —
+ * ResourceOwner release needs to access it.
+ *
+ * For simpler cases (timer/rows only), use the current memory context.
+ *
+ * All child instrumentation allocations (nodes, triggers, etc) must be
+ * allocated within this context to ensure correct clean up on abort.
+ */
+ if (InstrNeedStack(instrument_options))
+ instr_cxt = AllocSetContextCreate(TopMemoryContext,
+ "Instrumentation",
+ ALLOCSET_SMALL_SIZES);
else
+ instr_cxt = CurrentMemoryContext;
+
+ instr = MemoryContextAllocZero(instr_cxt, sizeof(QueryInstrumentation));
+ instr->instrument_options = instrument_options;
+ instr->instr_cxt = instr_cxt;
+
+ InstrInitOptions(&instr->instr, instrument_options);
+ dlist_init(&instr->unfinalized_entries);
+
+ return instr;
+}
+
+void
+InstrQueryStart(QueryInstrumentation *qinstr)
+{
+ InstrStart(&qinstr->instr);
+
+ if (qinstr->instr.need_stack)
+ {
+ Assert(CurrentResourceOwner != NULL);
+ qinstr->owner = CurrentResourceOwner;
+
+ ResourceOwnerEnlarge(qinstr->owner);
+ ResourceOwnerRememberInstrumentation(qinstr->owner, qinstr);
+ }
+}
+
+void
+InstrQueryStop(QueryInstrumentation *qinstr)
+{
+ InstrStop(&qinstr->instr);
+
+ if (qinstr->instr.need_stack)
{
- /*
- * In async mode, if the plan node hadn't emitted any tuples before,
- * this might be the first tuple
- */
- if (instr->async_mode && save_tuplecount < 1.0)
- instr->firsttuple = instr->counter;
+ Assert(qinstr->owner != NULL);
+ ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr);
+ qinstr->owner = NULL;
}
}
+void
+InstrQueryStopFinalize(QueryInstrumentation *qinstr)
+{
+ InstrStopFinalize(&qinstr->instr);
+
+ if (!qinstr->instr.need_stack)
+ {
+ Assert(qinstr->owner == NULL);
+ return;
+ }
+
+ Assert(qinstr->owner != NULL);
+ ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr);
+ qinstr->owner = NULL;
+
+ /*
+ * Reparent the dedicated instrumentation context under the current memory
+ * context, so that its lifetime is now tied to the caller's context
+ * rather than TopMemoryContext.
+ */
+ MemoryContextSetParent(qinstr->instr_cxt, CurrentMemoryContext);
+}
+
+/*
+ * Register a child Instrumentation entry for abort processing.
+ *
+ * On abort, ResOwnerReleaseInstrumentation will walk the parent's list to
+ * recover buffer/WAL data from entries that were never finalized, in order for
+ * aggregate totals to be accurate despite the query erroring out.
+ */
+void
+InstrQueryRememberChild(QueryInstrumentation *parent, Instrumentation *child)
+{
+ if (child->need_stack)
+ dlist_push_head(&parent->unfinalized_entries, &child->unfinalized_entry);
+}
+
+/* start instrumentation during parallel executor startup */
+QueryInstrumentation *
+InstrStartParallelQuery(void)
+{
+ QueryInstrumentation *qinstr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL);
+
+ InstrQueryStart(qinstr);
+ return qinstr;
+}
+
+/* report usage after parallel executor shutdown */
+void
+InstrEndParallelQuery(QueryInstrumentation *qinstr, Instrumentation *dst)
+{
+ InstrQueryStopFinalize(qinstr);
+ dst->need_stack = qinstr->instr.need_stack;
+ memcpy(&dst->bufusage, &qinstr->instr.bufusage, sizeof(BufferUsage));
+ memcpy(&dst->walusage, &qinstr->instr.walusage, sizeof(WalUsage));
+}
+
+/*
+ * Accumulate work done by parallel workers in the leader's stats.
+ *
+ * Note that what gets added here effectively depends on whether per-node
+ * instrumentation is active. If it's active the parallel worker intentionally
+ * skips ExecFinalizeNodeInstrumentation on executor shutdown, because it would
+ * cause double counting. Instead, this only accumulates any extra activity
+ * outside of nodes.
+ *
+ * Otherwise this is responsible for making sure that the complete query
+ * activity is accumulated.
+ */
+void
+InstrAccumParallelQuery(Instrumentation *instr)
+{
+ InstrAccumStack(instr_stack.current, instr);
+
+ WalUsageAdd(&pgWalUsage, &instr->walusage);
+}
+
+/* Node instrumentation handling */
+
+/* Allocate new node instrumentation structure */
+NodeInstrumentation *
+InstrAllocNode(QueryInstrumentation *qinstr, bool async_mode)
+{
+ NodeInstrumentation *instr = MemoryContextAlloc(qinstr->instr_cxt, sizeof(NodeInstrumentation));
+
+ InstrInitNode(instr, qinstr->instrument_options);
+ instr->async_mode = async_mode;
+
+ InstrQueryRememberChild(qinstr, &instr->instr);
+
+ return instr;
+}
+
+/* Initialize a pre-allocated instrumentation structure. */
+void
+InstrInitNode(NodeInstrumentation *instr, int instrument_options)
+{
+ memset(instr, 0, sizeof(NodeInstrumentation));
+ InstrInitOptions(&instr->instr, instrument_options);
+}
+
+/* Entry to a plan node. If you modify this, check InstrNodeSetupExecProcNode. */
+void
+InstrStartNode(NodeInstrumentation *instr)
+{
+ InstrStart(&instr->instr);
+}
+
+/*
+ * Updates the node instrumentation time counter.
+ *
+ * Note this is different from InstrStop because total is only updated in
+ * InstrEndLoop. We need the separate counter variable because we need to
+ * calculate start-up time for the first tuple in each cycle, and then
+ * accumulate it together.
+ */
+static inline void
+InstrStopNodeTimer(NodeInstrumentation *instr)
+{
+ instr_time endtime;
+
+ Assert(!INSTR_TIME_IS_ZERO(instr->instr.starttime));
+
+ INSTR_TIME_SET_CURRENT(endtime);
+ INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->instr.starttime);
+ INSTR_TIME_SET_ZERO(instr->instr.starttime);
+
+ /*
+ * Is this the first tuple of this cycle?
+ *
+ * In async mode, if the plan node hadn't emitted any tuples before, this
+ * might be the first tuple
+ */
+ if (!instr->running || (instr->async_mode && instr->tuplecount < 1.0))
+ instr->firsttuple = instr->counter;
+}
+
+/* Exit from a plan node. If you modify this, check InstrNodeSetupExecProcNode. */
+void
+InstrStopNode(NodeInstrumentation *instr, double nTuples)
+{
+ if (instr->instr.need_timer)
+ InstrStopNodeTimer(instr);
+
+ /* Only pop the stack, accumulation runs in InstrFinalizeNode */
+ if (instr->instr.need_stack)
+ InstrPopStack(&instr->instr);
+
+ instr->running = true;
+
+ /* count the returned tuples */
+ instr->tuplecount += nTuples;
+}
+
/* Update tuple count */
void
-InstrUpdateTupleCount(Instrumentation *instr, double nTuples)
+InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples)
{
/* count the returned tuples */
instr->tuplecount += nTuples;
@@ -141,47 +455,40 @@ InstrUpdateTupleCount(Instrumentation *instr, double nTuples)
/* Finish a run cycle for a plan node */
void
-InstrEndLoop(Instrumentation *instr)
+InstrEndLoop(NodeInstrumentation *instr)
{
/* Skip if nothing has happened, or already shut down */
if (!instr->running)
return;
- if (!INSTR_TIME_IS_ZERO(instr->starttime))
- elog(ERROR, "InstrEndLoop called on running node");
+	/* Ensure InstrStopNode was called */
+	Assert(INSTR_TIME_IS_ZERO(instr->instr.starttime));
/* Accumulate per-cycle statistics into totals */
INSTR_TIME_ADD(instr->startup, instr->firsttuple);
- INSTR_TIME_ADD(instr->total, instr->counter);
+ INSTR_TIME_ADD(instr->instr.total, instr->counter);
instr->ntuples += instr->tuplecount;
instr->nloops += 1;
/* Reset for next cycle (if any) */
instr->running = false;
- INSTR_TIME_SET_ZERO(instr->starttime);
+ INSTR_TIME_SET_ZERO(instr->instr.starttime);
INSTR_TIME_SET_ZERO(instr->counter);
INSTR_TIME_SET_ZERO(instr->firsttuple);
instr->tuplecount = 0;
}
-/* aggregate instrumentation information */
+/*
+ * Aggregate instrumentation from parallel workers. Must be called after
+ * InstrEndLoop.
+ */
void
-InstrAggNode(Instrumentation *dst, Instrumentation *add)
+InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add)
{
- if (!dst->running && add->running)
- {
- dst->running = true;
- dst->firsttuple = add->firsttuple;
- }
- else if (dst->running && add->running &&
- INSTR_TIME_GT(dst->firsttuple, add->firsttuple))
- dst->firsttuple = add->firsttuple;
+ Assert(!add->running);
- INSTR_TIME_ADD(dst->counter, add->counter);
-
- dst->tuplecount += add->tuplecount;
INSTR_TIME_ADD(dst->startup, add->startup);
- INSTR_TIME_ADD(dst->total, add->total);
+ INSTR_TIME_ADD(dst->instr.total, add->instr.total);
dst->ntuples += add->ntuples;
dst->ntuples2 += add->ntuples2;
dst->nloops += add->nloops;
@@ -189,41 +496,167 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add)
dst->nfiltered2 += add->nfiltered2;
/* Add delta of buffer usage since entry to node's totals */
- if (dst->need_bufusage)
- BufferUsageAdd(&dst->bufusage, &add->bufusage);
+ if (dst->instr.need_stack)
+ InstrAccumStack(&dst->instr, &add->instr);
+}
+
+/*
+ * Specialized handling of instrumented ExecProcNode
+ *
+ * These functions are equivalent to running ExecProcNodeReal wrapped in
+ * InstrStartNode and InstrStopNode, but avoid the conditionals in the hot path
+ * by checking the instrumentation options when the ExecProcNode pointer gets
+ * first set, and then using a special-purpose function for each. This results
+ * in a more optimized set of compiled instructions.
+ */
+
+#include "executor/tuptable.h"
+#include "nodes/execnodes.h"
+
+/* Simplified pop: restore saved state instead of re-deriving from array */
+static inline void
+InstrPopStackTo(Instrumentation *prev)
+{
+ Assert(instr_stack.stack_size > 0);
+ Assert(instr_stack.stack_size > 1 ? instr_stack.entries[instr_stack.stack_size - 2] == prev : &instr_top == prev);
+ instr_stack.entries[instr_stack.stack_size - 1]->on_stack = false;
+ instr_stack.stack_size--;
+ instr_stack.current = prev;
+}
+
+static pg_attribute_always_inline TupleTableSlot *
+ExecProcNodeInstr(PlanState *node, bool need_timer, bool need_stack)
+{
+ NodeInstrumentation *instr = node->instrument;
+ Instrumentation *prev = instr_stack.current;
+ TupleTableSlot *result;
+
+ if (need_stack)
+ InstrPushStack(&instr->instr);
+ if (need_timer)
+ InstrStartTimer(&instr->instr);
+
+ result = node->ExecProcNodeReal(node);
+
+ if (need_timer)
+ InstrStopNodeTimer(instr);
+ if (need_stack)
+ InstrPopStackTo(prev);
+
+ instr->running = true;
+ if (!TupIsNull(result))
+ instr->tuplecount += 1.0;
+
+ return result;
+}
+
+static TupleTableSlot *
+ExecProcNodeInstrFull(PlanState *node)
+{
+ return ExecProcNodeInstr(node, true, true);
+}
- if (dst->need_walusage)
- WalUsageAdd(&dst->walusage, &add->walusage);
+static TupleTableSlot *
+ExecProcNodeInstrRowsStackOnly(PlanState *node)
+{
+ return ExecProcNodeInstr(node, false, true);
+}
+
+static TupleTableSlot *
+ExecProcNodeInstrRowsTimerOnly(PlanState *node)
+{
+ return ExecProcNodeInstr(node, true, false);
+}
+
+static TupleTableSlot *
+ExecProcNodeInstrRowsOnly(PlanState *node)
+{
+ return ExecProcNodeInstr(node, false, false);
+}
+
+/*
+ * Returns an ExecProcNode wrapper that performs instrumentation calls,
+ * tailored to the instrumentation options enabled for the node.
+ */
+ExecProcNodeMtd
+InstrNodeSetupExecProcNode(NodeInstrumentation *instr)
+{
+ bool need_timer = instr->instr.need_timer;
+ bool need_stack = instr->instr.need_stack;
+
+ if (need_timer && need_stack)
+ return ExecProcNodeInstrFull;
+ else if (need_stack)
+ return ExecProcNodeInstrRowsStackOnly;
+ else if (need_timer)
+ return ExecProcNodeInstrRowsTimerOnly;
+ else
+ return ExecProcNodeInstrRowsOnly;
+}
+
+/* Trigger instrumentation handling */
+TriggerInstrumentation *
+InstrAllocTrigger(QueryInstrumentation *qinstr, int n)
+{
+ TriggerInstrumentation *tginstr;
+ int i;
+
+ /*
+ * Allocate in the query's dedicated instrumentation context so all
+ * instrumentation data is grouped together and cleaned up as a unit.
+ */
+ Assert(qinstr != NULL && qinstr->instr_cxt != NULL);
+ tginstr = MemoryContextAllocZero(qinstr->instr_cxt,
+ n * sizeof(TriggerInstrumentation));
+
+ for (i = 0; i < n; i++)
+ InstrInitOptions(&tginstr[i].instr, qinstr->instrument_options);
+
+ return tginstr;
}
-/* note current values during parallel executor startup */
void
-InstrStartParallelQuery(void)
+InstrStartTrigger(QueryInstrumentation *qinstr, TriggerInstrumentation *tginstr)
{
- save_pgBufferUsage = pgBufferUsage;
- save_pgWalUsage = pgWalUsage;
+ InstrStart(&tginstr->instr);
+
+ /*
+ * On first call, register with the parent QueryInstrumentation for abort
+ * recovery.
+ */
+ if (qinstr && tginstr->instr.need_stack &&
+ dlist_node_is_detached(&tginstr->instr.unfinalized_entry))
+ dlist_push_head(&qinstr->unfinalized_entries,
+ &tginstr->instr.unfinalized_entry);
}
-/* report usage after parallel executor shutdown */
void
-InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
+InstrStopTrigger(TriggerInstrumentation *tginstr, int firings)
{
- memset(bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage);
- memset(walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage);
+ /*
+ * This trigger may be called again, so we don't finalize instrumentation
+ * here. Accumulation to the parent happens at ExecutorFinish through
+ * ExecFinalizeTriggerInstrumentation.
+ */
+ InstrStop(&tginstr->instr);
+ tginstr->firings += firings;
}
-/* accumulate work done by workers in leader's stats */
void
-InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
+InstrAccumStack(Instrumentation *dst, Instrumentation *add)
{
- BufferUsageAdd(&pgBufferUsage, bufusage);
- WalUsageAdd(&pgWalUsage, walusage);
+ Assert(dst != NULL);
+ Assert(add != NULL);
+
+ if (!add->need_stack)
+ return;
+
+ BufferUsageAdd(&dst->bufusage, &add->bufusage);
+ WalUsageAdd(&dst->walusage, &add->walusage);
}
/* dst += add */
-static void
+void
BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
{
dst->shared_blks_hit += add->shared_blks_hit;
@@ -244,39 +677,9 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time);
}
-/* dst += add - sub */
+/* dst += add */
void
-BufferUsageAccumDiff(BufferUsage *dst,
- const BufferUsage *add,
- const BufferUsage *sub)
-{
- dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit;
- dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read;
- dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied;
- dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written;
- dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit;
- dst->local_blks_read += add->local_blks_read - sub->local_blks_read;
- dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied;
- dst->local_blks_written += add->local_blks_written - sub->local_blks_written;
- dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read;
- dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written;
- INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time,
- add->shared_blk_read_time, sub->shared_blk_read_time);
- INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time,
- add->shared_blk_write_time, sub->shared_blk_write_time);
- INSTR_TIME_ACCUM_DIFF(dst->local_blk_read_time,
- add->local_blk_read_time, sub->local_blk_read_time);
- INSTR_TIME_ACCUM_DIFF(dst->local_blk_write_time,
- add->local_blk_write_time, sub->local_blk_write_time);
- INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time,
- add->temp_blk_read_time, sub->temp_blk_read_time);
- INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time,
- add->temp_blk_write_time, sub->temp_blk_write_time);
-}
-
-/* helper functions for WAL usage accumulation */
-static void
-WalUsageAdd(WalUsage *dst, WalUsage *add)
+WalUsageAdd(WalUsage *dst, const WalUsage *add)
{
dst->wal_bytes += add->wal_bytes;
dst->wal_records += add->wal_records;
diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c
index 70c55ee6d614d..63e24a0bcd4cf 100644
--- a/src/backend/executor/nodeBitmapIndexscan.c
+++ b/src/backend/executor/nodeBitmapIndexscan.c
@@ -276,7 +276,7 @@ ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags)
/* Set up instrumentation of bitmap index scans if requested */
if (estate->es_instrument)
- indexstate->biss_Instrument = palloc0_object(IndexScanInstrumentation);
+ indexstate->biss_Instrument = MemoryContextAllocZero(estate->es_instrument->instr_cxt, sizeof(IndexScanInstrumentation));
/* Open the index relation. */
lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode;
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index de6154fd54139..9e64ce2bd2da7 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -67,6 +67,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
IndexScanDesc scandesc;
TupleTableSlot *slot;
ItemPointer tid;
+ Instrumentation *table_instr = NULL;
/*
* extract necessary information from index scan node
@@ -83,6 +84,9 @@ IndexOnlyNext(IndexOnlyScanState *node)
econtext = node->ss.ps.ps_ExprContext;
slot = node->ss.ss_ScanTupleSlot;
+ if (node->ioss_Instrument && node->ioss_Instrument->table_instr.need_stack)
+ table_instr = &node->ioss_Instrument->table_instr;
+
if (scandesc == NULL)
{
/*
@@ -165,11 +169,22 @@ IndexOnlyNext(IndexOnlyScanState *node)
ItemPointerGetBlockNumber(tid),
&node->ioss_VMBuffer))
{
+ bool found;
+
/*
* Rats, we have to visit the heap to check visibility.
*/
InstrCountTuples2(node, 1);
- if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
+
+ if (table_instr)
+ InstrPushStack(table_instr);
+
+ found = index_fetch_heap(scandesc, node->ioss_TableSlot);
+
+ if (table_instr)
+ InstrPopStack(table_instr);
+
+ if (!found)
continue; /* no visible tuple, try next index entry */
ExecClearTuple(node->ioss_TableSlot);
@@ -436,6 +451,7 @@ ExecEndIndexOnlyScan(IndexOnlyScanState *node)
* which will have a new IndexOnlyScanState and zeroed stats.
*/
winstrument->nsearches += node->ioss_Instrument->nsearches;
+ InstrAccumStack(&winstrument->table_instr, &node->ioss_Instrument->table_instr);
}
/*
@@ -610,7 +626,21 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
/* Set up instrumentation of index-only scans if requested */
if (estate->es_instrument)
- indexstate->ioss_Instrument = palloc0_object(IndexScanInstrumentation);
+ {
+ indexstate->ioss_Instrument = MemoryContextAllocZero(estate->es_instrument->instr_cxt, sizeof(IndexScanInstrumentation));
+
+ /*
+ * Track table and index access separately. We intentionally don't
+ * collect timing (even if enabled), since we don't need it, and
+ * IndexOnlyNext calls InstrPushStack / InstrPopStack (instead of the
+ * full InstrNode*) to reduce overhead.
+ */
+ if ((estate->es_instrument->instrument_options & INSTRUMENT_BUFFERS) != 0)
+ {
+ InstrInitOptions(&indexstate->ioss_Instrument->table_instr, INSTRUMENT_BUFFERS);
+ InstrQueryRememberChild(estate->es_instrument, &indexstate->ioss_Instrument->table_instr);
+ }
+ }
/* Open the index relation. */
lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode;
@@ -899,4 +929,11 @@ ExecIndexOnlyScanRetrieveInstrumentation(IndexOnlyScanState *node)
SharedInfo->num_workers * sizeof(IndexScanInstrumentation);
node->ioss_SharedInfo = palloc(size);
memcpy(node->ioss_SharedInfo, SharedInfo, size);
+
+ /* Aggregate workers' table buffer/WAL usage into leader's entry */
+ for (int i = 0; i < node->ioss_SharedInfo->num_workers; i++)
+ {
+ InstrAccumStack(&node->ioss_Instrument->table_instr,
+ &node->ioss_SharedInfo->winstrument[i].table_instr);
+ }
}
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 1620d14607173..02ef9d124a368 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -85,7 +85,10 @@ IndexNext(IndexScanState *node)
ExprContext *econtext;
ScanDirection direction;
IndexScanDesc scandesc;
+ ItemPointer tid;
TupleTableSlot *slot;
+ bool found;
+ Instrumentation *table_instr = NULL;
/*
* extract necessary information from index scan node
@@ -102,6 +105,9 @@ IndexNext(IndexScanState *node)
econtext = node->ss.ps.ps_ExprContext;
slot = node->ss.ss_ScanTupleSlot;
+ if (node->iss_Instrument && node->iss_Instrument->table_instr.need_stack)
+ table_instr = &node->iss_Instrument->table_instr;
+
if (scandesc == NULL)
{
/*
@@ -132,8 +138,24 @@ IndexNext(IndexScanState *node)
/*
* ok, now that we have what we need, fetch the next tuple.
*/
- while (index_getnext_slot(scandesc, direction, slot))
+ while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
{
+ if (table_instr)
+ InstrPushStack(table_instr);
+
+ for (;;)
+ {
+ found = index_fetch_heap(scandesc, slot);
+ if (found || !scandesc->xs_heap_continue)
+ break;
+ }
+
+ if (table_instr)
+ InstrPopStack(table_instr);
+
+ if (unlikely(!found))
+ continue;
+
CHECK_FOR_INTERRUPTS();
/*
@@ -181,6 +203,7 @@ IndexNextWithReorder(IndexScanState *node)
Datum *lastfetched_vals;
bool *lastfetched_nulls;
int cmp;
+ Instrumentation *table_instr = NULL;
estate = node->ss.ps.state;
@@ -200,6 +223,9 @@ IndexNextWithReorder(IndexScanState *node)
econtext = node->ss.ps.ps_ExprContext;
slot = node->ss.ss_ScanTupleSlot;
+ if (node->iss_Instrument && node->iss_Instrument->table_instr.need_stack)
+ table_instr = &node->iss_Instrument->table_instr;
+
if (scandesc == NULL)
{
/*
@@ -263,36 +289,67 @@ IndexNextWithReorder(IndexScanState *node)
}
/*
- * Fetch next tuple from the index.
+ * Fetch next valid tuple from the index.
*/
-next_indextuple:
- if (!index_getnext_slot(scandesc, ForwardScanDirection, slot))
+ for (;;)
{
+ ItemPointer tid;
+ bool found;
+
+ /* Time to fetch the next TID from the index */
+ tid = index_getnext_tid(scandesc, ForwardScanDirection);
+
+ /* If we're out of index entries, we're done */
+ if (tid == NULL)
+ {
+ /*
+ * No more tuples from the index. But we still need to drain
+ * any remaining tuples from the queue before we're done.
+ */
+ node->iss_ReachedEnd = true;
+ break;
+ }
+
+ Assert(ItemPointerEquals(tid, &scandesc->xs_heaptid));
+
+ if (table_instr)
+ InstrPushStack(table_instr);
+
+ for (;;)
+ {
+ found = index_fetch_heap(scandesc, slot);
+ if (found || !scandesc->xs_heap_continue)
+ break;
+ }
+
+ if (table_instr)
+ InstrPopStack(table_instr);
+
/*
- * No more tuples from the index. But we still need to drain any
- * remaining tuples from the queue before we're done.
+ * If the index was lossy, we have to recheck the index quals and
+ * ORDER BY expressions using the fetched tuple.
*/
- node->iss_ReachedEnd = true;
- continue;
- }
-
- /*
- * If the index was lossy, we have to recheck the index quals and
- * ORDER BY expressions using the fetched tuple.
- */
- if (scandesc->xs_recheck)
- {
- econtext->ecxt_scantuple = slot;
- if (!ExecQualAndReset(node->indexqualorig, econtext))
+ if (found && scandesc->xs_recheck)
{
- /* Fails recheck, so drop it and loop back for another */
- InstrCountFiltered2(node, 1);
- /* allow this loop to be cancellable */
- CHECK_FOR_INTERRUPTS();
- goto next_indextuple;
+ econtext->ecxt_scantuple = slot;
+ if (!ExecQualAndReset(node->indexqualorig, econtext))
+ {
+ /* Fails recheck, so drop it and loop back for another */
+ InstrCountFiltered2(node, 1);
+ /* allow this loop to be cancellable */
+ CHECK_FOR_INTERRUPTS();
+ continue;
+ }
}
+
+ if (found)
+ break;
}
+ /* No more index entries, re-run to clear the reorder queue */
+ if (node->iss_ReachedEnd)
+ continue;
+
if (scandesc->xs_recheckorderby)
{
econtext->ecxt_scantuple = slot;
@@ -818,6 +875,7 @@ ExecEndIndexScan(IndexScanState *node)
* which will have a new IndexOnlyScanState and zeroed stats.
*/
winstrument->nsearches += node->iss_Instrument->nsearches;
+ InstrAccumStack(&winstrument->table_instr, &node->iss_Instrument->table_instr);
}
/*
@@ -980,7 +1038,21 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
/* Set up instrumentation of index scans if requested */
if (estate->es_instrument)
- indexstate->iss_Instrument = palloc0_object(IndexScanInstrumentation);
+ {
+ indexstate->iss_Instrument = MemoryContextAllocZero(estate->es_instrument->instr_cxt, sizeof(IndexScanInstrumentation));
+
+ /*
+ * Track table and index access separately. We intentionally don't
+ * collect timing (even if enabled), since we don't need it, and
+ * IndexNext / IndexNextWithReorder call InstrPushStack /
+ * InstrPopStack (instead of the full InstrNode*) to reduce overhead.
+ */
+ if ((estate->es_instrument->instrument_options & INSTRUMENT_BUFFERS) != 0)
+ {
+ InstrInitOptions(&indexstate->iss_Instrument->table_instr, INSTRUMENT_BUFFERS);
+ InstrQueryRememberChild(estate->es_instrument, &indexstate->iss_Instrument->table_instr);
+ }
+ }
/* Open the index relation. */
lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode;
@@ -1834,4 +1906,11 @@ ExecIndexScanRetrieveInstrumentation(IndexScanState *node)
SharedInfo->num_workers * sizeof(IndexScanInstrumentation);
node->iss_SharedInfo = palloc(size);
memcpy(node->iss_SharedInfo, SharedInfo, size);
+
+ /* Aggregate workers' table buffer/WAL usage into leader's entry */
+ for (int i = 0; i < node->iss_SharedInfo->num_workers; i++)
+ {
+ InstrAccumStack(&node->iss_Instrument->table_instr,
+ &node->iss_SharedInfo->winstrument[i].table_instr);
+ }
}
diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c
index b38170f0fbe99..3ca0a7a635dba 100644
--- a/src/backend/replication/logical/worker.c
+++ b/src/backend/replication/logical/worker.c
@@ -904,7 +904,7 @@ create_edata_for_relation(LogicalRepRelMapEntry *rel)
* Use Relation opened by logicalrep_rel_open() instead of opening it
* again.
*/
- InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, 0);
+ InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, NULL);
/*
* We put the ResultRelInfo in the es_opened_result_relations list, even
diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c
index 39984df31b458..9f76d2683c0c9 100644
--- a/src/backend/storage/aio/method_io_uring.c
+++ b/src/backend/storage/aio/method_io_uring.c
@@ -409,7 +409,6 @@ static int
pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
{
struct io_uring *uring_instance = &pgaio_my_uring_context->io_uring_ring;
- int in_flight_before = dclist_count(&pgaio_my_backend->in_flight_ios);
Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
@@ -425,27 +424,6 @@ pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
pgaio_io_prepare_submit(ioh);
pgaio_uring_sq_from_io(ioh, sqe);
-
- /*
- * io_uring executes IO in process context if possible. That's
- * generally good, as it reduces context switching. When performing a
- * lot of buffered IO that means that copying between page cache and
- * userspace memory happens in the foreground, as it can't be
- * offloaded to DMA hardware as is possible when using direct IO. When
- * executing a lot of buffered IO this causes io_uring to be slower
- * than worker mode, as worker mode parallelizes the copying. io_uring
- * can be told to offload work to worker threads instead.
- *
- * If an IO is buffered IO and we already have IOs in flight or
- * multiple IOs are being submitted, we thus tell io_uring to execute
- * the IO in the background. We don't do so for the first few IOs
- * being submitted as executing in this process' context has lower
- * latency.
- */
- if (in_flight_before > 4 && (ioh->flags & PGAIO_HF_BUFFERED))
- io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);
-
- in_flight_before++;
}
while (true)
@@ -701,10 +679,65 @@ pgaio_uring_check_one(PgAioHandle *ioh, uint64 ref_generation)
LWLockRelease(&owner_context->completion_lock);
}
+/*
+ * io_uring executes IO in process context if possible. That's generally good,
+ * as it reduces context switching. When performing a lot of buffered IO that
+ * means that copying between page cache and userspace memory happens in the
+ * foreground, as it can't be offloaded to DMA hardware as is possible when
+ * using direct IO. When executing a lot of buffered IO this causes io_uring
+ * to be slower than worker mode, as worker mode parallelizes the
+ * copying. io_uring can be told to offload work to worker threads instead.
+ *
+ * If the IOs are small, we only benefit from forcing things into the
+ * background if there is a lot of IO, as otherwise the overhead from context
+ * switching is higher than the gain.
+ *
+ * If IOs are large, there is benefit from asynchronous processing at lower
+ * queue depths, as IO latency is less of a crucial factor and parallelizing
+ * memory copies is more important. In addition, it is important to trigger
+ * asynchronous processing even at low queue depth, as with foreground
+ * processing we might never actually reach deep enough IO depths to trigger
+ * asynchronous processing, which in turn would deprive readahead control
+ * logic of information about whether a deeper look-ahead distance would be
+ * advantageous.
+ *
+ * We have done some basic benchmarking to validate the thresholds used, but
+ * it's quite plausible that there are better values. See
+ * https://postgr.es/m/3gkuvs3lz3u3skuaxfkxnsysfqslf2srigl6546vhesekve6v2%40va3r5esummvg
+ * for some details of this benchmarking.
+ */
+static bool
+pgaio_uring_should_use_async(PgAioHandle *ioh, size_t io_size)
+{
+ /*
+ * With DIO there's no benefit from forcing asynchronous processing, as
+ * io_uring will never execute direct IO synchronously during submission.
+ */
+ if (!(ioh->flags & PGAIO_HF_BUFFERED))
+ return false;
+
+ /*
+ * Once the IO queue depth is not that shallow anymore, the overhead of
+ * dispatching to the background is a less significant factor.
+ */
+ if (dclist_count(&pgaio_my_backend->in_flight_ios) > 4)
+ return true;
+
+ /*
+ * If the IO is larger, the gains from parallelizing the memory copy are
+ * larger and typically the impact of the latency is smaller.
+ */
+ if (io_size >= (BLCKSZ * 4))
+ return true;
+
+ return false;
+}
+
static void
pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe)
{
struct iovec *iov;
+ size_t io_size = 0;
switch ((PgAioOp) ioh->op)
{
@@ -717,6 +750,8 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe)
iov->iov_base,
iov->iov_len,
ioh->op_data.read.offset);
+
+ io_size = iov->iov_len;
}
else
{
@@ -726,7 +761,13 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe)
ioh->op_data.read.iov_length,
ioh->op_data.read.offset);
+ for (int i = 0; i < ioh->op_data.read.iov_length; i++, iov++)
+ io_size += iov->iov_len;
}
+
+ if (pgaio_uring_should_use_async(ioh, io_size))
+ io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);
+
break;
case PGAIO_OP_WRITEV:
@@ -747,6 +788,12 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe)
ioh->op_data.write.iov_length,
ioh->op_data.write.offset);
}
+
+ /*
+ * For now don't trigger use of IOSQE_ASYNC for writes; it's not
+ * clear there is a performance benefit in doing so.
+ */
+
break;
case PGAIO_OP_INVALID:
diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c
index efe38e9f1134f..e24357a7a0a23 100644
--- a/src/backend/storage/aio/method_worker.c
+++ b/src/backend/storage/aio/method_worker.c
@@ -239,8 +239,8 @@ pgaio_worker_needs_synchronous_execution(PgAioHandle *ioh)
|| !pgaio_io_can_reopen(ioh);
}
-static void
-pgaio_worker_submit_internal(int num_staged_ios, PgAioHandle **staged_ios)
+static int
+pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
{
PgAioHandle **synchronous_ios = NULL;
int nsync = 0;
@@ -249,6 +249,9 @@ pgaio_worker_submit_internal(int num_staged_ios, PgAioHandle **staged_ios)
Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
+ for (int i = 0; i < num_staged_ios; i++)
+ pgaio_io_prepare_submit(staged_ios[i]);
+
if (LWLockConditionalAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE))
{
for (int i = 0; i < num_staged_ios; ++i)
@@ -299,19 +302,6 @@ pgaio_worker_submit_internal(int num_staged_ios, PgAioHandle **staged_ios)
pgaio_io_perform_synchronously(synchronous_ios[i]);
}
}
-}
-
-static int
-pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
-{
- for (int i = 0; i < num_staged_ios; i++)
- {
- PgAioHandle *ioh = staged_ios[i];
-
- pgaio_io_prepare_submit(ioh);
- }
-
- pgaio_worker_submit_internal(num_staged_ios, staged_ios);
return num_staged_ios;
}
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index 31f9e35dee310..0b6cdf7c8730d 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -18,11 +18,13 @@
* to StartReadBuffers() so that a new one can begin to form.
*
* The algorithm for controlling the look-ahead distance is based on recent
- * cache hit and miss history. When no I/O is necessary, there is no benefit
- * in looking ahead more than one block. This is the default initial
- * assumption, but when blocks needing I/O are streamed, the distance is
- * increased rapidly to try to benefit from I/O combining and concurrency. It
- * is reduced gradually when cached blocks are streamed.
+ * cache hit / miss history, as well as whether we need to wait for I/O completion
+ * after a miss. When no I/O is necessary, there is no benefit in looking
+ * ahead more than one block. This is the default initial assumption. When
+ * blocks needing I/O are streamed, the combine distance is increased to
+ * benefit from I/O combining and the read-ahead distance is increased
+ * whenever we need to wait for I/O to try to benefit from increased I/O
+ * concurrency. Both are reduced gradually when cached blocks are streamed.
*
* The main data structure is a circular queue of buffers of size
* max_pinned_buffers plus some extra space for technical reasons, ready to be
@@ -98,10 +100,23 @@ struct ReadStream
int16 max_pinned_buffers;
int16 forwarded_buffers;
int16 pinned_buffers;
- int16 distance;
+
+ /*
+ * Limit of how far, in blocks, to look-ahead for IO combining and for
+ * read-ahead.
+ *
+ * The limits for read-ahead and combining are handled separately to allow
+ * for IO combining even in cases where the I/O subsystem can keep up at a
+ * low read-ahead distance, as doing larger IOs is more efficient.
+ *
+ * Set to 0 when the end of the stream is reached.
+ */
+ int16 combine_distance;
+ int16 readahead_distance;
uint16 distance_decay_holdoff;
int16 initialized_buffers;
- int16 resume_distance;
+ int16 resume_readahead_distance;
+ int16 resume_combine_distance;
int read_buffers_flags;
bool sync_mode; /* using io_method=sync */
bool batch_mode; /* READ_STREAM_USE_BATCHING */
@@ -332,8 +347,8 @@ read_stream_start_pending_read(ReadStream *stream)
/* Shrink distance: no more look-ahead until buffers are released. */
new_distance = stream->pinned_buffers + buffer_limit;
- if (stream->distance > new_distance)
- stream->distance = new_distance;
+ if (stream->readahead_distance > new_distance)
+ stream->readahead_distance = new_distance;
/* Unless we have nothing to give the consumer, stop here. */
if (stream->pinned_buffers > 0)
@@ -374,12 +389,29 @@ read_stream_start_pending_read(ReadStream *stream)
* perform IO asynchronously when starting out with a small look-ahead
* distance.
*/
- if (stream->distance > 1 && stream->ios_in_progress == 0)
+ if (stream->ios_in_progress == 0)
{
- if (stream->distance_decay_holdoff == 0)
- stream->distance--;
- else
+ if (stream->distance_decay_holdoff > 0)
stream->distance_decay_holdoff--;
+ else
+ {
+ if (stream->readahead_distance > 1)
+ stream->readahead_distance--;
+
+ /*
+ * For now we reduce the IO combine distance after
+ * sufficiently many buffer hits. There is no clear
+ * performance argument for doing so, but at the moment we
+ * need to do so to make the entrance into fast_path work
+ * correctly: We require combine_distance == 1 to enter
+ * fast-path, as without that condition we would wrongly
+ * re-enter fast-path when readahead_distance == 1 and
+ * pinned_buffers == 1, as we would not yet have prepared
+ * another IO in that situation.
+ */
+ if (stream->combine_distance > 1)
+ stream->combine_distance--;
+ }
}
}
else
@@ -440,6 +472,114 @@ read_stream_start_pending_read(ReadStream *stream)
return true;
}
+/*
+ * Should we continue to perform look ahead? Looking ahead may allow us to
+ * make the pending IO larger via IO combining or to issue more read-ahead.
+ */
+static inline bool
+read_stream_should_look_ahead(ReadStream *stream)
+{
+ /* If the callback has signaled end-of-stream, we're done */
+ if (stream->readahead_distance == 0)
+ return false;
+
+ /* never start more IOs than our cap */
+ if (stream->ios_in_progress >= stream->max_ios)
+ return false;
+
+ /*
+ * Allow looking further ahead if we are in the process of building a
+ * larger IO, the IO is not yet big enough, and we don't yet have IO in
+ * flight.
+ *
+ * We do so to allow building larger reads when readahead_distance is
+ * small (e.g. because the I/O subsystem is keeping up or
+ * effective_io_concurrency is small). That's a useful goal because larger
+ * reads are more CPU efficient than smaller reads, even if the system is
+ * not IO bound.
+ *
+ * The reason we do *not* do so when we already have a read prepared (i.e.
+ * why we check for pinned_buffers == 0) is that once we are actually reading
+ * ahead, we don't need it:
+ *
+ * - We won't issue unnecessarily small reads as
+ * read_stream_should_issue_now() will return false until the IO is
+ * suitably sized. The issuance of the pending read will be delayed until
+ * enough buffers have been consumed.
+ *
+ * - If we are not reading ahead aggressively enough, future
+ * WaitReadBuffers() calls will return true, leading to readahead_distance
+ * being increased. After that more full-sized IOs can be issued.
+ *
+ * Furthermore, if we did not have the pinned_buffers == 0 condition, we
+ * might end up issuing I/O more aggressively than we need.
+ *
+ * Note that a return of true here can lead to exceeding the read-ahead
+ * limit, but we won't exceed the buffer pin limit (because pinned_buffers
+ * == 0 and combine_distance is capped by max_pinned_buffers).
+ */
+ if (stream->pending_read_nblocks > 0 &&
+ stream->pinned_buffers == 0 &&
+ stream->pending_read_nblocks < stream->combine_distance)
+ return true;
+
+ /*
+ * Don't start more read-ahead if that'd put us over the distance limit
+ * for doing read-ahead. As stream->readahead_distance is capped by
+ * max_pinned_buffers, this prevents us from looking ahead so far that it
+ * would put us over the pin limit.
+ */
+ if (stream->pinned_buffers + stream->pending_read_nblocks >= stream->readahead_distance)
+ return false;
+
+ return true;
+}
+
+/*
+ * We don't start the pending read just because we've hit the distance limit,
+ * preferring to give it another chance to grow to full io_combine_limit size
+ * once more buffers have been consumed. But this is not desirable in all
+ * situations - see below.
+ */
+static inline bool
+read_stream_should_issue_now(ReadStream *stream)
+{
+ int16 pending_read_nblocks = stream->pending_read_nblocks;
+
+ /* there is no pending IO that could be issued */
+ if (pending_read_nblocks == 0)
+ return false;
+
+ /* never start more IOs than our cap */
+ if (stream->ios_in_progress >= stream->max_ios)
+ return false;
+
+ /*
+ * If the callback has signaled end-of-stream, start the pending read
+ * immediately. There is no further potential for IO combining.
+ */
+ if (stream->readahead_distance == 0)
+ return true;
+
+ /*
+ * If we've already reached combine_distance, there's no chance of growing
+ * the read further.
+ */
+ if (pending_read_nblocks >= stream->combine_distance)
+ return true;
+
+ /*
+ * If we currently have no reads in flight or prepared, issue the IO once
+ * we are not looking ahead further. This ensures there's always at least
+ * one IO prepared.
+ */
+ if (stream->pinned_buffers == 0 &&
+ !read_stream_should_look_ahead(stream))
+ return true;
+
+ return false;
+}
+
static void
read_stream_look_ahead(ReadStream *stream)
{
@@ -452,14 +592,13 @@ read_stream_look_ahead(ReadStream *stream)
if (stream->batch_mode)
pgaio_enter_batchmode();
- while (stream->ios_in_progress < stream->max_ios &&
- stream->pinned_buffers + stream->pending_read_nblocks < stream->distance)
+ while (read_stream_should_look_ahead(stream))
{
BlockNumber blocknum;
int16 buffer_index;
void *per_buffer_data;
- if (stream->pending_read_nblocks == stream->io_combine_limit)
+ if (read_stream_should_issue_now(stream))
{
read_stream_start_pending_read(stream);
continue;
@@ -479,7 +618,8 @@ read_stream_look_ahead(ReadStream *stream)
if (blocknum == InvalidBlockNumber)
{
/* End of stream. */
- stream->distance = 0;
+ stream->readahead_distance = 0;
+ stream->combine_distance = 0;
break;
}
@@ -511,21 +651,13 @@ read_stream_look_ahead(ReadStream *stream)
}
/*
- * We don't start the pending read just because we've hit the distance
- * limit, preferring to give it another chance to grow to full
- * io_combine_limit size once more buffers have been consumed. However,
- * if we've already reached io_combine_limit, or we've reached the
- * distance limit and there isn't anything pinned yet, or the callback has
- * signaled end-of-stream, we start the read immediately. Note that the
- * pending read can exceed the distance goal, if the latter was reduced
- * after hitting the per-backend buffer limit.
+ * Check if the pending read should be issued now, or if we should give it
+ * another chance to grow to the full size.
+ *
+ * Note that the pending read can exceed the distance goal, if the latter
+ * was reduced after hitting the per-backend buffer limit.
*/
- if (stream->pending_read_nblocks > 0 &&
- (stream->pending_read_nblocks == stream->io_combine_limit ||
- (stream->pending_read_nblocks >= stream->distance &&
- stream->pinned_buffers == 0) ||
- stream->distance == 0) &&
- stream->ios_in_progress < stream->max_ios)
+ if (read_stream_should_issue_now(stream))
read_stream_start_pending_read(stream);
/*
@@ -534,7 +666,7 @@ read_stream_look_ahead(ReadStream *stream)
* stream. In the worst case we can always make progress one buffer at a
* time.
*/
- Assert(stream->pinned_buffers > 0 || stream->distance == 0);
+ Assert(stream->pinned_buffers > 0 || stream->readahead_distance == 0);
if (stream->batch_mode)
pgaio_exit_batchmode();
@@ -724,10 +856,17 @@ read_stream_begin_impl(int flags,
* doing full io_combine_limit sized reads.
*/
if (flags & READ_STREAM_FULL)
- stream->distance = Min(max_pinned_buffers, stream->io_combine_limit);
+ {
+ stream->readahead_distance = Min(max_pinned_buffers, stream->io_combine_limit);
+ stream->combine_distance = Min(max_pinned_buffers, stream->io_combine_limit);
+ }
else
- stream->distance = 1;
- stream->resume_distance = stream->distance;
+ {
+ stream->readahead_distance = 1;
+ stream->combine_distance = 1;
+ }
+ stream->resume_readahead_distance = stream->readahead_distance;
+ stream->resume_combine_distance = stream->combine_distance;
/*
* Since we always access the same relation, we can initialize parts of
@@ -826,7 +965,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Assert(stream->ios_in_progress == 0);
Assert(stream->forwarded_buffers == 0);
Assert(stream->pinned_buffers == 1);
- Assert(stream->distance == 1);
+ Assert(stream->readahead_distance == 1);
+ Assert(stream->combine_distance == 1);
Assert(stream->pending_read_nblocks == 0);
Assert(stream->per_buffer_data_size == 0);
Assert(stream->initialized_buffers > stream->oldest_buffer_index);
@@ -900,7 +1040,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
else
{
/* No more blocks, end of stream. */
- stream->distance = 0;
+ stream->readahead_distance = 0;
+ stream->combine_distance = 0;
stream->oldest_buffer_index = stream->next_buffer_index;
stream->pinned_buffers = 0;
stream->buffers[oldest_buffer_index] = InvalidBuffer;
@@ -916,7 +1057,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Assert(stream->oldest_buffer_index == stream->next_buffer_index);
/* End of stream reached? */
- if (stream->distance == 0)
+ if (stream->readahead_distance == 0)
return InvalidBuffer;
/*
@@ -930,7 +1071,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
/* End of stream reached? */
if (stream->pinned_buffers == 0)
{
- Assert(stream->distance == 0);
+ Assert(stream->readahead_distance == 0);
return InvalidBuffer;
}
}
@@ -951,27 +1092,59 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
stream->ios[stream->oldest_io_index].buffer_index == oldest_buffer_index)
{
int16 io_index = stream->oldest_io_index;
- int32 distance; /* wider temporary value, clamped below */
+ bool needed_wait;
/* Sanity check that we still agree on the buffers. */
Assert(stream->ios[io_index].op.buffers ==
&stream->buffers[oldest_buffer_index]);
- WaitReadBuffers(&stream->ios[io_index].op);
+ needed_wait = WaitReadBuffers(&stream->ios[io_index].op);
Assert(stream->ios_in_progress > 0);
stream->ios_in_progress--;
if (++stream->oldest_io_index == stream->max_ios)
stream->oldest_io_index = 0;
- /* Look-ahead distance ramps up rapidly after we do I/O. */
- distance = stream->distance * 2;
- distance = Min(distance, stream->max_pinned_buffers);
- stream->distance = distance;
+ /*
+ * If the IO was executed synchronously, we will never see
+ * WaitReadBuffers() block. Treat it as if it did block. This is
+ * particularly crucial when effective_io_concurrency=0 is used, as
+ * all IO will be synchronous. Without treating synchronous IO as
+ * having waited, we'd never allow the distance to get large enough to
+ * allow for IO combining, resulting in bad performance.
+ */
+ if (stream->ios[io_index].op.flags & READ_BUFFERS_SYNCHRONOUSLY)
+ needed_wait = true;
/*
- * As we needed IO, prevent distance from being reduced within our
- * maximum look-ahead window. This avoids having distance collapse too
+ * Have the read-ahead distance ramp up rapidly after we needed to
+ * wait for IO. We only increase the read-ahead-distance when we
+ * needed to wait, to avoid increasing the distance further than
+ * necessary, as looking ahead too far can be costly, both due to the
+ * cost of unnecessarily pinning many buffers and due to doing IOs
+ * that may never be consumed if the stream is ended/reset before
+ * completion.
+ *
+ * If we did not need to wait, the current distance was evidently
+ * sufficient.
+ *
+ * NB: Must not increase the distance if we already reached the end of
+ * the stream, as stream->readahead_distance == 0 is used to keep
+ * track of having reached the end.
+ */
+ if (stream->readahead_distance > 0 && needed_wait)
+ {
+ /* wider temporary value, due to overflow risk */
+ int32 readahead_distance;
+
+ readahead_distance = stream->readahead_distance * 2;
+ readahead_distance = Min(readahead_distance, stream->max_pinned_buffers);
+ stream->readahead_distance = readahead_distance;
+ }
+
+ /*
+ * As we needed IO, prevent distances from being reduced within our
+ * maximum look-ahead window. This avoids collapsing distances too
* quickly in workloads where most of the required blocks are cached,
* but where the remaining IOs are a sufficient enough factor to cause
* a substantial slowdown if executed synchronously.
@@ -983,6 +1156,30 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
*/
stream->distance_decay_holdoff = stream->max_pinned_buffers;
+ /*
+ * Whether we needed to wait or not, allow for more IO combining if we
+ * needed to do IO. The reason to do so independent of needing to wait
+ * is that when the data is resident in the kernel page cache, IO
+ * combining reduces the syscall / dispatch overhead, making it
+ * worthwhile regardless of needing to wait.
+ *
+ * It is also important with io_uring as it will never signal the need
+ * to wait for reads if all the data is in the page cache. There are
+ * heuristics to deal with that in method_io_uring.c, but they only
+ * work when the IO gets large enough.
+ */
+ if (stream->combine_distance > 0 &&
+ stream->combine_distance < stream->io_combine_limit)
+ {
+ /* wider temporary value, due to overflow risk */
+ int32 combine_distance;
+
+ combine_distance = stream->combine_distance * 2;
+ combine_distance = Min(combine_distance, stream->io_combine_limit);
+ combine_distance = Min(combine_distance, stream->max_pinned_buffers);
+ stream->combine_distance = combine_distance;
+ }
+
/*
* If we've reached the first block of a sequential region we're
* issuing advice for, cancel that until the next jump. The kernel
@@ -1048,7 +1245,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
if (stream->ios_in_progress == 0 &&
stream->forwarded_buffers == 0 &&
stream->pinned_buffers == 1 &&
- stream->distance == 1 &&
+ stream->readahead_distance == 1 &&
+ stream->combine_distance == 1 &&
stream->pending_read_nblocks == 0 &&
stream->per_buffer_data_size == 0)
{
@@ -1094,8 +1292,10 @@ read_stream_next_block(ReadStream *stream, BufferAccessStrategy *strategy)
BlockNumber
read_stream_pause(ReadStream *stream)
{
- stream->resume_distance = stream->distance;
- stream->distance = 0;
+ stream->resume_readahead_distance = stream->readahead_distance;
+ stream->resume_combine_distance = stream->combine_distance;
+ stream->readahead_distance = 0;
+ stream->combine_distance = 0;
return InvalidBlockNumber;
}
@@ -1107,7 +1307,8 @@ read_stream_pause(ReadStream *stream)
void
read_stream_resume(ReadStream *stream)
{
- stream->distance = stream->resume_distance;
+ stream->readahead_distance = stream->resume_readahead_distance;
+ stream->combine_distance = stream->resume_combine_distance;
}
/*
@@ -1123,7 +1324,8 @@ read_stream_reset(ReadStream *stream)
Buffer buffer;
/* Stop looking ahead. */
- stream->distance = 0;
+ stream->readahead_distance = 0;
+ stream->combine_distance = 0;
/* Forget buffered block number and fast path state. */
stream->buffered_blocknum = InvalidBlockNumber;
@@ -1155,8 +1357,10 @@ read_stream_reset(ReadStream *stream)
Assert(stream->ios_in_progress == 0);
/* Start off assuming data is cached. */
- stream->distance = 1;
- stream->resume_distance = stream->distance;
+ stream->readahead_distance = 1;
+ stream->combine_distance = 1;
+ stream->resume_readahead_distance = stream->readahead_distance;
+ stream->resume_combine_distance = stream->combine_distance;
stream->distance_decay_holdoff = 0;
}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 3cc0b0bdd929f..cf4f4246ca2b4 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -840,7 +840,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
{
PinLocalBuffer(bufHdr, true);
- pgBufferUsage.local_blks_hit++;
+ INSTR_BUFUSAGE_INCR(local_blks_hit);
return true;
}
@@ -861,7 +861,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
{
if (BufferTagsEqual(&tag, &bufHdr->tag))
{
- pgBufferUsage.shared_blks_hit++;
+ INSTR_BUFUSAGE_INCR(shared_blks_hit);
return true;
}
UnpinBuffer(bufHdr);
@@ -1266,9 +1266,9 @@ PinBufferForBlock(Relation rel,
if (rel)
{
/*
- * While pgBufferUsage's "read" counter isn't bumped unless we reach
- * WaitReadBuffers() (so, not for hits, and not for buffers that are
- * zeroed instead), the per-relation stats always count them.
+ * While the current buffer usage "read" counter isn't bumped unless
+ * we reach WaitReadBuffers() (so, not for hits, and not for buffers
+ * that are zeroed instead), the per-relation stats always count them.
*/
pgstat_count_buffer_read(rel);
}
@@ -1684,9 +1684,9 @@ TrackBufferHit(IOObject io_object, IOContext io_context,
true);
if (persistence == RELPERSISTENCE_TEMP)
- pgBufferUsage.local_blks_hit += 1;
+ INSTR_BUFUSAGE_INCR(local_blks_hit);
else
- pgBufferUsage.shared_blks_hit += 1;
+ INSTR_BUFUSAGE_INCR(shared_blks_hit);
pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
@@ -2148,9 +2148,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
io_start, 1, io_buffers_len * BLCKSZ);
if (persistence == RELPERSISTENCE_TEMP)
- pgBufferUsage.local_blks_read += io_buffers_len;
+ INSTR_BUFUSAGE_ADD(local_blks_read, io_buffers_len);
else
- pgBufferUsage.shared_blks_read += io_buffers_len;
+ INSTR_BUFUSAGE_ADD(shared_blks_read, io_buffers_len);
/*
* Track vacuum cost when issuing IO, not after waiting for it. Otherwise
@@ -3043,7 +3043,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
}
- pgBufferUsage.shared_blks_written += extend_by;
+ INSTR_BUFUSAGE_ADD(shared_blks_written, extend_by);
*extended_by = extend_by;
@@ -3189,7 +3189,7 @@ MarkBufferDirty(Buffer buffer)
*/
if (!(old_buf_state & BM_DIRTY))
{
- pgBufferUsage.shared_blks_dirtied++;
+ INSTR_BUFUSAGE_INCR(shared_blks_dirtied);
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
}
@@ -4601,7 +4601,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
IOOP_WRITE, io_start, 1, BLCKSZ);
- pgBufferUsage.shared_blks_written++;
+ INSTR_BUFUSAGE_INCR(shared_blks_written);
/*
* Mark the buffer as clean and end the BM_IO_IN_PROGRESS state.
@@ -5796,7 +5796,7 @@ MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate,
UnlockBufHdr(bufHdr);
}
- pgBufferUsage.shared_blks_dirtied++;
+ INSTR_BUFUSAGE_INCR(shared_blks_dirtied);
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
}
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 396da84b25c55..851b99056d571 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -218,7 +218,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
/* Mark not-dirty */
TerminateLocalBufferIO(bufHdr, true, 0, false);
- pgBufferUsage.local_blks_written++;
+ INSTR_BUFUSAGE_INCR(local_blks_written);
}
static Buffer
@@ -479,7 +479,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr,
*extended_by = extend_by;
- pgBufferUsage.local_blks_written += extend_by;
+ INSTR_BUFUSAGE_ADD(local_blks_written, extend_by);
return first_block;
}
@@ -510,7 +510,7 @@ MarkLocalBufferDirty(Buffer buffer)
buf_state = pg_atomic_read_u64(&bufHdr->state);
if (!(buf_state & BM_DIRTY))
- pgBufferUsage.local_blks_dirtied++;
+ INSTR_BUFUSAGE_INCR(local_blks_dirtied);
buf_state |= BM_DIRTY;
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index c4afe4d368a34..8b501dfcadd02 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -475,13 +475,13 @@ BufFileLoadBuffer(BufFile *file)
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
- INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_read_time, io_time, io_start);
+ INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_read_time, io_time, io_start);
}
/* we choose not to advance curOffset here */
if (file->nbytes > 0)
- pgBufferUsage.temp_blks_read++;
+ INSTR_BUFUSAGE_INCR(temp_blks_read);
}
/*
@@ -549,13 +549,13 @@ BufFileDumpBuffer(BufFile *file)
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
- INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_write_time, io_time, io_start);
+ INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_write_time, io_time, io_start);
}
file->curOffset += bytestowrite;
wpos += bytestowrite;
- pgBufferUsage.temp_blks_written++;
+ INSTR_BUFUSAGE_INCR(temp_blks_written);
}
file->dirty = false;
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index 2be26e9228361..e7fc7f071d84e 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -114,9 +114,9 @@ pgstat_prepare_io_time(bool track_io_guc)
* pg_stat_database only counts block read and write times, these are done for
* IOOP_READ, IOOP_WRITE and IOOP_EXTEND.
*
- * pgBufferUsage is used for EXPLAIN. pgBufferUsage has write and read stats
- * for shared, local and temporary blocks. pg_stat_io does not track the
- * activity of temporary blocks, so these are ignored here.
+ * Executor instrumentation is used for EXPLAIN. Buffer usage tracked there has
+ * write and read stats for shared, local and temporary blocks. pg_stat_io
+ * does not track the activity of temporary blocks, so these are ignored here.
*/
void
pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
@@ -135,17 +135,17 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
{
pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
if (io_object == IOOBJECT_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(shared_blk_write_time, io_time);
else if (io_object == IOOBJECT_TEMP_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(local_blk_write_time, io_time);
}
else if (io_op == IOOP_READ)
{
pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
if (io_object == IOOBJECT_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(shared_blk_read_time, io_time);
else if (io_object == IOOBJECT_TEMP_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(local_blk_read_time, io_time);
}
}
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index f0b4d795071af..a8cbdf247c866 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -459,6 +459,7 @@ extern TimestampTz GetCurrentTransactionStopTimestamp(void);
extern void SetCurrentStatementStartTimestamp(void);
extern int GetCurrentTransactionNestLevel(void);
extern bool TransactionIdIsCurrentTransactionId(TransactionId xid);
+extern int GetTopReadOnlyTransactionNestLevel(void);
extern void CommandCounterIncrement(void);
extern void ForceSyncCommit(void);
extern void StartTransactionCommand(void);
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index a38e95bc0eb59..9aee822634781 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -71,6 +71,7 @@ extern void index_check_primary_key(Relation heapRel,
#define INDEX_CREATE_IF_NOT_EXISTS (1 << 4)
#define INDEX_CREATE_PARTITIONED (1 << 5)
#define INDEX_CREATE_INVALID (1 << 6)
+#define INDEX_CREATE_SUPPRESS_PROGRESS (1 << 7)
extern Oid index_create(Relation heapRelation,
const char *indexRelationName,
@@ -101,10 +102,9 @@ extern Oid index_create(Relation heapRelation,
#define INDEX_CONSTR_CREATE_REMOVE_OLD_DEPS (1 << 4)
#define INDEX_CONSTR_CREATE_WITHOUT_OVERLAPS (1 << 5)
-extern Oid index_concurrently_create_copy(Relation heapRelation,
- Oid oldIndexId,
- Oid tablespaceOid,
- const char *newName);
+extern Oid index_create_copy(Relation heapRelation, uint16 flags,
+ Oid oldIndexId, Oid tablespaceOid,
+ const char *newName);
extern void index_concurrently_build(Oid heapRelationId,
Oid indexRelationId);
@@ -149,7 +149,8 @@ extern void index_build(Relation heapRelation,
Relation indexRelation,
IndexInfo *indexInfo,
bool isreindex,
- bool parallel);
+ bool parallel,
+ bool progress);
extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot);
diff --git a/src/include/commands/explain_dr.h b/src/include/commands/explain_dr.h
index f98eaae186457..ab5c53023e1e6 100644
--- a/src/include/commands/explain_dr.h
+++ b/src/include/commands/explain_dr.h
@@ -23,11 +23,10 @@ typedef struct ExplainState ExplainState;
typedef struct SerializeMetrics
{
uint64 bytesSent; /* # of bytes serialized */
- instr_time timeSpent; /* time spent serializing */
- BufferUsage bufferUsage; /* buffers accessed during serialization */
+ Instrumentation instr; /* time and buffer usage */
} SerializeMetrics;
extern DestReceiver *CreateExplainSerializeDestReceiver(ExplainState *es);
-extern SerializeMetrics GetSerializationMetrics(DestReceiver *dest);
+extern SerializeMetrics *GetSerializationMetrics(DestReceiver *dest);
#endif
diff --git a/src/include/executor/execParallel.h b/src/include/executor/execParallel.h
index 5a2034811d563..6c8b602d07f98 100644
--- a/src/include/executor/execParallel.h
+++ b/src/include/executor/execParallel.h
@@ -25,9 +25,8 @@ typedef struct ParallelExecutorInfo
{
PlanState *planstate; /* plan subtree we're running in parallel */
ParallelContext *pcxt; /* parallel context we're using */
- BufferUsage *buffer_usage; /* points to bufusage area in DSM */
- WalUsage *wal_usage; /* walusage area in DSM */
- SharedExecutorInstrumentation *instrumentation; /* optional */
+ Instrumentation *instrumentation; /* instrumentation area in DSM */
+ SharedExecutorInstrumentation *node_instrumentation; /* optional */
struct SharedJitInstrumentation *jit_instrumentation; /* optional */
dsa_area *area; /* points to DSA area in DSM */
dsa_pointer param_exec; /* serialized PARAM_EXEC parameters */
diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h
index d3a572428449d..340029a203422 100644
--- a/src/include/executor/execdesc.h
+++ b/src/include/executor/execdesc.h
@@ -51,8 +51,8 @@ typedef struct QueryDesc
/* This field is set by ExecutePlan */
bool already_executed; /* true if previously executed */
- /* This is always set NULL by the core system, but plugins can change it */
- struct Instrumentation *totaltime; /* total time spent in ExecutorRun */
+ /* This field is set by ExecutorRun, or plugins */
+ struct QueryInstrumentation *totaltime; /* total time spent in ExecutorRun */
} QueryDesc;
/* in pquery.c */
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 491c48865066a..03f0e864176eb 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -233,6 +233,7 @@ ExecGetJunkAttribute(TupleTableSlot *slot, AttrNumber attno, bool *isNull)
/*
* prototypes from functions in execMain.c
*/
+typedef struct QueryInstrumentation QueryInstrumentation;
extern void ExecutorStart(QueryDesc *queryDesc, int eflags);
extern void standard_ExecutorStart(QueryDesc *queryDesc, int eflags);
extern void ExecutorRun(QueryDesc *queryDesc,
@@ -254,7 +255,7 @@ extern void InitResultRelInfo(ResultRelInfo *resultRelInfo,
Relation resultRelationDesc,
Index resultRelationIndex,
ResultRelInfo *partition_root_rri,
- int instrument_options);
+ QueryInstrumentation *qinstr);
extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid,
ResultRelInfo *rootRelInfo);
extern List *ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo);
@@ -301,6 +302,8 @@ extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function);
extern Node *MultiExecProcNode(PlanState *node);
extern void ExecEndNode(PlanState *node);
extern void ExecShutdownNode(PlanState *node);
+extern void ExecFinalizeNodeInstrumentation(PlanState *node);
+extern void ExecFinalizeWorkerInstrumentation(PlanState *node);
extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node);
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 9759f3ea5d8d9..bae8a9b0e62ed 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -13,6 +13,7 @@
#ifndef INSTRUMENT_H
#define INSTRUMENT_H
+#include "lib/ilist.h"
#include "portability/instr_time.h"
@@ -67,56 +68,270 @@ typedef enum InstrumentOption
INSTRUMENT_ALL = PG_INT32_MAX
} InstrumentOption;
+/*
+ * Instrumentation base class for capturing time and WAL/buffer usage
+ *
+ * If used directly:
+ * - Allocate on the stack and zero initialize the struct
+ * - Call InstrInitOptions to set instrumentation options
+ * - Call InstrStart before the activity you want to measure
+ * - Call InstrStop / InstrStopFinalize after the activity to capture totals
+ *
+ * InstrStart/InstrStop may be called multiple times. The last stop call must
+ * be to InstrStopFinalize to ensure parent stack entries get the accumulated
+ * totals. If there is risk of transaction aborts you must call
+ * InstrStopFinalize in a PG_TRY/PG_FINALLY block to avoid corrupting the
+ * instrumentation stack.
+ *
+ * In a query context use QueryInstrumentation instead, which handles aborts
+ * using the resource owner logic.
+ */
typedef struct Instrumentation
{
- /* Parameters set at node creation: */
+ /* Parameters set at creation: */
bool need_timer; /* true if we need timer data */
- bool need_bufusage; /* true if we need buffer usage data */
- bool need_walusage; /* true if we need WAL usage data */
+ bool need_stack; /* true if we need WAL/buffer usage data */
+ /* Internal state keeping: */
+ bool on_stack; /* true if currently on instr_stack */
+ instr_time starttime; /* start time of last InstrStart */
+ /* Accumulated statistics: */
+ instr_time total; /* total runtime */
+ BufferUsage bufusage; /* total buffer usage */
+ WalUsage walusage; /* total WAL usage */
+ /* Abort handling: link in parent QueryInstrumentation's unfinalized list */
+ dlist_node unfinalized_entry;
+} Instrumentation;
+
+/*
+ * Query-related instrumentation tracking.
+ *
+ * Usage:
+ * - Allocate on the heap using InstrQueryAlloc (required for abort handling)
+ * - Call InstrQueryStart before the activity you want to measure
+ * - Call InstrQueryStop / InstrQueryStopFinalize afterwards to capture totals
+ *
+ * InstrQueryStart/InstrQueryStop may be called multiple times. The last stop
+ * call must be to InstrQueryStopFinalize to ensure parent stack entries get
+ * the accumulated totals.
+ *
+ * Uses the resource owner mechanism for handling aborts; as such, the caller
+ * must *not* exit the top-level transaction after having called
+ * InstrQueryStart without first calling InstrQueryStop or
+ * InstrQueryStopFinalize. On transaction abort, logic equivalent to
+ * InstrQueryStopFinalize is invoked automatically.
+ */
+struct ResourceOwnerData;
+typedef struct QueryInstrumentation
+{
+ Instrumentation instr;
+
+ /* Original instrument_options flags used to create this instrumentation */
+ int instrument_options;
+
+ /* Resource owner used for cleanup on aborts between InstrQueryStart/InstrQueryStop */
+ struct ResourceOwnerData *owner;
+
+ /*
+ * Dedicated memory context for all instrumentation allocations belonging
+ * to this query (node instrumentation, trigger instrumentation, etc.).
+ * Initially a child of TopMemoryContext so it survives transaction abort
+ * for ResourceOwner cleanup, which is then reassigned to the current
+ * memory context on InstrQueryStopFinalize.
+ */
+ MemoryContext instr_cxt;
+
+ /*
+ * Child entries that need to be cleaned up on abort, since they are not
+ * registered as a resource owner themselves. Contains both node and
+ * trigger instrumentation entries linked via instr.unfinalized_entry.
+ */
+ dlist_head unfinalized_entries;
+} QueryInstrumentation;
+
+/*
+ * Specialized instrumentation for per-node execution statistics
+ *
+ * Relies on an outer QueryInstrumentation having been set up to handle the
+ * stack used for WAL/buffer usage statistics, and relies on it for managing
+ * aborts. Solely intended for the executor and anyone reporting about its
+ * activities (e.g. EXPLAIN ANALYZE).
+ */
+typedef struct NodeInstrumentation
+{
+ Instrumentation instr;
+ /* Parameters set at node creation: */
bool async_mode; /* true if node is in async mode */
/* Info about current plan cycle: */
bool running; /* true if we've completed first tuple */
- instr_time starttime; /* start time of current iteration of node */
instr_time counter; /* accumulated runtime for this node */
instr_time firsttuple; /* time for first tuple of this cycle */
double tuplecount; /* # of tuples emitted so far this cycle */
- BufferUsage bufusage_start; /* buffer usage at start */
- WalUsage walusage_start; /* WAL usage at start */
/* Accumulated statistics across all completed cycles: */
instr_time startup; /* total startup time */
- instr_time total; /* total time */
double ntuples; /* total tuples produced */
double ntuples2; /* secondary node-specific tuple counter */
double nloops; /* # of run cycles for this node */
double nfiltered1; /* # of tuples removed by scanqual or joinqual */
double nfiltered2; /* # of tuples removed by "other" quals */
- BufferUsage bufusage; /* total buffer usage */
- WalUsage walusage; /* total WAL usage */
-} Instrumentation;
+} NodeInstrumentation;
-typedef struct WorkerInstrumentation
+/*
+ * Care must be taken with any pointers contained within this struct, as this
+ * gets copied across processes during parallel query execution.
+ */
+typedef struct WorkerNodeInstrumentation
{
int num_workers; /* # of structures that follow */
- Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER];
-} WorkerInstrumentation;
+ NodeInstrumentation instrument[FLEXIBLE_ARRAY_MEMBER];
+} WorkerNodeInstrumentation;
+
+typedef struct TriggerInstrumentation
+{
+ Instrumentation instr;
+ int firings; /* number of times the instrumented trigger
+ * was fired */
+} TriggerInstrumentation;
+
+/*
+ * Dynamic array-based stack for tracking current WAL/buffer usage context.
+ *
+ * When the stack is empty, 'current' points to instr_top which accumulates
+ * session-level totals.
+ */
+typedef struct InstrStackState
+{
+ int stack_space; /* allocated capacity of entries array */
+ int stack_size; /* current number of entries */
+
+ Instrumentation **entries; /* dynamic array of pointers */
+ Instrumentation *current; /* top of stack, or &instr_top when empty */
+} InstrStackState;
-extern PGDLLIMPORT BufferUsage pgBufferUsage;
extern PGDLLIMPORT WalUsage pgWalUsage;
-extern Instrumentation *InstrAlloc(int n, int instrument_options,
- bool async_mode);
-extern void InstrInit(Instrumentation *instr, int instrument_options);
-extern void InstrStartNode(Instrumentation *instr);
-extern void InstrStopNode(Instrumentation *instr, double nTuples);
-extern void InstrUpdateTupleCount(Instrumentation *instr, double nTuples);
-extern void InstrEndLoop(Instrumentation *instr);
-extern void InstrAggNode(Instrumentation *dst, Instrumentation *add);
-extern void InstrStartParallelQuery(void);
-extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage);
-extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage);
-extern void BufferUsageAccumDiff(BufferUsage *dst,
- const BufferUsage *add, const BufferUsage *sub);
+/*
+ * The top instrumentation represents a running total of the current backend
+ * WAL/buffer usage information. This will not be updated immediately, but
+ * rather when the current stack entry gets accumulated which typically happens
+ * at query end.
+ *
+ * Care must be taken when utilizing this in the parallel worker context:
+ * Parallel workers will report back their instrumentation to the caller,
+ * and this gets added to the caller's stack. If this were to be used in the
+ * shared memory stats infrastructure it would need to be skipped on parallel
+ * workers to avoid double counting.
+ */
+extern PGDLLIMPORT Instrumentation instr_top;
+
+/*
+ * The instrumentation stack state. The 'current' field points to the
+ * currently active stack entry that is getting updated as activity happens,
+ * and will be accumulated to parent stacks when it gets finalized by
+ * InstrStop (for non-executor use cases), ExecFinalizeNodeInstrumentation
+ * (executor finish) or ResOwnerReleaseInstrumentation on abort.
+ */
+extern PGDLLIMPORT InstrStackState instr_stack;
+
+extern void InstrStackGrow(void);
+
+/*
+ * Pushes the stack so that all WAL/buffer usage updates go to the passed in
+ * instrumentation entry.
+ *
+ * See note on InstrPopStack regarding safe use of these functions.
+ */
+static inline void
+InstrPushStack(Instrumentation *instr)
+{
+ if (unlikely(instr_stack.stack_size == instr_stack.stack_space))
+ InstrStackGrow();
+
+ instr_stack.entries[instr_stack.stack_size++] = instr;
+ instr_stack.current = instr;
+ instr->on_stack = true;
+}
+
+/*
+ * Pops the stack entry back to the previous one that was effective at
+ * InstrPushStack.
+ *
+ * Callers must ensure that no intermediate stack entries are skipped, to
+ * handle aborts correctly. If you're thinking of calling this in a PG_FINALLY
+ * block, consider instead using InstrStart + InstrStopFinalize which can skip
+ * intermediate stack entries.
+ */
+static inline void
+InstrPopStack(Instrumentation *instr)
+{
+ Assert(instr_stack.stack_size > 0);
+ Assert(instr_stack.entries[instr_stack.stack_size - 1] == instr);
+ instr_stack.stack_size--;
+ instr_stack.current = instr_stack.stack_size > 0
+ ? instr_stack.entries[instr_stack.stack_size - 1]
+ : &instr_top;
+ instr->on_stack = false;
+}
+
+extern void InstrInitOptions(Instrumentation *instr, int instrument_options);
+extern void InstrStart(Instrumentation *instr);
+extern void InstrStop(Instrumentation *instr);
+extern void InstrStopFinalize(Instrumentation *instr);
+extern void InstrFinalizeChild(Instrumentation *instr, Instrumentation *parent);
+extern void InstrAccumStack(Instrumentation *dst, Instrumentation *add);
+
+extern QueryInstrumentation *InstrQueryAlloc(int instrument_options);
+extern void InstrQueryStart(QueryInstrumentation *instr);
+extern void InstrQueryStop(QueryInstrumentation *instr);
+extern void InstrQueryStopFinalize(QueryInstrumentation *instr);
+extern void InstrQueryRememberChild(QueryInstrumentation *parent, Instrumentation *instr);
+
+pg_nodiscard extern QueryInstrumentation *InstrStartParallelQuery(void);
+extern void InstrEndParallelQuery(QueryInstrumentation *qinstr, Instrumentation *dst);
+extern void InstrAccumParallelQuery(Instrumentation *instr);
+
+extern NodeInstrumentation *InstrAllocNode(QueryInstrumentation *qinstr, bool async_mode);
+extern void InstrInitNode(NodeInstrumentation *instr, int instrument_options);
+extern void InstrStartNode(NodeInstrumentation *instr);
+extern void InstrStopNode(NodeInstrumentation *instr, double nTuples);
+extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples);
+extern void InstrEndLoop(NodeInstrumentation *instr);
+extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add);
+
+typedef struct TupleTableSlot TupleTableSlot;
+typedef struct PlanState PlanState;
+typedef TupleTableSlot *(*ExecProcNodeMtd) (PlanState *pstate);
+extern ExecProcNodeMtd InstrNodeSetupExecProcNode(NodeInstrumentation *instr);
+
+extern TriggerInstrumentation *InstrAllocTrigger(QueryInstrumentation *qinstr, int n);
+extern void InstrStartTrigger(QueryInstrumentation *qinstr,
+ TriggerInstrumentation *tginstr);
+extern void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings);
+
+extern void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
+extern void WalUsageAdd(WalUsage *dst, const WalUsage *add);
extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add,
const WalUsage *sub);
+#define INSTR_BUFUSAGE_INCR(fld) do { \
+ instr_stack.current->bufusage.fld++; \
+ } while(0)
+#define INSTR_BUFUSAGE_ADD(fld,val) do { \
+ instr_stack.current->bufusage.fld += (val); \
+ } while(0)
+#define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \
+ INSTR_TIME_ADD(instr_stack.current->bufusage.fld, val); \
+ } while (0)
+#define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \
+ INSTR_TIME_ACCUM_DIFF(instr_stack.current->bufusage.fld, endval, startval); \
+ } while (0)
+
+#define INSTR_WALUSAGE_INCR(fld) do { \
+ pgWalUsage.fld++; \
+ instr_stack.current->walusage.fld++; \
+ } while(0)
+#define INSTR_WALUSAGE_ADD(fld,val) do { \
+ pgWalUsage.fld += (val); \
+ instr_stack.current->walusage.fld += (val); \
+ } while(0)
+
#endif /* INSTRUMENT_H */
diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h
index 2a0ff377a7312..e2315cef384c6 100644
--- a/src/include/executor/instrument_node.h
+++ b/src/include/executor/instrument_node.h
@@ -18,6 +18,8 @@
#ifndef INSTRUMENT_NODE_H
#define INSTRUMENT_NODE_H
+#include "executor/instrument.h"
+
/* ---------------------
* Instrumentation information for aggregate function execution
@@ -48,6 +50,9 @@ typedef struct IndexScanInstrumentation
{
/* Index search count (incremented with pgstat_count_index_scan call) */
uint64 nsearches;
+
+ /* Instrumentation utilized for tracking buffer usage during table access */
+ Instrumentation table_instr;
} IndexScanInstrumentation;
/*
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 090cfccf65fa0..b28288aa1e8db 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -54,12 +54,15 @@ typedef struct Instrumentation Instrumentation;
typedef struct pairingheap pairingheap;
typedef struct PlanState PlanState;
typedef struct QueryEnvironment QueryEnvironment;
+typedef struct QueryInstrumentation QueryInstrumentation;
typedef struct RelationData *Relation;
typedef Relation *RelationPtr;
typedef struct ScanKeyData ScanKeyData;
typedef struct SnapshotData *Snapshot;
typedef struct SortSupportData *SortSupport;
typedef struct TIDBitmap TIDBitmap;
+typedef struct NodeInstrumentation NodeInstrumentation;
+typedef struct TriggerInstrumentation TriggerInstrumentation;
typedef struct TupleConversionMap TupleConversionMap;
typedef struct TupleDescData *TupleDesc;
typedef struct Tuplesortstate Tuplesortstate;
@@ -67,7 +70,7 @@ typedef struct Tuplestorestate Tuplestorestate;
typedef struct TupleTableSlot TupleTableSlot;
typedef struct TupleTableSlotOps TupleTableSlotOps;
typedef struct WalUsage WalUsage;
-typedef struct WorkerInstrumentation WorkerInstrumentation;
+typedef struct WorkerNodeInstrumentation WorkerNodeInstrumentation;
/* ----------------
@@ -552,7 +555,7 @@ typedef struct ResultRelInfo
ExprState **ri_TrigWhenExprs;
/* optional runtime measurements for triggers */
- Instrumentation *ri_TrigInstrument;
+ TriggerInstrumentation *ri_TrigInstrument;
/* On-demand created slots for triggers / returning processing */
TupleTableSlot *ri_ReturningSlot; /* for trigger output tuples */
@@ -751,7 +754,7 @@ typedef struct EState
* ExecutorRun() calls. */
int es_top_eflags; /* eflags passed to ExecutorStart */
- int es_instrument; /* OR of InstrumentOption flags */
+ QueryInstrumentation *es_instrument; /* query-level instrumentation */
bool es_finished; /* true when ExecutorFinish is done */
List *es_exprcontexts; /* List of ExprContexts within EState */
@@ -1206,8 +1209,10 @@ typedef struct PlanState
ExecProcNodeMtd ExecProcNodeReal; /* actual function, if above is a
* wrapper */
- Instrumentation *instrument; /* Optional runtime stats for this node */
- WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */
+ NodeInstrumentation *instrument; /* Optional runtime stats for this
+ * node */
+ WorkerNodeInstrumentation *worker_instrument; /* per-worker
+ * instrumentation */
/* Per-worker JIT instrumentation */
struct SharedJitInstrumentation *worker_jit_instrument;
diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h
index ccded021433b0..a22a83a2f237c 100644
--- a/src/include/replication/snapbuild.h
+++ b/src/include/replication/snapbuild.h
@@ -15,6 +15,14 @@
#include "access/xlogdefs.h"
#include "utils/snapmgr.h"
+/*
+ * forward declarations in this file
+ */
+typedef struct ReorderBuffer ReorderBuffer;
+typedef struct SnapBuild SnapBuild;
+typedef struct xl_heap_new_cid xl_heap_new_cid;
+typedef struct xl_running_xacts xl_running_xacts;
+
/*
* Please keep get_snapbuild_state_desc() (located in the pg_logicalinspect
* module) updated if a change needs to be made to SnapBuildState.
@@ -50,20 +58,11 @@ typedef enum
SNAPBUILD_CONSISTENT = 2,
} SnapBuildState;
-/* forward declare so we don't have to include snapbuild_internal.h */
-struct SnapBuild;
-typedef struct SnapBuild SnapBuild;
-
-/* forward declare so we don't have to include reorderbuffer.h */
-struct ReorderBuffer;
-/* forward declare so we don't have to include heapam_xlog.h */
-struct xl_heap_new_cid;
-struct xl_running_xacts;
extern void CheckPointSnapBuild(void);
-extern SnapBuild *AllocateSnapshotBuilder(struct ReorderBuffer *reorder,
+extern SnapBuild *AllocateSnapshotBuilder(ReorderBuffer *reorder,
TransactionId xmin_horizon, XLogRecPtr start_lsn,
bool need_full_snapshot,
bool in_slot_creation,
@@ -91,9 +90,9 @@ extern bool SnapBuildProcessChange(SnapBuild *builder, TransactionId xid,
XLogRecPtr lsn);
extern void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
XLogRecPtr lsn,
- struct xl_heap_new_cid *xlrec);
+ xl_heap_new_cid *xlrec);
extern void SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn,
- struct xl_running_xacts *running);
+ xl_running_xacts *running);
extern void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn);
extern bool SnapBuildSnapshotExists(XLogRecPtr lsn);
diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h
index eb6033b4fdb65..5463bc921f06e 100644
--- a/src/include/utils/resowner.h
+++ b/src/include/utils/resowner.h
@@ -75,6 +75,7 @@ typedef uint32 ResourceReleasePriority;
#define RELEASE_PRIO_SNAPSHOT_REFS 500
#define RELEASE_PRIO_FILES 600
#define RELEASE_PRIO_WAITEVENTSETS 700
+#define RELEASE_PRIO_INSTRUMENTATION 800
/* 0 is considered invalid */
#define RELEASE_PRIO_FIRST 1
diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c
index 72d70aea1e164..591e23df44b45 100644
--- a/src/port/pg_crc32c_armv8_choose.c
+++ b/src/port/pg_crc32c_armv8_choose.c
@@ -108,7 +108,8 @@ pg_crc32c_armv8_available(void)
#endif
}
-static inline bool
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+static bool
pg_pmull_available(void)
{
#if defined(__aarch64__) && defined(HWCAP_PMULL)
@@ -128,6 +129,7 @@ pg_pmull_available(void)
return false;
#endif
}
+#endif /* USE_PMULL_CRC32C_WITH_RUNTIME_CHECK */
/*
* This gets called on the first call. It replaces the function pointer
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 864b407abcff7..c5ace162fe23c 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -48,6 +48,7 @@ SUBDIRS = \
test_resowner \
test_rls_hooks \
test_saslprep \
+ test_session_buffer_usage \
test_shm_mq \
test_slru \
test_tidstore \
diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build
index e5acacd508368..802cc93d71a48 100644
--- a/src/test/modules/meson.build
+++ b/src/test/modules/meson.build
@@ -49,6 +49,7 @@ subdir('test_regex')
subdir('test_resowner')
subdir('test_rls_hooks')
subdir('test_saslprep')
+subdir('test_session_buffer_usage')
subdir('test_shm_mq')
subdir('test_slru')
subdir('test_tidstore')
diff --git a/src/test/modules/test_session_buffer_usage/Makefile b/src/test/modules/test_session_buffer_usage/Makefile
new file mode 100644
index 0000000000000..1252b222cb9f8
--- /dev/null
+++ b/src/test/modules/test_session_buffer_usage/Makefile
@@ -0,0 +1,23 @@
+# src/test/modules/test_session_buffer_usage/Makefile
+
+MODULE_big = test_session_buffer_usage
+OBJS = \
+ $(WIN32RES) \
+ test_session_buffer_usage.o
+
+EXTENSION = test_session_buffer_usage
+DATA = test_session_buffer_usage--1.0.sql
+PGFILEDESC = "test_session_buffer_usage - show buffer usage statistics for the current session"
+
+REGRESS = test_session_buffer_usage
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_session_buffer_usage
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_session_buffer_usage/expected/test_session_buffer_usage.out b/src/test/modules/test_session_buffer_usage/expected/test_session_buffer_usage.out
new file mode 100644
index 0000000000000..5f7d349871af8
--- /dev/null
+++ b/src/test/modules/test_session_buffer_usage/expected/test_session_buffer_usage.out
@@ -0,0 +1,342 @@
+LOAD 'test_session_buffer_usage';
+CREATE EXTENSION test_session_buffer_usage;
+-- Verify all columns are non-negative
+SELECT count(*) = 1 AS ok FROM test_session_buffer_usage()
+WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0
+ AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0
+ AND local_blks_hit >= 0 AND local_blks_read >= 0
+ AND local_blks_dirtied >= 0 AND local_blks_written >= 0
+ AND temp_blks_read >= 0 AND temp_blks_written >= 0
+ AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0
+ AND local_blk_read_time >= 0 AND local_blk_write_time >= 0
+ AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0;
+ ok
+----
+ t
+(1 row)
+
+-- Verify counters increase after buffer activity
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+CREATE TEMP TABLE test_buf_activity (id int, data text);
+INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i;
+SELECT count(*) FROM test_buf_activity;
+ count
+-------
+ 1000
+(1 row)
+
+SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased
+FROM test_session_buffer_usage();
+ blocks_increased
+------------------
+ t
+(1 row)
+
+DROP TABLE test_buf_activity;
+-- Parallel query test
+CREATE TABLE par_dc_tab (a int, b char(200));
+INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+SELECT count(*) FROM par_dc_tab;
+ count
+-------
+ 5000
+(1 row)
+
+-- Measure serial scan delta (leader does all the work)
+SET max_parallel_workers_per_gather = 0;
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+SELECT count(*) FROM par_dc_tab;
+ count
+-------
+ 5000
+(1 row)
+
+CREATE TEMP TABLE dc_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM test_session_buffer_usage();
+-- Measure parallel scan delta with leader NOT participating in scanning.
+-- Workers do all table scanning; leader only runs the Gather node.
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+SELECT count(*) FROM par_dc_tab;
+ count
+-------
+ 5000
+(1 row)
+
+-- Confirm we got a similar hit counter through parallel worker accumulation
+SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2
+ AS leader_buffers_match
+FROM test_session_buffer_usage(), dc_serial_result s;
+ leader_buffers_match
+----------------------
+ t
+(1 row)
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+DROP TABLE par_dc_tab, dc_serial_result;
+--
+-- Abort/exception tests: verify buffer usage survives various error paths.
+--
+-- Rolled-back divide-by-zero under EXPLAIN ANALYZE
+CREATE TEMP TABLE exc_tab (a int, b char(20));
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 FROM ins;
+ERROR: division by zero
+SELECT local_blks_dirtied > 0 AS exception_buffers_visible
+FROM test_session_buffer_usage();
+ exception_buffers_visible
+---------------------------
+ t
+(1 row)
+
+DROP TABLE exc_tab;
+-- Unique constraint violation in regular query
+CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20));
+INSERT INTO unique_tab VALUES (1, 'first');
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+INSERT INTO unique_tab VALUES (1, 'duplicate');
+ERROR: duplicate key value violates unique constraint "unique_tab_a_key"
+DETAIL: Key (a)=(1) already exists.
+SELECT local_blks_hit > 0 AS unique_violation_buffers_visible
+FROM test_session_buffer_usage();
+ unique_violation_buffers_visible
+----------------------------------
+ t
+(1 row)
+
+DROP TABLE unique_tab;
+-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION)
+CREATE TEMP TABLE subxact_tab (a int, b char(20));
+CREATE FUNCTION subxact_exc_func() RETURNS text AS $$
+BEGIN
+ BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a)
+ SELECT a / 0 FROM ins';
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+SELECT subxact_exc_func();
+ subxact_exc_func
+------------------
+ caught
+(1 row)
+
+SELECT local_blks_dirtied > 0 AS subxact_buffers_visible
+FROM test_session_buffer_usage();
+ subxact_buffers_visible
+-------------------------
+ t
+(1 row)
+
+DROP FUNCTION subxact_exc_func;
+DROP TABLE subxact_tab;
+-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking
+CREATE TEMP TABLE cursor_tab (a int, b char(200));
+INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i;
+CREATE FUNCTION cursor_exc_func() RETURNS text AS $$
+DECLARE
+ rec record;
+ cnt int := 0;
+BEGIN
+ BEGIN
+ FOR rec IN SELECT * FROM cursor_tab LOOP
+ cnt := cnt + 1;
+ IF cnt = 250 THEN
+ PERFORM 1 / 0;
+ END IF;
+ END LOOP;
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught after ' || cnt || ' rows';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+SELECT cursor_exc_func();
+ cursor_exc_func
+-----------------------
+ caught after 250 rows
+(1 row)
+
+SELECT local_blks_hit + local_blks_read > 0
+ AS cursor_subxact_buffers_visible
+FROM test_session_buffer_usage();
+ cursor_subxact_buffers_visible
+--------------------------------
+ t
+(1 row)
+
+DROP FUNCTION cursor_exc_func;
+DROP TABLE cursor_tab;
+-- Trigger abort under EXPLAIN ANALYZE: verify that buffer activity from a
+-- trigger that throws an error is still properly propagated.
+CREATE TEMP TABLE trig_err_tab (a int);
+CREATE TEMP TABLE trig_work_tab (a int, b char(200));
+INSERT INTO trig_work_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i;
+-- Warm local buffers so trig_work_tab reads become hits
+SELECT count(*) FROM trig_work_tab;
+ count
+-------
+ 500
+(1 row)
+
+CREATE FUNCTION trig_err_func() RETURNS trigger AS $$
+BEGIN
+ PERFORM count(*) FROM trig_work_tab;
+ RAISE EXCEPTION 'trigger error';
+ RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+CREATE TRIGGER trig_err BEFORE INSERT ON trig_err_tab
+ FOR EACH ROW EXECUTE FUNCTION trig_err_func();
+-- Measure how many local buffer hits a scan of trig_work_tab produces
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+SELECT count(*) FROM trig_work_tab;
+ count
+-------
+ 500
+(1 row)
+
+CREATE TEMP TABLE trig_serial_result AS
+SELECT local_blks_hit AS serial_hits FROM test_session_buffer_usage();
+-- Now trigger the same scan via a trigger that errors
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ INSERT INTO trig_err_tab VALUES (1);
+ERROR: trigger error
+CONTEXT: PL/pgSQL function trig_err_func() line 4 at RAISE
+-- The trigger scanned trig_work_tab but errored before InstrStopTrigger ran.
+-- InstrStopFinalize in the PG_CATCH ensures buffer data is still propagated.
+SELECT local_blks_hit >= s.serial_hits / 2
+ AS trigger_abort_buffers_propagated
+FROM test_session_buffer_usage(), trig_serial_result s;
+ trigger_abort_buffers_propagated
+----------------------------------
+ t
+(1 row)
+
+DROP TABLE trig_err_tab, trig_work_tab, trig_serial_result;
+DROP FUNCTION trig_err_func;
+-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort.
+--
+-- When a parallel worker aborts, InstrEndParallelQuery and
+-- ExecParallelReportInstrumentation never run, so the worker's buffer
+-- activity is never written to shared memory, despite the information having been
+-- captured by the res owner release instrumentation handling.
+CREATE TABLE par_abort_tab (a int, b char(200));
+INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+-- Warm shared buffers so all reads become hits
+SELECT count(*) FROM par_abort_tab;
+ count
+-------
+ 5000
+(1 row)
+
+-- Measure serial scan delta as a reference (leader reads all blocks)
+SET max_parallel_workers_per_gather = 0;
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+CREATE TABLE par_abort_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM test_session_buffer_usage();
+-- Now force parallel with leader NOT participating in scanning
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+CONTEXT: parallel worker
+RESET debug_parallel_query;
+-- Workers scanned the table but aborted before reporting stats back.
+-- The leader's delta should be much less than a serial scan, documenting
+-- that worker buffer activity is lost on abort.
+SELECT shared_blks_hit < s.serial_delta / 2
+ AS worker_abort_buffers_not_propagated
+FROM test_session_buffer_usage(), par_abort_serial_result s;
+ worker_abort_buffers_not_propagated
+-------------------------------------
+ t
+(1 row)
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+DROP TABLE par_abort_tab, par_abort_serial_result;
+-- Cleanup
+DROP EXTENSION test_session_buffer_usage;
diff --git a/src/test/modules/test_session_buffer_usage/meson.build b/src/test/modules/test_session_buffer_usage/meson.build
new file mode 100644
index 0000000000000..b96f67dc7fe37
--- /dev/null
+++ b/src/test/modules/test_session_buffer_usage/meson.build
@@ -0,0 +1,33 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+test_session_buffer_usage_sources = files(
+ 'test_session_buffer_usage.c',
+)
+
+if host_system == 'windows'
+ test_session_buffer_usage_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+ '--NAME', 'test_session_buffer_usage',
+ '--FILEDESC', 'test_session_buffer_usage - show buffer usage statistics for the current session',])
+endif
+
+test_session_buffer_usage = shared_module('test_session_buffer_usage',
+ test_session_buffer_usage_sources,
+ kwargs: pg_test_mod_args,
+)
+test_install_libs += test_session_buffer_usage
+
+test_install_data += files(
+ 'test_session_buffer_usage.control',
+ 'test_session_buffer_usage--1.0.sql',
+)
+
+tests += {
+ 'name': 'test_session_buffer_usage',
+ 'sd': meson.current_source_dir(),
+ 'bd': meson.current_build_dir(),
+ 'regress': {
+ 'sql': [
+ 'test_session_buffer_usage',
+ ],
+ },
+}
diff --git a/src/test/modules/test_session_buffer_usage/sql/test_session_buffer_usage.sql b/src/test/modules/test_session_buffer_usage/sql/test_session_buffer_usage.sql
new file mode 100644
index 0000000000000..daf2159c4a653
--- /dev/null
+++ b/src/test/modules/test_session_buffer_usage/sql/test_session_buffer_usage.sql
@@ -0,0 +1,245 @@
+LOAD 'test_session_buffer_usage';
+CREATE EXTENSION test_session_buffer_usage;
+
+-- Verify all columns are non-negative
+SELECT count(*) = 1 AS ok FROM test_session_buffer_usage()
+WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0
+ AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0
+ AND local_blks_hit >= 0 AND local_blks_read >= 0
+ AND local_blks_dirtied >= 0 AND local_blks_written >= 0
+ AND temp_blks_read >= 0 AND temp_blks_written >= 0
+ AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0
+ AND local_blk_read_time >= 0 AND local_blk_write_time >= 0
+ AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0;
+
+-- Verify counters increase after buffer activity
+SELECT test_session_buffer_usage_reset();
+
+CREATE TEMP TABLE test_buf_activity (id int, data text);
+INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i;
+SELECT count(*) FROM test_buf_activity;
+
+SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased
+FROM test_session_buffer_usage();
+
+DROP TABLE test_buf_activity;
+
+-- Parallel query test
+CREATE TABLE par_dc_tab (a int, b char(200));
+INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+
+SELECT count(*) FROM par_dc_tab;
+
+-- Measure serial scan delta (leader does all the work)
+SET max_parallel_workers_per_gather = 0;
+
+SELECT test_session_buffer_usage_reset();
+SELECT count(*) FROM par_dc_tab;
+
+CREATE TEMP TABLE dc_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM test_session_buffer_usage();
+
+-- Measure parallel scan delta with leader NOT participating in scanning.
+-- Workers do all table scanning; leader only runs the Gather node.
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+
+SELECT test_session_buffer_usage_reset();
+SELECT count(*) FROM par_dc_tab;
+
+-- Confirm we got a similar hit counter through parallel worker accumulation
+SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2
+ AS leader_buffers_match
+FROM test_session_buffer_usage(), dc_serial_result s;
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+
+DROP TABLE par_dc_tab, dc_serial_result;
+
+--
+-- Abort/exception tests: verify buffer usage survives various error paths.
+--
+
+-- Rolled-back divide-by-zero under EXPLAIN ANALYZE
+CREATE TEMP TABLE exc_tab (a int, b char(20));
+
+SELECT test_session_buffer_usage_reset();
+
+EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 FROM ins;
+
+SELECT local_blks_dirtied > 0 AS exception_buffers_visible
+FROM test_session_buffer_usage();
+
+DROP TABLE exc_tab;
+
+-- Unique constraint violation in regular query
+CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20));
+INSERT INTO unique_tab VALUES (1, 'first');
+
+SELECT test_session_buffer_usage_reset();
+INSERT INTO unique_tab VALUES (1, 'duplicate');
+
+SELECT local_blks_hit > 0 AS unique_violation_buffers_visible
+FROM test_session_buffer_usage();
+
+DROP TABLE unique_tab;
+
+-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION)
+CREATE TEMP TABLE subxact_tab (a int, b char(20));
+
+CREATE FUNCTION subxact_exc_func() RETURNS text AS $$
+BEGIN
+ BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a)
+ SELECT a / 0 FROM ins';
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT test_session_buffer_usage_reset();
+SELECT subxact_exc_func();
+
+SELECT local_blks_dirtied > 0 AS subxact_buffers_visible
+FROM test_session_buffer_usage();
+
+DROP FUNCTION subxact_exc_func;
+DROP TABLE subxact_tab;
+
+-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking
+CREATE TEMP TABLE cursor_tab (a int, b char(200));
+INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i;
+
+CREATE FUNCTION cursor_exc_func() RETURNS text AS $$
+DECLARE
+ rec record;
+ cnt int := 0;
+BEGIN
+ BEGIN
+ FOR rec IN SELECT * FROM cursor_tab LOOP
+ cnt := cnt + 1;
+ IF cnt = 250 THEN
+ PERFORM 1 / 0;
+ END IF;
+ END LOOP;
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught after ' || cnt || ' rows';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT test_session_buffer_usage_reset();
+SELECT cursor_exc_func();
+
+SELECT local_blks_hit + local_blks_read > 0
+ AS cursor_subxact_buffers_visible
+FROM test_session_buffer_usage();
+
+DROP FUNCTION cursor_exc_func;
+DROP TABLE cursor_tab;
+
+-- Trigger abort under EXPLAIN ANALYZE: verify that buffer activity from a
+-- trigger that throws an error is still properly propagated.
+CREATE TEMP TABLE trig_err_tab (a int);
+CREATE TEMP TABLE trig_work_tab (a int, b char(200));
+INSERT INTO trig_work_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i;
+
+-- Warm local buffers so trig_work_tab reads become hits
+SELECT count(*) FROM trig_work_tab;
+
+CREATE FUNCTION trig_err_func() RETURNS trigger AS $$
+BEGIN
+ PERFORM count(*) FROM trig_work_tab;
+ RAISE EXCEPTION 'trigger error';
+ RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE TRIGGER trig_err BEFORE INSERT ON trig_err_tab
+ FOR EACH ROW EXECUTE FUNCTION trig_err_func();
+
+-- Measure how many local buffer hits a scan of trig_work_tab produces
+SELECT test_session_buffer_usage_reset();
+SELECT count(*) FROM trig_work_tab;
+
+CREATE TEMP TABLE trig_serial_result AS
+SELECT local_blks_hit AS serial_hits FROM test_session_buffer_usage();
+
+-- Now trigger the same scan via a trigger that errors
+SELECT test_session_buffer_usage_reset();
+EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ INSERT INTO trig_err_tab VALUES (1);
+
+-- The trigger scanned trig_work_tab but errored before InstrStopTrigger ran.
+-- InstrStopFinalize in the PG_CATCH ensures buffer data is still propagated.
+SELECT local_blks_hit >= s.serial_hits / 2
+ AS trigger_abort_buffers_propagated
+FROM test_session_buffer_usage(), trig_serial_result s;
+
+DROP TABLE trig_err_tab, trig_work_tab, trig_serial_result;
+DROP FUNCTION trig_err_func;
+
+-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort.
+--
+-- When a parallel worker aborts, InstrEndParallelQuery and
+-- ExecParallelReportInstrumentation never run, so the worker's buffer
+-- activity is never written to shared memory, despite the information having been
+-- captured by the res owner release instrumentation handling.
+CREATE TABLE par_abort_tab (a int, b char(200));
+INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+
+-- Warm shared buffers so all reads become hits
+SELECT count(*) FROM par_abort_tab;
+
+-- Measure serial scan delta as a reference (leader reads all blocks)
+SET max_parallel_workers_per_gather = 0;
+
+SELECT test_session_buffer_usage_reset();
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+
+CREATE TABLE par_abort_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM test_session_buffer_usage();
+
+-- Now force parallel with leader NOT participating in scanning
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently
+
+SELECT test_session_buffer_usage_reset();
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+
+RESET debug_parallel_query;
+
+-- Workers scanned the table but aborted before reporting stats back.
+-- The leader's delta should be much less than a serial scan, documenting
+-- that worker buffer activity is lost on abort.
+SELECT shared_blks_hit < s.serial_delta / 2
+ AS worker_abort_buffers_not_propagated
+FROM test_session_buffer_usage(), par_abort_serial_result s;
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+
+DROP TABLE par_abort_tab, par_abort_serial_result;
+
+-- Cleanup
+DROP EXTENSION test_session_buffer_usage;
diff --git a/src/test/modules/test_session_buffer_usage/test_session_buffer_usage--1.0.sql b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage--1.0.sql
new file mode 100644
index 0000000000000..e9833be470ae5
--- /dev/null
+++ b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage--1.0.sql
@@ -0,0 +1,31 @@
+/* src/test/modules/test_session_buffer_usage/test_session_buffer_usage--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_session_buffer_usage" to load this file. \quit
+
+CREATE FUNCTION test_session_buffer_usage(
+ OUT shared_blks_hit bigint,
+ OUT shared_blks_read bigint,
+ OUT shared_blks_dirtied bigint,
+ OUT shared_blks_written bigint,
+ OUT local_blks_hit bigint,
+ OUT local_blks_read bigint,
+ OUT local_blks_dirtied bigint,
+ OUT local_blks_written bigint,
+ OUT temp_blks_read bigint,
+ OUT temp_blks_written bigint,
+ OUT shared_blk_read_time double precision,
+ OUT shared_blk_write_time double precision,
+ OUT local_blk_read_time double precision,
+ OUT local_blk_write_time double precision,
+ OUT temp_blk_read_time double precision,
+ OUT temp_blk_write_time double precision
+)
+RETURNS record
+AS 'MODULE_PATHNAME', 'test_session_buffer_usage'
+LANGUAGE C PARALLEL RESTRICTED;
+
+CREATE FUNCTION test_session_buffer_usage_reset()
+RETURNS void
+AS 'MODULE_PATHNAME', 'test_session_buffer_usage_reset'
+LANGUAGE C PARALLEL RESTRICTED;
diff --git a/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.c b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.c
new file mode 100644
index 0000000000000..50eb1a2ffe621
--- /dev/null
+++ b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.c
@@ -0,0 +1,95 @@
+/*-------------------------------------------------------------------------
+ *
+ * test_session_buffer_usage.c
+ * show buffer usage statistics for the current session
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ * src/test/modules/test_session_buffer_usage/test_session_buffer_usage.c
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "executor/instrument.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
+
+PG_MODULE_MAGIC_EXT(
+ .name = "test_session_buffer_usage",
+ .version = PG_VERSION
+);
+
+#define NUM_BUFFER_USAGE_COLUMNS 16
+
+PG_FUNCTION_INFO_V1(test_session_buffer_usage);
+PG_FUNCTION_INFO_V1(test_session_buffer_usage_reset);
+
+#define HAVE_INSTR_STACK 1 /* Change to 0 when testing before stack
+ * change */
+
+/*
+ * SQL function: test_session_buffer_usage()
+ *
+ * Returns a single row with all BufferUsage counters accumulated since the
+ * start of the session. Excludes any usage not yet added to the top of the
+ * stack (e.g. if this gets called inside a statement that also had buffer
+ * activity).
+ */
+Datum
+test_session_buffer_usage(PG_FUNCTION_ARGS)
+{
+ TupleDesc tupdesc;
+ Datum values[NUM_BUFFER_USAGE_COLUMNS];
+ bool nulls[NUM_BUFFER_USAGE_COLUMNS];
+ BufferUsage *usage;
+
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ memset(nulls, 0, sizeof(nulls));
+
+#if HAVE_INSTR_STACK
+ usage = &instr_top.bufusage;
+#else
+ usage = &pgBufferUsage;
+#endif
+
+ values[0] = Int64GetDatum(usage->shared_blks_hit);
+ values[1] = Int64GetDatum(usage->shared_blks_read);
+ values[2] = Int64GetDatum(usage->shared_blks_dirtied);
+ values[3] = Int64GetDatum(usage->shared_blks_written);
+ values[4] = Int64GetDatum(usage->local_blks_hit);
+ values[5] = Int64GetDatum(usage->local_blks_read);
+ values[6] = Int64GetDatum(usage->local_blks_dirtied);
+ values[7] = Int64GetDatum(usage->local_blks_written);
+ values[8] = Int64GetDatum(usage->temp_blks_read);
+ values[9] = Int64GetDatum(usage->temp_blks_written);
+ values[10] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time));
+ values[11] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time));
+ values[12] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time));
+ values[13] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time));
+ values[14] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time));
+ values[15] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time));
+
+ PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
+}
+
+/*
+ * SQL function: test_session_buffer_usage_reset()
+ *
+ * Resets all BufferUsage counters on the top instrumentation stack to zero.
+ * Useful in tests to avoid the baseline/delta pattern.
+ */
+Datum
+test_session_buffer_usage_reset(PG_FUNCTION_ARGS)
+{
+#if HAVE_INSTR_STACK
+ memset(&instr_top.bufusage, 0, sizeof(BufferUsage));
+#else
+ memset(&pgBufferUsage, 0, sizeof(BufferUsage));
+#endif
+
+ PG_RETURN_VOID();
+}
diff --git a/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.control b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.control
new file mode 100644
index 0000000000000..41cfb15a7650a
--- /dev/null
+++ b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.control
@@ -0,0 +1,5 @@
+# test_session_buffer_usage extension
+comment = 'show buffer usage statistics for the current session'
+default_version = '1.0'
+module_pathname = '$libdir/test_session_buffer_usage'
+relocatable = true
diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out
index 7c1f26b182cb0..f630acd5f54fc 100644
--- a/src/test/regress/expected/explain.out
+++ b/src/test/regress/expected/explain.out
@@ -822,3 +822,298 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove
(9 rows)
reset work_mem;
+-- Test parallel bitmap heap scan reports per-worker heap block stats.
+CREATE FUNCTION check_parallel_bitmap_heap_scan() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+BEGIN
+ SET LOCAL enable_seqscan = off;
+ SET LOCAL enable_indexscan = off;
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL min_parallel_index_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1 WHERE hundred > 1' INTO plan_json;
+
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL AND node->>'Node Type' != 'Bitmap Heap Scan' LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+
+ RETURN COALESCE((node->>'Exact Heap Blocks')::int, 0) > 0;
+END;
+$$ LANGUAGE plpgsql;
+SELECT check_parallel_bitmap_heap_scan() AS parallel_bitmap_instrumentation;
+ parallel_bitmap_instrumentation
+---------------------------------
+ t
+(1 row)
+
+DROP FUNCTION check_parallel_bitmap_heap_scan;
+-- Test parallel index-only scan reports per-worker index search stats.
+CREATE FUNCTION check_parallel_indexonly_scan() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+BEGIN
+ SET LOCAL enable_seqscan = off;
+ SET LOCAL enable_bitmapscan = off;
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_index_scan_size = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1 WHERE thousand > 95' INTO plan_json;
+
+ -- Drill down to the Index Only Scan node
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL AND node->>'Node Type' != 'Index Only Scan' LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+
+ RETURN COALESCE((node->>'Index Searches')::int, 0) > 0;
+END;
+$$ LANGUAGE plpgsql;
+SELECT check_parallel_indexonly_scan() AS parallel_indexonly_instrumentation;
+ parallel_indexonly_instrumentation
+------------------------------------
+ t
+(1 row)
+
+DROP FUNCTION check_parallel_indexonly_scan;
+-- Test parallel query reports similar buffer stats to a serial run
+CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$
+DECLARE
+ plan_json json;
+ serial_buffers int;
+ parallel_buffers int;
+ node json;
+BEGIN
+ -- Serial --
+ SET LOCAL max_parallel_workers_per_gather = 0;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ serial_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ -- Parallel --
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ parallel_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1));
+END;
+$$ LANGUAGE plpgsql;
+SELECT * FROM check_parallel_explain_buffers();
+ ratio
+-------
+ 1
+(1 row)
+
+DROP FUNCTION check_parallel_explain_buffers;
+-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL
+-- EXCEPTION blocks, even after subtransaction rollback.
+CREATE TEMP TABLE explain_exc_tab (a int, b char(20));
+INSERT INTO explain_exc_tab VALUES (0, 'zzz');
+CREATE FUNCTION explain_exc_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT explain_exc_func()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+SELECT check_explain_exception_buffers() AS exception_buffers_visible;
+ exception_buffers_visible
+---------------------------
+ t
+(1 row)
+
+-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation)
+CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT check_explain_exception_buffers()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible;
+ exception_buffers_nested_visible
+----------------------------------
+ t
+(1 row)
+
+DROP FUNCTION check_explain_exception_buffers_nested;
+DROP FUNCTION check_explain_exception_buffers;
+DROP FUNCTION explain_exc_func;
+DROP TABLE explain_exc_tab;
+-- Cursor instrumentation test.
+-- Verify that buffer usage is correctly tracked through cursor execution paths.
+-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun
+-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down
+-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor
+-- scan should be correctly reported.
+CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1;
+CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$
+DECLARE
+ plan_json json;
+ node json;
+ direct_buf int;
+ noscroll_buf int;
+ scroll_buf int;
+BEGIN
+ -- Direct scan: get leaf Seq Scan node buffers as baseline
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT * FROM cursor_buf_test' INTO plan_json;
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+ direct_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_noscroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ noscroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Scrollable cursor path: ExecShutdownNode is skipped
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_scroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ scroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Both cursor paths should report buffer counts about as high as
+ -- the direct scan (same data plus minor catalog overhead), and not
+ -- double-counted (< 2x the direct scan)
+ RETURN QUERY SELECT
+ (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2),
+ (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2);
+END;
+$$ LANGUAGE plpgsql;
+SELECT * FROM check_cursor_explain_buffers();
+ noscroll_ok | scroll_ok
+-------------+-----------
+ t | t
+(1 row)
+
+DROP FUNCTION check_cursor_explain_buffers;
+DROP FUNCTION cursor_noscroll_scan;
+DROP FUNCTION cursor_scroll_scan;
+DROP TABLE cursor_buf_test;
+-- Test trigger instrumentation.
+CREATE TEMP TABLE trig_test_tab (a int);
+CREATE TEMP TABLE trig_work_tab (a int);
+INSERT INTO trig_work_tab VALUES (1);
+CREATE FUNCTION trig_test_func() RETURNS trigger AS $$
+BEGIN
+ PERFORM * FROM trig_work_tab;
+ RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+CREATE TRIGGER trig_test_trig
+ BEFORE INSERT ON trig_test_tab
+ FOR EACH ROW EXECUTE FUNCTION trig_test_func();
+CREATE FUNCTION check_trigger_explain_buffers() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ trig json;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ INSERT INTO trig_test_tab VALUES (1)' INTO plan_json;
+ trig := plan_json->0->'Triggers'->0;
+ RETURN COALESCE((trig->>'Calls')::int, 0) > 0;
+END;
+$$ LANGUAGE plpgsql;
+SELECT check_trigger_explain_buffers() AS trigger_buffers_visible;
+ trigger_buffers_visible
+-------------------------
+ t
+(1 row)
+
+DROP FUNCTION check_trigger_explain_buffers;
+DROP TRIGGER trig_test_trig ON trig_test_tab;
+DROP FUNCTION trig_test_func;
+DROP TABLE trig_test_tab;
+DROP TABLE trig_work_tab;
diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql
index ebdab42604beb..74f605739f156 100644
--- a/src/test/regress/sql/explain.sql
+++ b/src/test/regress/sql/explain.sql
@@ -188,3 +188,292 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove
-- Test tuplestore storage usage in Window aggregate (memory and disk case, final result is disk)
select explain_filter('explain (analyze,buffers off,costs off) select sum(n) over(partition by m) from (SELECT n < 3 as m, n from generate_series(1,2500) a(n))');
reset work_mem;
+
+-- Test parallel bitmap heap scan reports per-worker heap block stats.
+CREATE FUNCTION check_parallel_bitmap_heap_scan() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+BEGIN
+ SET LOCAL enable_seqscan = off;
+ SET LOCAL enable_indexscan = off;
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL min_parallel_index_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1 WHERE hundred > 1' INTO plan_json;
+
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL AND node->>'Node Type' != 'Bitmap Heap Scan' LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+
+ RETURN COALESCE((node->>'Exact Heap Blocks')::int, 0) > 0;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT check_parallel_bitmap_heap_scan() AS parallel_bitmap_instrumentation;
+
+DROP FUNCTION check_parallel_bitmap_heap_scan;
+
+-- Test parallel index-only scan reports per-worker index search stats.
+CREATE FUNCTION check_parallel_indexonly_scan() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+BEGIN
+ SET LOCAL enable_seqscan = off;
+ SET LOCAL enable_bitmapscan = off;
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_index_scan_size = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1 WHERE thousand > 95' INTO plan_json;
+
+ -- Drill down to the Index Only Scan node
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL AND node->>'Node Type' != 'Index Only Scan' LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+
+ RETURN COALESCE((node->>'Index Searches')::int, 0) > 0;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT check_parallel_indexonly_scan() AS parallel_indexonly_instrumentation;
+
+DROP FUNCTION check_parallel_indexonly_scan;
+
+-- Test parallel query reports similar buffer stats to a serial run
+CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$
+DECLARE
+ plan_json json;
+ serial_buffers int;
+ parallel_buffers int;
+ node json;
+BEGIN
+ -- Serial --
+ SET LOCAL max_parallel_workers_per_gather = 0;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ serial_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ -- Parallel --
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ parallel_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1));
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT * FROM check_parallel_explain_buffers();
+
+DROP FUNCTION check_parallel_explain_buffers;
+
+-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL
+-- EXCEPTION blocks, even after subtransaction rollback.
+CREATE TEMP TABLE explain_exc_tab (a int, b char(20));
+INSERT INTO explain_exc_tab VALUES (0, 'zzz');
+
+CREATE FUNCTION explain_exc_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT explain_exc_func()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT check_explain_exception_buffers() AS exception_buffers_visible;
+
+-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation)
+CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT check_explain_exception_buffers()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible;
+
+DROP FUNCTION check_explain_exception_buffers_nested;
+DROP FUNCTION check_explain_exception_buffers;
+DROP FUNCTION explain_exc_func;
+DROP TABLE explain_exc_tab;
+
+-- Cursor instrumentation test.
+-- Verify that buffer usage is correctly tracked through cursor execution paths.
+-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun
+-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down
+-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor
+-- scan should be correctly reported.
+
+CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1;
+
+CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$
+DECLARE
+ plan_json json;
+ node json;
+ direct_buf int;
+ noscroll_buf int;
+ scroll_buf int;
+BEGIN
+ -- Direct scan: get leaf Seq Scan node buffers as baseline
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT * FROM cursor_buf_test' INTO plan_json;
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+ direct_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_noscroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ noscroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Scrollable cursor path: ExecShutdownNode is skipped
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_scroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ scroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Both cursor paths should report buffer counts about as high as
+ -- the direct scan (same data plus minor catalog overhead), and not
+ -- double-counted (< 2x the direct scan)
+ RETURN QUERY SELECT
+ (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2),
+ (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2);
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT * FROM check_cursor_explain_buffers();
+
+DROP FUNCTION check_cursor_explain_buffers;
+DROP FUNCTION cursor_noscroll_scan;
+DROP FUNCTION cursor_scroll_scan;
+DROP TABLE cursor_buf_test;
+
+-- Test trigger instrumentation.
+CREATE TEMP TABLE trig_test_tab (a int);
+CREATE TEMP TABLE trig_work_tab (a int);
+INSERT INTO trig_work_tab VALUES (1);
+
+CREATE FUNCTION trig_test_func() RETURNS trigger AS $$
+BEGIN
+ PERFORM * FROM trig_work_tab;
+ RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE TRIGGER trig_test_trig
+ BEFORE INSERT ON trig_test_tab
+ FOR EACH ROW EXECUTE FUNCTION trig_test_func();
+
+CREATE FUNCTION check_trigger_explain_buffers() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ trig json;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ INSERT INTO trig_test_tab VALUES (1)' INTO plan_json;
+ trig := plan_json->0->'Triggers'->0;
+ RETURN COALESCE((trig->>'Calls')::int, 0) > 0;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT check_trigger_explain_buffers() AS trigger_buffers_visible;
+
+DROP FUNCTION check_trigger_explain_buffers;
+DROP TRIGGER trig_test_trig ON trig_test_tab;
+DROP FUNCTION trig_test_func;
+DROP TABLE trig_test_tab;
+DROP TABLE trig_work_tab;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index c72f6c595730a..7393926e34d97 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1355,6 +1355,7 @@ InjectionPointSharedState
InjectionPointsCtl
InlineCodeBlock
InsertStmt
+InstrStackState
Instrumentation
Int128AggState
Int8TransTypeData
@@ -1822,6 +1823,7 @@ NextSampleBlock_function
NextSampleTuple_function
NextValueExpr
Node
+NodeInstrumentation
NodeTag
NonEmptyRange
NoneCompressorState
@@ -2476,6 +2478,7 @@ QueryCompletion
QueryDesc
QueryEnvironment
QueryInfo
+QueryInstrumentation
QueryItem
QueryItemType
QueryMode
@@ -3213,6 +3216,7 @@ TriggerDesc
TriggerEvent
TriggerFlags
TriggerInfo
+TriggerInstrumentation
TriggerTransition
TruncateStmt
TsmRoutine
@@ -3435,9 +3439,9 @@ WorkTableScan
WorkTableScanState
WorkerInfo
WorkerInfoData
-WorkerInstrumentation
WorkerJobDumpPtrType
WorkerJobRestorePtrType
+WorkerNodeInstrumentation
Working_State
WriteBufPtrType
WriteBytePtrType