From 2d3490dd99f04d97202eb37f103f929f3dc33162 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 4 Apr 2026 13:49:37 -0400 Subject: [PATCH 01/23] heapam: Keep buffer pins across index scan resets. Avoid dropping the heap page pin (xs_cbuf) and visibility map pin (xs_vmbuffer) within heapam_index_fetch_reset. Retaining these pins saves cycles during certain nested loop joins and merge joins that frequently restore a saved mark: cases where the next tuple fetched after a reset often falls on the same heap page will now avoid the cost of repeated pinning and unpinning. Avoiding dropping the scan's heap page buffer pin is preparation for an upcoming patch that will add I/O prefetching to index scans. Testing of that patch (which makes heapam tend to pin more buffers concurrently than was typical before now) shows that the aforementioned cases get a small but clearly measurable benefit from this optimization. Upcoming work to add a slot-based table AM interface for index scans (which is further preparation for prefetching) will move VM checks for index-only scans out of the executor and into heapam. That will expand the role of xs_vmbuffer to include VM lookups for index-only scans (the field won't just be used for setting pages all-visible during on-access pruning via the enhancement recently introduced by commit b46e1e54). Avoiding dropping the xs_vmbuffer pin will preserve the historical behavior of nodeIndexonlyscan.c, which always kept this pin on a rescan; that aspect of this commit isn't really new. 
Author: Peter Geoghegan Reviewed-By: Andres Freund Discussion: https://postgr.es/m/CAH2-Wz=g=JTSyDB4UtB5su2ZcvsS7VbP+ZMvvaG6ABoCb+s8Lw@mail.gmail.com --- src/backend/access/heap/heapam_indexscan.c | 29 +++++++++++----------- src/backend/access/index/indexam.c | 7 +++--- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/backend/access/heap/heapam_indexscan.c b/src/backend/access/heap/heapam_indexscan.c index bbd8a165ddc23..33d14f1de7d52 100644 --- a/src/backend/access/heap/heapam_indexscan.c +++ b/src/backend/access/heap/heapam_indexscan.c @@ -41,20 +41,13 @@ heapam_index_fetch_begin(Relation rel, uint32 flags) void heapam_index_fetch_reset(IndexFetchTableData *scan) { - IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; - - if (BufferIsValid(hscan->xs_cbuf)) - { - ReleaseBuffer(hscan->xs_cbuf); - hscan->xs_cbuf = InvalidBuffer; - hscan->xs_blk = InvalidBlockNumber; - } - - if (BufferIsValid(hscan->xs_vmbuffer)) - { - ReleaseBuffer(hscan->xs_vmbuffer); - hscan->xs_vmbuffer = InvalidBuffer; - } + /* + * Resets are a no-op. + * + * Deliberately avoid dropping pins now held in xs_cbuf and xs_vmbuffer. + * This saves cycles during certain tight nested loop joins (it can avoid + * repeated pinning and unpinning of the same buffer across rescans). 
+ */ } void @@ -62,7 +55,13 @@ heapam_index_fetch_end(IndexFetchTableData *scan) { IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; - heapam_index_fetch_reset(scan); + /* drop pin if there's a pinned heap page */ + if (BufferIsValid(hscan->xs_cbuf)) + ReleaseBuffer(hscan->xs_cbuf); + + /* drop pin if there's a pinned visibility map page */ + if (BufferIsValid(hscan->xs_vmbuffer)) + ReleaseBuffer(hscan->xs_vmbuffer); pfree(hscan); } diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 44496ae0963e1..23288a4f99490 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -375,7 +375,7 @@ index_rescan(IndexScanDesc scan, Assert(nkeys == scan->numberOfKeys); Assert(norderbys == scan->numberOfOrderBys); - /* Release resources (like buffer pins) from table accesses */ + /* reset table AM state for rescan */ if (scan->xs_heapfetch) table_index_fetch_reset(scan->xs_heapfetch); @@ -452,7 +452,7 @@ index_restrpos(IndexScanDesc scan) SCAN_CHECKS; CHECK_SCAN_PROCEDURE(amrestrpos); - /* release resources (like buffer pins) from table accesses */ + /* reset table AM state for restoring the marked position */ if (scan->xs_heapfetch) table_index_fetch_reset(scan->xs_heapfetch); @@ -578,6 +578,7 @@ index_parallelrescan(IndexScanDesc scan) { SCAN_CHECKS; + /* reset table AM state for rescan */ if (scan->xs_heapfetch) table_index_fetch_reset(scan->xs_heapfetch); @@ -659,7 +660,7 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) /* If we're out of index entries, we're done */ if (!found) { - /* release resources (like buffer pins) from table accesses */ + /* reset table AM state */ if (scan->xs_heapfetch) table_index_fetch_reset(scan->xs_heapfetch); From 33bf7318f94ce730563eb5ed95ad6c61d6e6f7a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Sat, 4 Apr 2026 20:38:26 +0200 Subject: [PATCH 02/23] Make index_concurrently_create_copy more general MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also rename it to index_create_copy. Add a 'boolean concurrent' option, and make it work for both cases: in concurrent mode, just create the catalog entries; caller is responsible for the actual building later. In non-concurrent mode, the index is built right away. This allows it to be reused for other purposes -- specifically, for concurrent REPACK. (With the CONCURRENTLY option, REPACK cannot simply swap the heap file and rebuild its indexes. Instead, it needs to build a separate set of indexes, including their system catalog entries, *before* the actual swap, to reduce the time AccessExclusiveLock needs to be held for. This approach is different from what CREATE INDEX CONCURRENTLY does.) Per a suggestion from Mihail Nikalayeu. Author: Antonin Houska Reviewed-by: Mihail Nikalayeu Reviewed-by: Álvaro Herrera Discussion: https://postgr.es/m/41104.1754922120@localhost --- src/backend/catalog/index.c | 41 ++++++++++++++++++++++---------- src/backend/commands/indexcmds.c | 9 +++---- src/include/catalog/index.h | 7 +++--- 3 files changed, 36 insertions(+), 21 deletions(-) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 1ccfa687f052b..e418d67e8e422 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1289,17 +1289,17 @@ index_create(Relation heapRelation, } /* - * index_concurrently_create_copy + * index_create_copy * - * Create concurrently an index based on the definition of the one provided by - * caller. The index is inserted into catalogs and needs to be built later - * on. This is called during concurrent reindex processing. + * Create an index based on the definition of the one provided by caller. The + * index is inserted into catalogs. If 'concurrently' is TRUE, it needs to be + * built later on; otherwise it's built immediately. * * "tablespaceOid" is the tablespace to use for this index. 
*/ Oid -index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId, - Oid tablespaceOid, const char *newName) +index_create_copy(Relation heapRelation, bool concurrently, + Oid oldIndexId, Oid tablespaceOid, const char *newName) { Relation indexRelation; IndexInfo *oldInfo, @@ -1318,6 +1318,7 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId, List *indexColNames = NIL; List *indexExprs = NIL; List *indexPreds = NIL; + int flags = 0; indexRelation = index_open(oldIndexId, RowExclusiveLock); @@ -1328,7 +1329,7 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId, * Concurrent build of an index with exclusion constraints is not * supported. */ - if (oldInfo->ii_ExclusionOps != NULL) + if (oldInfo->ii_ExclusionOps != NULL && concurrently) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("concurrent index creation for exclusion constraints is not supported"))); @@ -1384,9 +1385,7 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId, } /* - * Build the index information for the new index. Note that rebuild of - * indexes with exclusion constraints is not supported, hence there is no - * need to fill all the ii_Exclusion* fields. + * Build the index information for the new index. */ newInfo = makeIndexInfo(oldInfo->ii_NumIndexAttrs, oldInfo->ii_NumIndexKeyAttrs, @@ -1395,11 +1394,24 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId, indexPreds, oldInfo->ii_Unique, oldInfo->ii_NullsNotDistinct, - false, /* not ready for inserts */ - true, + !concurrently, /* isready */ + concurrently, /* concurrent */ indexRelation->rd_indam->amsummarizing, oldInfo->ii_WithoutOverlaps); + /* fetch exclusion constraint info if any */ + if (indexRelation->rd_index->indisexclusion) + { + /* + * XXX Beware: we're making newInfo point to oldInfo-owned memory. It + * would be more orthodox to palloc+memcpy, but we don't need that + * here at present. 
+ */ + newInfo->ii_ExclusionOps = oldInfo->ii_ExclusionOps; + newInfo->ii_ExclusionProcs = oldInfo->ii_ExclusionProcs; + newInfo->ii_ExclusionStrats = oldInfo->ii_ExclusionStrats; + } + /* * Extract the list of column names and the column numbers for the new * index information. All this information will be used for the index @@ -1436,6 +1448,9 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId, stattargets[i].isnull = isnull; } + if (concurrently) + flags = INDEX_CREATE_SKIP_BUILD | INDEX_CREATE_CONCURRENT; + /* * Now create the new index. * @@ -1459,7 +1474,7 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId, indcoloptions->values, stattargets, reloptionsDatum, - INDEX_CREATE_SKIP_BUILD | INDEX_CREATE_CONCURRENT, + flags, 0, true, /* allow table to be a system catalog? */ false, /* is_internal? */ diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 373e823479466..cba379810c779 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -3989,10 +3989,11 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein tablespaceid = indexRel->rd_rel->reltablespace; /* Create new index definition based on given index */ - newIndexId = index_concurrently_create_copy(heapRel, - idx->indexId, - tablespaceid, - concurrentName); + newIndexId = index_create_copy(heapRel, + true, + idx->indexId, + tablespaceid, + concurrentName); /* * Now open the relation of the new index, a session-level lock is diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index a38e95bc0eb59..ed9e4c37d27a5 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -101,10 +101,9 @@ extern Oid index_create(Relation heapRelation, #define INDEX_CONSTR_CREATE_REMOVE_OLD_DEPS (1 << 4) #define INDEX_CONSTR_CREATE_WITHOUT_OVERLAPS (1 << 5) -extern Oid index_concurrently_create_copy(Relation heapRelation, - Oid oldIndexId, - Oid tablespaceOid, 
- const char *newName); +extern Oid index_create_copy(Relation heapRelation, bool concurrently, + Oid oldIndexId, Oid tablespaceOid, + const char *newName); extern void index_concurrently_build(Oid heapRelationId, Oid indexRelationId); From 69c11f0545a027fdcb32254b546a0cd431823e4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Sun, 5 Apr 2026 00:21:53 +0200 Subject: [PATCH 03/23] Modernize struct declarations in snapbuild.h Just a cosmetic cleanup. --- src/include/replication/snapbuild.h | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h index ccded021433b0..a22a83a2f237c 100644 --- a/src/include/replication/snapbuild.h +++ b/src/include/replication/snapbuild.h @@ -15,6 +15,14 @@ #include "access/xlogdefs.h" #include "utils/snapmgr.h" +/* + * forward declarations in this file + */ +typedef struct ReorderBuffer ReorderBuffer; +typedef struct SnapBuild SnapBuild; +typedef struct xl_heap_new_cid xl_heap_new_cid; +typedef struct xl_running_xacts xl_running_xacts; + /* * Please keep get_snapbuild_state_desc() (located in the pg_logicalinspect * module) updated if a change needs to be made to SnapBuildState. 
@@ -50,20 +58,11 @@ typedef enum SNAPBUILD_CONSISTENT = 2, } SnapBuildState; -/* forward declare so we don't have to include snapbuild_internal.h */ -struct SnapBuild; -typedef struct SnapBuild SnapBuild; - -/* forward declare so we don't have to include reorderbuffer.h */ -struct ReorderBuffer; -/* forward declare so we don't have to include heapam_xlog.h */ -struct xl_heap_new_cid; -struct xl_running_xacts; extern void CheckPointSnapBuild(void); -extern SnapBuild *AllocateSnapshotBuilder(struct ReorderBuffer *reorder, +extern SnapBuild *AllocateSnapshotBuilder(ReorderBuffer *reorder, TransactionId xmin_horizon, XLogRecPtr start_lsn, bool need_full_snapshot, bool in_slot_creation, @@ -91,9 +90,9 @@ extern bool SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn); extern void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn, - struct xl_heap_new_cid *xlrec); + xl_heap_new_cid *xlrec); extern void SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, - struct xl_running_xacts *running); + xl_running_xacts *running); extern void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn); extern bool SnapBuildSnapshotExists(XLogRecPtr lsn); From 2849fe4c978540111748208ba96af7808602c567 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Sun, 5 Apr 2026 08:49:47 +0700 Subject: [PATCH 04/23] Fix unused function warning on Arm platforms Guard definition pg_pmull_available() on compile-time availability of PMULL. Oversight in fbc57f2bc. In passing, remove "inline" hint for consistency. 
Reported-by: Tomas Vondra Discussion: https://postgr.es/m/f153d5a4-a9be-4211-b0b2-7e99b56d68d5@vondra.me --- src/port/pg_crc32c_armv8_choose.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c index 72d70aea1e164..591e23df44b45 100644 --- a/src/port/pg_crc32c_armv8_choose.c +++ b/src/port/pg_crc32c_armv8_choose.c @@ -108,7 +108,8 @@ pg_crc32c_armv8_available(void) #endif } -static inline bool +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK +static bool pg_pmull_available(void) { #if defined(__aarch64__) && defined(HWCAP_PMULL) @@ -128,6 +129,7 @@ pg_pmull_available(void) return false; #endif } +#endif /* USE_PMULL_CRC32C_WITH_RUNTIME_CHECK */ /* * This gets called on the first call. It replaces the function pointer From a9ee66881744d67193b56964b8398e5f83130956 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Sun, 5 Apr 2026 00:43:54 -0400 Subject: [PATCH 05/23] aio: io_uring: Trigger async processing for large IOs io_method=io_uring has a heuristic to trigger asynchronous processing of IOs once the IO depth is a bit larger. That heuristic is important when doing buffered IO from the kernel page cache, to allow parallelizing of the memory copy, as otherwise io_method=io_uring would be a lot slower than io_method=worker in that case. An upcoming commit will make read_stream.c only increase the read-ahead distance if we needed to wait for IO to complete. If to-be-read data is in the kernel page cache, io_uring will synchronously execute IO, unless the IO is flagged as async. Therefore the aforementioned change in read_stream.c heuristic would lead to a substantial performance regression with io_uring when data is in the page cache, as we would never reach a deep enough queue to actually trigger the existing heuristic. Parallelizing the copy from the page cache is mainly important when doing a lot of IO, which commonly is only possible when doing largely sequential IO. 
The reason we don't just mark all io_uring IOs as asynchronous is that the dispatch to a kernel thread has overhead. This overhead is mostly noticeable with small random IOs with a low queue depth, as in that case the gain from parallelizing the memory copy is small and the latency cost high. The facts from the two prior paragraphs show a way out: Use the size of the IO in addition to the depth of the queue to trigger asynchronous processing. One might think that just using the IO size might be enough, but experimentation has shown that not to be the case - with deep look-ahead distances being able to parallelize the memory copy is important even with smaller IOs. Reviewed-by: Melanie Plageman Reviewed-by: Nazir Bilal Yavuz Discussion: https://postgr.es/m/f3xxfrkafjxpyqxywcxricxgyizjirfceychyxsgn7bwjp5eda@kwbduhy7tfmu Discussion: https://postgr.es/m/CA+hUKGL2PhFyDoqrHefqasOnaXhSg48t1phs3VM8BAdrZqKZkw@mail.gmail.com --- src/backend/storage/aio/method_io_uring.c | 91 +++++++++++++++++------ 1 file changed, 69 insertions(+), 22 deletions(-) diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c index 39984df31b458..9f76d2683c0c9 100644 --- a/src/backend/storage/aio/method_io_uring.c +++ b/src/backend/storage/aio/method_io_uring.c @@ -409,7 +409,6 @@ static int pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios) { struct io_uring *uring_instance = &pgaio_my_uring_context->io_uring_ring; - int in_flight_before = dclist_count(&pgaio_my_backend->in_flight_ios); Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE); @@ -425,27 +424,6 @@ pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios) pgaio_io_prepare_submit(ioh); pgaio_uring_sq_from_io(ioh, sqe); - - /* - * io_uring executes IO in process context if possible. That's - * generally good, as it reduces context switching. 
When performing a - * lot of buffered IO that means that copying between page cache and - * userspace memory happens in the foreground, as it can't be - * offloaded to DMA hardware as is possible when using direct IO. When - * executing a lot of buffered IO this causes io_uring to be slower - * than worker mode, as worker mode parallelizes the copying. io_uring - * can be told to offload work to worker threads instead. - * - * If an IO is buffered IO and we already have IOs in flight or - * multiple IOs are being submitted, we thus tell io_uring to execute - * the IO in the background. We don't do so for the first few IOs - * being submitted as executing in this process' context has lower - * latency. - */ - if (in_flight_before > 4 && (ioh->flags & PGAIO_HF_BUFFERED)) - io_uring_sqe_set_flags(sqe, IOSQE_ASYNC); - - in_flight_before++; } while (true) @@ -701,10 +679,65 @@ pgaio_uring_check_one(PgAioHandle *ioh, uint64 ref_generation) LWLockRelease(&owner_context->completion_lock); } +/* + * io_uring executes IO in process context if possible. That's generally good, + * as it reduces context switching. When performing a lot of buffered IO that + * means that copying between page cache and userspace memory happens in the + * foreground, as it can't be offloaded to DMA hardware as is possible when + * using direct IO. When executing a lot of buffered IO this causes io_uring + * to be slower than worker mode, as worker mode parallelizes the + * copying. io_uring can be told to offload work to worker threads instead. + * + * If the IOs are small, we only benefit from forcing things into the + * background if there is a lot of IO, as otherwise the overhead from context + * switching is higher than the gain. + * + * If IOs are large, there is benefit from asynchronous processing at lower + * queue depths, as IO latency is less of a crucial factor and parallelizing + * memory copies is more important. 
In addition, it is important to trigger + * asynchronous processing even at low queue depth, as with foreground + * processing we might never actually reach deep enough IO depths to trigger + * asynchronous processing, which in turn would deprive readahead control + * logic of information about whether a deeper look-ahead distance would be + * advantageous. + * + * We have done some basic benchmarking to validate the thresholds used, but + * it's quite plausible that there are better values. See + * https://postgr.es/m/3gkuvs3lz3u3skuaxfkxnsysfqslf2srigl6546vhesekve6v2%40va3r5esummvg + * for some details of this benchmarking. + */ +static bool +pgaio_uring_should_use_async(PgAioHandle *ioh, size_t io_size) +{ + /* + * With DIO there's no benefit from forcing asynchronous processing, as + * io_uring will never execute direct IO synchronously during submission. + */ + if (!(ioh->flags & PGAIO_HF_BUFFERED)) + return false; + + /* + * Once the IO queue depth is not that shallow anymore, the overhead of + * dispatching to the background is a less significant factor. + */ + if (dclist_count(&pgaio_my_backend->in_flight_ios) > 4) + return true; + + /* + * If the IO is larger, the gains from parallelizing the memory copy are + * larger and typically the impact of the latency is smaller. 
+ */ + if (io_size >= (BLCKSZ * 4)) + return true; + + return false; +} + static void pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe) { struct iovec *iov; + size_t io_size = 0; switch ((PgAioOp) ioh->op) { @@ -717,6 +750,8 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe) iov->iov_base, iov->iov_len, ioh->op_data.read.offset); + + io_size = iov->iov_len; } else { @@ -726,7 +761,13 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe) ioh->op_data.read.iov_length, ioh->op_data.read.offset); + for (int i = 0; i < ioh->op_data.read.iov_length; i++, iov++) + io_size += iov->iov_len; } + + if (pgaio_uring_should_use_async(ioh, io_size)) + io_uring_sqe_set_flags(sqe, IOSQE_ASYNC); + break; case PGAIO_OP_WRITEV: @@ -747,6 +788,12 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe) ioh->op_data.write.iov_length, ioh->op_data.write.offset); } + + /* + * For now don't trigger use of IOSQE_ASYNC for writes, it's not + * clear there is a performance benefit in doing so. + */ + break; case PGAIO_OP_INVALID: From 434dab76ba76fba5dacab2dc695b6b3d5df8315b Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Sun, 5 Apr 2026 00:43:54 -0400 Subject: [PATCH 06/23] read_stream: Move logic about IO combining & issuing to helpers The long if statements were hard to read and hard to document. Splitting them into inline helpers makes it much easier to explain each part separately. This is done in preparation for making the logic more complicated... 
Reviewed-by: Melanie Plageman Reviewed-by: Nazir Bilal Yavuz Discussion: https://postgr.es/m/f3xxfrkafjxpyqxywcxricxgyizjirfceychyxsgn7bwjp5eda@kwbduhy7tfmu --- src/backend/storage/aio/read_stream.c | 97 ++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 17 deletions(-) diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 31f9e35dee310..4a7a271c3e680 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -440,6 +440,78 @@ read_stream_start_pending_read(ReadStream *stream) return true; } +/* + * Should we continue to perform look ahead? Looking ahead may allow us to + * make the pending IO larger via IO combining or to issue more read-ahead. + */ +static inline bool +read_stream_should_look_ahead(ReadStream *stream) +{ + /* If the callback has signaled end-of-stream, we're done */ + if (stream->distance == 0) + return false; + + /* never start more IOs than our cap */ + if (stream->ios_in_progress >= stream->max_ios) + return false; + + /* + * Don't start more read-ahead if that'd put us over the distance limit + * for doing read-ahead. As stream->distance is capped by + * max_pinned_buffers, this prevents us from looking ahead so far that it + * would put us over the pin limit. + */ + if (stream->pinned_buffers + stream->pending_read_nblocks >= stream->distance) + return false; + + return true; +} + +/* + * We don't start the pending read just because we've hit the distance limit, + * preferring to give it another chance to grow to full io_combine_limit size + * once more buffers have been consumed. But this is not desirable in all + * situations - see below. 
+ */ +static inline bool +read_stream_should_issue_now(ReadStream *stream) +{ + int16 pending_read_nblocks = stream->pending_read_nblocks; + + /* there is no pending IO that could be issued */ + if (pending_read_nblocks == 0) + return false; + + /* never start more IOs than our cap */ + if (stream->ios_in_progress >= stream->max_ios) + return false; + + /* + * If the callback has signaled end-of-stream, start the pending read + * immediately. There is no further potential for IO combining. + */ + if (stream->distance == 0) + return true; + + /* + * If we've already reached io_combine_limit, there's no chance of growing + * the read further. + */ + if (pending_read_nblocks >= stream->io_combine_limit) + return true; + + /* + * If we currently have no reads in flight or prepared, issue the IO once + * we are not looking ahead further. This ensures there's always at least + * one IO prepared. + */ + if (stream->pinned_buffers == 0 && + !read_stream_should_look_ahead(stream)) + return true; + + return false; +} + static void read_stream_look_ahead(ReadStream *stream) { @@ -452,14 +524,13 @@ read_stream_look_ahead(ReadStream *stream) if (stream->batch_mode) pgaio_enter_batchmode(); - while (stream->ios_in_progress < stream->max_ios && - stream->pinned_buffers + stream->pending_read_nblocks < stream->distance) + while (read_stream_should_look_ahead(stream)) { BlockNumber blocknum; int16 buffer_index; void *per_buffer_data; - if (stream->pending_read_nblocks == stream->io_combine_limit) + if (read_stream_should_issue_now(stream)) { read_stream_start_pending_read(stream); continue; @@ -511,21 +582,13 @@ read_stream_look_ahead(ReadStream *stream) } /* - * We don't start the pending read just because we've hit the distance - * limit, preferring to give it another chance to grow to full - * io_combine_limit size once more buffers have been consumed. 
However, - * if we've already reached io_combine_limit, or we've reached the - * distance limit and there isn't anything pinned yet, or the callback has - * signaled end-of-stream, we start the read immediately. Note that the - * pending read can exceed the distance goal, if the latter was reduced - * after hitting the per-backend buffer limit. + * Check if the pending read should be issued now, or if we should give it + * another chance to grow to the full size. + * + * Note that the pending read can exceed the distance goal, if the latter + * was reduced after hitting the per-backend buffer limit. */ - if (stream->pending_read_nblocks > 0 && - (stream->pending_read_nblocks == stream->io_combine_limit || - (stream->pending_read_nblocks >= stream->distance && - stream->pinned_buffers == 0) || - stream->distance == 0) && - stream->ios_in_progress < stream->max_ios) + if (read_stream_should_issue_now(stream)) read_stream_start_pending_read(stream); /* From 8ca147d582a5a9f3345478654408c46314758b50 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Sun, 5 Apr 2026 00:43:54 -0400 Subject: [PATCH 07/23] read stream: Split decision about look ahead for AIO and combining In a subsequent commit the read-ahead distance will only be increased when waiting for IO. Without further work that would cause a regression: As IO combining and read-ahead are currently controlled by the same mechanism, we would end up not allowing IO combining when never needing to wait for IO (as the distance ends up too small to allow for full sized IOs), which can increase CPU overhead. A typical reason to not have to wait for IO completion at a low look-ahead distance is use of io_uring with the to-be-read data in the page cache. But even with worker the IO submission rate may be low enough for the worker to keep up. 
One might think that we could just always perform IO combining, but doing so at the start of a scan can cause performance regressions: 1) Performing a large IO commonly has a higher latency than smaller IOs. That is not a problem once reading ahead far enough, but at the start of a stream it can lead to longer waits for IO completion. 2) Sometimes read streams will not be read to completion. Immediately starting with full sized IOs leads to more wasted effort. This is not commonly an issue with existing read stream users, but the upcoming use of read streams to fetch table pages as part of an index scan frequently encounters this. Solve this issue by splitting ReadStream->distance into ->combine_distance and ->readahead_distance. Right now they are increased/decreased at the same time, but that will change in the next commit. One of the comments in read_stream_should_look_ahead() refers to a motivation that only really exists as of the next commit, but without it the code doesn't make sense on its own. Reviewed-by: Melanie Plageman Reviewed-by: Nazir Bilal Yavuz Discussion: https://postgr.es/m/f3xxfrkafjxpyqxywcxricxgyizjirfceychyxsgn7bwjp5eda@kwbduhy7tfmu Discussion: https://postgr.es/m/CA+hUKGL2PhFyDoqrHefqasOnaXhSg48t1phs3VM8BAdrZqKZkw@mail.gmail.com --- src/backend/storage/aio/read_stream.c | 164 ++++++++++++++++++++------ 1 file changed, 129 insertions(+), 35 deletions(-) diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 4a7a271c3e680..37c3921450b5f 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -98,10 +98,23 @@ struct ReadStream int16 max_pinned_buffers; int16 forwarded_buffers; int16 pinned_buffers; - int16 distance; + + /* + * Limit of how far, in blocks, to look-ahead for IO combining and for + * read-ahead. 
+ * + * The limits for read-ahead and combining are handled separately to allow + * for IO combining even in cases where the I/O subsystem can keep up at a + * low read-ahead distance, as doing larger IOs is more efficient. + * + * Set to 0 when the end of the stream is reached. + */ + int16 combine_distance; + int16 readahead_distance; uint16 distance_decay_holdoff; int16 initialized_buffers; - int16 resume_distance; + int16 resume_readahead_distance; + int16 resume_combine_distance; int read_buffers_flags; bool sync_mode; /* using io_method=sync */ bool batch_mode; /* READ_STREAM_USE_BATCHING */ @@ -332,8 +345,8 @@ read_stream_start_pending_read(ReadStream *stream) /* Shrink distance: no more look-ahead until buffers are released. */ new_distance = stream->pinned_buffers + buffer_limit; - if (stream->distance > new_distance) - stream->distance = new_distance; + if (stream->readahead_distance > new_distance) + stream->readahead_distance = new_distance; /* Unless we have nothing to give the consumer, stop here. */ if (stream->pinned_buffers > 0) @@ -374,12 +387,29 @@ read_stream_start_pending_read(ReadStream *stream) * perform IO asynchronously when starting out with a small look-ahead * distance. */ - if (stream->distance > 1 && stream->ios_in_progress == 0) + if (stream->ios_in_progress == 0) { - if (stream->distance_decay_holdoff == 0) - stream->distance--; - else + if (stream->distance_decay_holdoff > 0) stream->distance_decay_holdoff--; + else + { + if (stream->readahead_distance > 1) + stream->readahead_distance--; + + /* + * For now we reduce the IO combine distance after + * sufficiently many buffer hits. 
There is no clear + * performance argument for doing so, but at the moment we + * need to do so to make the entrance into fast_path work + * correctly: We require combine_distance == 1 to enter + * fast-path, as without that condition we would wrongly + * re-enter fast-path when readahead_distance == 1 and + * pinned_buffers == 1, as we would not yet have prepared + * another IO in that situation. + */ + if (stream->combine_distance > 1) + stream->combine_distance--; + } } } else @@ -448,20 +478,56 @@ static inline bool read_stream_should_look_ahead(ReadStream *stream) { /* If the callback has signaled end-of-stream, we're done */ - if (stream->distance == 0) + if (stream->readahead_distance == 0) return false; /* never start more IOs than our cap */ if (stream->ios_in_progress >= stream->max_ios) return false; + /* + * Allow looking further ahead if we are in the process of building a + * larger IO, the IO is not yet big enough, and we don't yet have IO in + * flight. + * + * We do so to allow building larger reads when readahead_distance is + * small (e.g. because the I/O subsystem is keeping up or + * effective_io_concurrency is small). That's a useful goal because larger + * reads are more CPU efficient than smaller reads, even if the system is + * not IO bound. + * + * The reason we do *not* do so when we already have a read prepared (i.e. + * why we check for pinned_buffers == 0) is once we are actually reading + * ahead, we don't need it: + * + * - We won't issue unnecessarily small reads as + * read_stream_should_issue_now() will return false until the IO is + * suitably sized. The issuance of the pending read will be delayed until + * enough buffers have been consumed. + * + * - If we are not reading ahead aggressively enough, future + * WaitReadBuffers() calls will return true, leading to readahead_distance + * being increased. After that more full-sized IOs can be issued. 
+ * + * Furthermore, if we did not have the pinned_buffers == 0 condition, we + * might end up issuing I/O more aggressively than we need. + * + * Note that a return of true here can lead to exceeding the read-ahead + * limit, but we won't exceed the buffer pin limit (because pinned_buffers + * == 0 and combine_distance is capped by max_pinned_buffers). + */ + if (stream->pending_read_nblocks > 0 && + stream->pinned_buffers == 0 && + stream->pending_read_nblocks < stream->combine_distance) + return true; + /* * Don't start more read-ahead if that'd put us over the distance limit - * for doing read-ahead. As stream->distance is capped by + * for doing read-ahead. As stream->readahead_distance is capped by * max_pinned_buffers, this prevents us from looking ahead so far that it * would put us over the pin limit. */ - if (stream->pinned_buffers + stream->pending_read_nblocks >= stream->distance) + if (stream->pinned_buffers + stream->pending_read_nblocks >= stream->readahead_distance) return false; return true; @@ -490,14 +556,14 @@ read_stream_should_issue_now(ReadStream *stream) * If the callback has signaled end-of-stream, start the pending read * immediately. There is no further potential for IO combining. */ - if (stream->distance == 0) + if (stream->readahead_distance == 0) return true; /* - * If we've already reached io_combine_limit, there's no chance of growing + * If we've already reached combine_distance, there's no chance of growing * the read further. */ - if (pending_read_nblocks >= stream->io_combine_limit) + if (pending_read_nblocks >= stream->combine_distance) return true; /* @@ -550,7 +616,8 @@ read_stream_look_ahead(ReadStream *stream) if (blocknum == InvalidBlockNumber) { /* End of stream. */ - stream->distance = 0; + stream->readahead_distance = 0; + stream->combine_distance = 0; break; } @@ -597,7 +664,7 @@ read_stream_look_ahead(ReadStream *stream) * stream. In the worst case we can always make progress one buffer at a * time. 
*/ - Assert(stream->pinned_buffers > 0 || stream->distance == 0); + Assert(stream->pinned_buffers > 0 || stream->readahead_distance == 0); if (stream->batch_mode) pgaio_exit_batchmode(); @@ -787,10 +854,17 @@ read_stream_begin_impl(int flags, * doing full io_combine_limit sized reads. */ if (flags & READ_STREAM_FULL) - stream->distance = Min(max_pinned_buffers, stream->io_combine_limit); + { + stream->readahead_distance = Min(max_pinned_buffers, stream->io_combine_limit); + stream->combine_distance = Min(max_pinned_buffers, stream->io_combine_limit); + } else - stream->distance = 1; - stream->resume_distance = stream->distance; + { + stream->readahead_distance = 1; + stream->combine_distance = 1; + } + stream->resume_readahead_distance = stream->readahead_distance; + stream->resume_combine_distance = stream->combine_distance; /* * Since we always access the same relation, we can initialize parts of @@ -889,7 +963,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) Assert(stream->ios_in_progress == 0); Assert(stream->forwarded_buffers == 0); Assert(stream->pinned_buffers == 1); - Assert(stream->distance == 1); + Assert(stream->readahead_distance == 1); + Assert(stream->combine_distance == 1); Assert(stream->pending_read_nblocks == 0); Assert(stream->per_buffer_data_size == 0); Assert(stream->initialized_buffers > stream->oldest_buffer_index); @@ -963,7 +1038,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) else { /* No more blocks, end of stream. */ - stream->distance = 0; + stream->readahead_distance = 0; + stream->combine_distance = 0; stream->oldest_buffer_index = stream->next_buffer_index; stream->pinned_buffers = 0; stream->buffers[oldest_buffer_index] = InvalidBuffer; @@ -979,7 +1055,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) Assert(stream->oldest_buffer_index == stream->next_buffer_index); /* End of stream reached? 
*/ - if (stream->distance == 0) + if (stream->readahead_distance == 0) return InvalidBuffer; /* @@ -993,7 +1069,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) /* End of stream reached? */ if (stream->pinned_buffers == 0) { - Assert(stream->distance == 0); + Assert(stream->readahead_distance == 0); return InvalidBuffer; } } @@ -1014,7 +1090,10 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) stream->ios[stream->oldest_io_index].buffer_index == oldest_buffer_index) { int16 io_index = stream->oldest_io_index; - int32 distance; /* wider temporary value, clamped below */ + + /* wider temporary values, clamped below */ + int32 readahead_distance; + int32 combine_distance; /* Sanity check that we still agree on the buffers. */ Assert(stream->ios[io_index].op.buffers == @@ -1027,10 +1106,18 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) if (++stream->oldest_io_index == stream->max_ios) stream->oldest_io_index = 0; - /* Look-ahead distance ramps up rapidly after we do I/O. */ - distance = stream->distance * 2; - distance = Min(distance, stream->max_pinned_buffers); - stream->distance = distance; + /* + * Read-ahead and IO combining distances ramp up rapidly after we do + * I/O. 
+ */ + readahead_distance = stream->readahead_distance * 2; + readahead_distance = Min(readahead_distance, stream->max_pinned_buffers); + stream->readahead_distance = readahead_distance; + + combine_distance = stream->combine_distance * 2; + combine_distance = Min(combine_distance, stream->io_combine_limit); + combine_distance = Min(combine_distance, stream->max_pinned_buffers); + stream->combine_distance = combine_distance; /* * As we needed IO, prevent distance from being reduced within our @@ -1111,7 +1198,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) if (stream->ios_in_progress == 0 && stream->forwarded_buffers == 0 && stream->pinned_buffers == 1 && - stream->distance == 1 && + stream->readahead_distance == 1 && + stream->combine_distance == 1 && stream->pending_read_nblocks == 0 && stream->per_buffer_data_size == 0) { @@ -1157,8 +1245,10 @@ read_stream_next_block(ReadStream *stream, BufferAccessStrategy *strategy) BlockNumber read_stream_pause(ReadStream *stream) { - stream->resume_distance = stream->distance; - stream->distance = 0; + stream->resume_readahead_distance = stream->readahead_distance; + stream->resume_combine_distance = stream->combine_distance; + stream->readahead_distance = 0; + stream->combine_distance = 0; return InvalidBlockNumber; } @@ -1170,7 +1260,8 @@ read_stream_pause(ReadStream *stream) void read_stream_resume(ReadStream *stream) { - stream->distance = stream->resume_distance; + stream->readahead_distance = stream->resume_readahead_distance; + stream->combine_distance = stream->resume_combine_distance; } /* @@ -1186,7 +1277,8 @@ read_stream_reset(ReadStream *stream) Buffer buffer; /* Stop looking ahead. */ - stream->distance = 0; + stream->readahead_distance = 0; + stream->combine_distance = 0; /* Forget buffered block number and fast path state. 
*/ stream->buffered_blocknum = InvalidBlockNumber; @@ -1218,8 +1310,10 @@ read_stream_reset(ReadStream *stream) Assert(stream->ios_in_progress == 0); /* Start off assuming data is cached. */ - stream->distance = 1; - stream->resume_distance = stream->distance; + stream->readahead_distance = 1; + stream->combine_distance = 1; + stream->resume_readahead_distance = stream->readahead_distance; + stream->resume_combine_distance = stream->combine_distance; stream->distance_decay_holdoff = 0; } From f63ca3379025ee4547865182da6cae14aec35d58 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Sun, 5 Apr 2026 00:43:54 -0400 Subject: [PATCH 08/23] read_stream: Only increase read-ahead distance when waiting for IO This avoids increasing the distance to the maximum in cases where the I/O subsystem is already keeping up. This turns out to be important for performance for two reasons: - Pinning a lot of buffers is not cheap. If additional pins allow us to avoid IO waits, it's definitely worth it, but if we can already do all the necessary readahead at a distance of 16, reading ahead 512 buffers can increase the CPU overhead substantially. This is particularly noticeable when the to-be-read blocks are already in the kernel page cache. - If the read stream is read to completion, reading in data earlier than needed is of limited consequences, leaving aside the CPU costs mentioned above. But if the read stream will not be fully consumed, e.g. because it is on the inner side of a nested loop join, the additional IO can be a serious performance issue. This is not that commonly a problem for current read stream users, but the upcoming work, to use a read stream to fetch table pages as part of an index scan, frequently encounters this. 
Note that this commit would have substantial performance downsides without earlier commits: - Commit 6e36930f9aa, which avoids decreasing the readahead distance when there was recent IO, is crucial, as otherwise we very often would end up not reading ahead aggressively enough anymore with this commit, due to increasing the distance less often. - "read stream: Split decision about look ahead for AIO and combining" is important as we would otherwise not perform IO combining when the IO subsystem can keep up. - "aio: io_uring: Trigger async processing for large IOs" is important to continue to benefit from memory copy parallelism when using fewer IOs. Reviewed-by: Melanie Plageman Reviewed-by: Nazir Bilal Yavuz Tested-by: Tomas Vondra Discussion: https://postgr.es/m/f3xxfrkafjxpyqxywcxricxgyizjirfceychyxsgn7bwjp5eda@kwbduhy7tfmu Discussion: https://postgr.es/m/CA+hUKGL2PhFyDoqrHefqasOnaXhSg48t1phs3VM8BAdrZqKZkw@mail.gmail.com --- src/backend/storage/aio/read_stream.c | 89 ++++++++++++++++++++------- 1 file changed, 68 insertions(+), 21 deletions(-) diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 37c3921450b5f..0b6cdf7c8730d 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -18,11 +18,13 @@ * to StartReadBuffers() so that a new one can begin to form. * * The algorithm for controlling the look-ahead distance is based on recent - * cache hit and miss history. When no I/O is necessary, there is no benefit - * in looking ahead more than one block. This is the default initial - * assumption, but when blocks needing I/O are streamed, the distance is - * increased rapidly to try to benefit from I/O combining and concurrency. It - * is reduced gradually when cached blocks are streamed. + * cache / miss history, as well as whether we need to wait for I/O completion + * after a miss. When no I/O is necessary, there is no benefit in looking + * ahead more than one block. 
This is the default initial assumption. When + * blocks needing I/O are streamed, the combine distance is increased to + * benefit from I/O combining and the read-ahead distance is increased + * whenever we need to wait for I/O to try to benefit from increased I/O + * concurrency. Both are reduced gradually when cached blocks are streamed. * * The main data structure is a circular queue of buffers of size * max_pinned_buffers plus some extra space for technical reasons, ready to be @@ -1090,16 +1092,13 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) stream->ios[stream->oldest_io_index].buffer_index == oldest_buffer_index) { int16 io_index = stream->oldest_io_index; - - /* wider temporary values, clamped below */ - int32 readahead_distance; - int32 combine_distance; + bool needed_wait; /* Sanity check that we still agree on the buffers. */ Assert(stream->ios[io_index].op.buffers == &stream->buffers[oldest_buffer_index]); - WaitReadBuffers(&stream->ios[io_index].op); + needed_wait = WaitReadBuffers(&stream->ios[io_index].op); Assert(stream->ios_in_progress > 0); stream->ios_in_progress--; @@ -1107,21 +1106,45 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) stream->oldest_io_index = 0; /* - * Read-ahead and IO combining distances ramp up rapidly after we do - * I/O. + * If the IO was executed synchronously, we will never see + * WaitReadBuffers() block. Treat it as if it did block. This is + * particularly crucial when effective_io_concurrency=0 is used, as + * all IO will be synchronous. Without treating synchronous IO as + * having waited, we'd never allow the distance to get large enough to + * allow for IO combining, resulting in bad performance. + */ + if (stream->ios[io_index].op.flags & READ_BUFFERS_SYNCHRONOUSLY) + needed_wait = true; + + /* + * Have the read-ahead distance ramp up rapidly after we needed to + * wait for IO. 
We only increase the read-ahead-distance when we + * needed to wait, to avoid increasing the distance further than + * necessary, as looking ahead too far can be costly, both due to the + * cost of unnecessarily pinning many buffers and due to doing IOs + * that may never be consumed if the stream is ended/reset before + * completion. + * + * If we did not need to wait, the current distance was evidently + * sufficient. + * + * NB: Must not increase the distance if we already reached the end of + * the stream, as stream->readahead_distance == 0 is used to keep + * track of having reached the end. */ - readahead_distance = stream->readahead_distance * 2; - readahead_distance = Min(readahead_distance, stream->max_pinned_buffers); - stream->readahead_distance = readahead_distance; + if (stream->readahead_distance > 0 && needed_wait) + { + /* wider temporary value, due to overflow risk */ + int32 readahead_distance; - combine_distance = stream->combine_distance * 2; - combine_distance = Min(combine_distance, stream->io_combine_limit); - combine_distance = Min(combine_distance, stream->max_pinned_buffers); - stream->combine_distance = combine_distance; + readahead_distance = stream->readahead_distance * 2; + readahead_distance = Min(readahead_distance, stream->max_pinned_buffers); + stream->readahead_distance = readahead_distance; + } /* - * As we needed IO, prevent distance from being reduced within our - * maximum look-ahead window. This avoids having distance collapse too + * As we needed IO, prevent distances from being reduced within our + * maximum look-ahead window. This avoids collapsing distances too * quickly in workloads where most of the required blocks are cached, * but where the remaining IOs are a sufficient enough factor to cause * a substantial slowdown if executed synchronously. 
@@ -1133,6 +1156,30 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) */ stream->distance_decay_holdoff = stream->max_pinned_buffers; + /* + * Whether we needed to wait or not, allow for more IO combining if we + * needed to do IO. The reason to do so independent of needing to wait + * is that when the data is resident in the kernel page cache, IO + * combining reduces the syscall / dispatch overhead, making it + * worthwhile regardless of needing to wait. + * + * It is also important with io_uring as it will never signal the need + * to wait for reads if all the data is in the page cache. There are + * heuristics to deal with that in method_io_uring.c, but they only + * work when the IO gets large enough. + */ + if (stream->combine_distance > 0 && + stream->combine_distance < stream->io_combine_limit) + { + /* wider temporary value, due to overflow risk */ + int32 combine_distance; + + combine_distance = stream->combine_distance * 2; + combine_distance = Min(combine_distance, stream->io_combine_limit); + combine_distance = Min(combine_distance, stream->max_pinned_buffers); + stream->combine_distance = combine_distance; + } + /* * If we've reached the first block of a sequential region we're * issuing advice for, cancel that until the next jump. The kernel From fc44f106657a0b2f55147d8309a676a6ac555a95 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Sun, 5 Apr 2026 18:01:10 +1200 Subject: [PATCH 09/23] aio: Simplify pgaio_worker_submit(). Merge pgaio_worker_submit_internal() and pgaio_worker_submit(). The separation didn't serve any purpose. 
Reviewed-by: Nazir Bilal Yavuz Discussion: https://postgr.es/m/CA%2BhUKG%2Bm4xV0LMoH2c%3DoRAdEXuCnh%2BtGBTWa7uFeFMGgTLAw%2BQ%40mail.gmail.com --- src/backend/storage/aio/method_worker.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c index efe38e9f1134f..e24357a7a0a23 100644 --- a/src/backend/storage/aio/method_worker.c +++ b/src/backend/storage/aio/method_worker.c @@ -239,8 +239,8 @@ pgaio_worker_needs_synchronous_execution(PgAioHandle *ioh) || !pgaio_io_can_reopen(ioh); } -static void -pgaio_worker_submit_internal(int num_staged_ios, PgAioHandle **staged_ios) +static int +pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios) { PgAioHandle **synchronous_ios = NULL; int nsync = 0; @@ -249,6 +249,9 @@ pgaio_worker_submit_internal(int num_staged_ios, PgAioHandle **staged_ios) Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE); + for (int i = 0; i < num_staged_ios; i++) + pgaio_io_prepare_submit(staged_ios[i]); + if (LWLockConditionalAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE)) { for (int i = 0; i < num_staged_ios; ++i) @@ -299,19 +302,6 @@ pgaio_worker_submit_internal(int num_staged_ios, PgAioHandle **staged_ios) pgaio_io_perform_synchronously(synchronous_ios[i]); } } -} - -static int -pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios) -{ - for (int i = 0; i < num_staged_ios; i++) - { - PgAioHandle *ioh = staged_ios[i]; - - pgaio_io_prepare_submit(ioh); - } - - pgaio_worker_submit_internal(num_staged_ios, staged_ios); return num_staged_ios; } From de28140ded8d4ba00faf905ec3530ffeb8a34a53 Mon Sep 17 00:00:00 2001 From: Etsuro Fujita Date: Sun, 5 Apr 2026 18:55:00 +0900 Subject: [PATCH 10/23] postgres_fdw: Inherit the local transaction's access/deferrable modes. 
READ ONLY transactions should prevent modifications to foreign data as well as local data, but postgres_fdw transactions declared as READ ONLY that reference foreign tables mapped to a remote view executing volatile functions would modify data on remote servers, as it would open remote transactions in READ WRITE mode. Similarly, DEFERRABLE transactions should not abort due to a serialization failure even when accessing foreign data, but postgres_fdw transactions declared as DEFERRABLE would abort due to that failure in a remote server, as it would open remote transactions in NOT DEFERRABLE mode. To fix, modify postgres_fdw to open remote transactions in the same access/deferrable modes as the local transaction. This commit also modifies it to open remote subtransactions in the same access mode as the local subtransaction. This commit changes the behavior of READ ONLY/DEFERRABLE transactions using postgres_fdw; in particular, it doesn't allow the READ ONLY transactions to modify data on remote servers anymore, so such transactions should be redeclared as READ WRITE or rewritten using other tools like dblink. The release notes should note this as an incompatibility. These issues exist since the introduction of postgres_fdw, but to avoid the incompatibility in the back branches, fix them in master only. 
Author: Etsuro Fujita Reviewed-by: Ashutosh Bapat Reviewed-by: Tom Lane Reviewed-by: Fujii Masao Discussion: https://postgr.es/m/CAPmGK16n_hcUUWuOdmeUS%2Bw4Q6dZvTEDHb%3DOP%3D5JBzo-M3QmpQ%40mail.gmail.com Discussion: https://postgr.es/m/E1uLe9X-000zsY-2g%40gemulon.postgresql.org --- contrib/postgres_fdw/connection.c | 124 ++++++++++++++-- .../postgres_fdw/expected/postgres_fdw.out | 136 ++++++++++++++++++ contrib/postgres_fdw/sql/postgres_fdw.sql | 80 +++++++++++ doc/src/sgml/postgres-fdw.sgml | 17 +++ src/backend/access/transam/xact.c | 28 ++++ src/include/access/xact.h | 1 + 6 files changed, 378 insertions(+), 8 deletions(-) diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c index 192f8011160a9..06673017bcfaf 100644 --- a/contrib/postgres_fdw/connection.c +++ b/contrib/postgres_fdw/connection.c @@ -60,6 +60,7 @@ typedef struct ConnCacheEntry /* Remaining fields are invalid when conn is NULL: */ int xact_depth; /* 0 = no xact open, 1 = main xact open, 2 = * one level of subxact open, etc */ + bool xact_read_only; /* xact r/o state */ bool have_prep_stmt; /* have we prepared any stmts in this xact? */ bool have_error; /* have any subxacts aborted in this xact? 
*/ bool changing_xact_state; /* xact state change in process */ @@ -86,6 +87,12 @@ static unsigned int prep_stmt_number = 0; /* tracks whether any work is needed in callback functions */ static bool xact_got_connection = false; +/* + * tracks the topmost read-only local transaction's nesting level determined + * by GetTopReadOnlyTransactionNestLevel() + */ +static int read_only_level = 0; + /* custom wait event values, retrieved from shared memory */ static uint32 pgfdw_we_cleanup_result = 0; static uint32 pgfdw_we_connect = 0; @@ -378,6 +385,7 @@ make_new_connection(ConnCacheEntry *entry, UserMapping *user) /* Reset all transient state fields, to be sure all are clean */ entry->xact_depth = 0; + entry->xact_read_only = false; entry->have_prep_stmt = false; entry->have_error = false; entry->changing_xact_state = false; @@ -871,29 +879,106 @@ do_sql_command_end(PGconn *conn, const char *sql, bool consume_input) * those scans. A disadvantage is that we can't provide sane emulation of * READ COMMITTED behavior --- it would be nice if we had some other way to * control which remote queries share a snapshot. + * + * Note also that we always start the remote transaction with the same + * read/write and deferrable properties as the local transaction, and start + * the remote subtransaction with the same read/write property as the local + * subtransaction. */ static void begin_remote_xact(ConnCacheEntry *entry) { int curlevel = GetCurrentTransactionNestLevel(); - /* Start main transaction if we haven't yet */ + /* + * If the current local (sub)transaction is read-only, set the topmost + * read-only local transaction's nesting level if we haven't yet. + * + * Note: once it's set, it's retained until the topmost read-only local + * transaction is committed/aborted (see pgfdw_xact_callback and + * pgfdw_subxact_callback). 
+ */ + if (XactReadOnly) + { + if (read_only_level == 0) + read_only_level = GetTopReadOnlyTransactionNestLevel(); + Assert(read_only_level > 0); + } + else + Assert(read_only_level == 0); + + /* + * Start main transaction if we haven't yet; otherwise, change the current + * remote (sub)transaction's read/write mode if needed. + */ if (entry->xact_depth <= 0) { - const char *sql; + /* + * This is the case when we haven't yet started a main transaction. + */ + StringInfoData sql; + bool ro = (read_only_level == 1); elog(DEBUG3, "starting remote transaction on connection %p", entry->conn); + initStringInfo(&sql); + appendStringInfoString(&sql, "START TRANSACTION ISOLATION LEVEL "); if (IsolationIsSerializable()) - sql = "START TRANSACTION ISOLATION LEVEL SERIALIZABLE"; + appendStringInfoString(&sql, "SERIALIZABLE"); else - sql = "START TRANSACTION ISOLATION LEVEL REPEATABLE READ"; + appendStringInfoString(&sql, "REPEATABLE READ"); + if (ro) + appendStringInfoString(&sql, " READ ONLY"); + if (XactDeferrable) + appendStringInfoString(&sql, " DEFERRABLE"); entry->changing_xact_state = true; - do_sql_command(entry->conn, sql); + do_sql_command(entry->conn, sql.data); entry->xact_depth = 1; + if (ro) + { + Assert(!entry->xact_read_only); + entry->xact_read_only = true; + } entry->changing_xact_state = false; } + else if (!entry->xact_read_only) + { + /* + * The remote (sub)transaction has been opened in read-write mode. + */ + Assert(read_only_level == 0 || + entry->xact_depth <= read_only_level); + + /* + * If its nesting depth matches read_only_level, it means that the + * local read-write (sub)transaction that started it has changed to + * read-only after that; in which case change it to read-only as well. + * Otherwise, the local (sub)transaction is still read-write, so there + * is no need to do anything. 
+ */ + if (entry->xact_depth == read_only_level) + { + entry->changing_xact_state = true; + do_sql_command(entry->conn, "SET transaction_read_only = on"); + entry->xact_read_only = true; + entry->changing_xact_state = false; + } + } + else + { + /* + * The remote (sub)transaction has been opened in read-only mode. + */ + Assert(read_only_level > 0 && + entry->xact_depth >= read_only_level); + + /* + * The local read-only (sub)transaction that started it is guaranteed + * to be still read-only (see check_transaction_read_only), so there + * is no need to do anything. + */ + } /* * If we're in a subtransaction, stack up savepoints to match our level. @@ -902,12 +987,21 @@ begin_remote_xact(ConnCacheEntry *entry) */ while (entry->xact_depth < curlevel) { - char sql[64]; + StringInfoData sql; + bool ro = (entry->xact_depth + 1 == read_only_level); - snprintf(sql, sizeof(sql), "SAVEPOINT s%d", entry->xact_depth + 1); + initStringInfo(&sql); + appendStringInfo(&sql, "SAVEPOINT s%d", entry->xact_depth + 1); + if (ro) + appendStringInfoString(&sql, "; SET transaction_read_only = on"); entry->changing_xact_state = true; - do_sql_command(entry->conn, sql); + do_sql_command(entry->conn, sql.data); entry->xact_depth++; + if (ro) + { + Assert(!entry->xact_read_only); + entry->xact_read_only = true; + } entry->changing_xact_state = false; } } @@ -1212,6 +1306,9 @@ pgfdw_xact_callback(XactEvent event, void *arg) /* Also reset cursor numbering for next transaction */ cursor_number = 0; + + /* Likewise for read_only_level */ + read_only_level = 0; } /* @@ -1310,6 +1407,10 @@ pgfdw_subxact_callback(SubXactEvent event, SubTransactionId mySubid, false); } } + + /* If in read_only_level, reset it */ + if (curlevel == read_only_level) + read_only_level = 0; } /* @@ -1412,6 +1513,9 @@ pgfdw_reset_xact_state(ConnCacheEntry *entry, bool toplevel) /* Reset state to show we're out of a transaction */ entry->xact_depth = 0; + /* Reset xact r/o state */ + entry->xact_read_only = false; + /* * 
If the connection isn't in a good idle state, it is marked as * invalid or keep_connections option of its server is disabled, then @@ -1432,6 +1536,10 @@ pgfdw_reset_xact_state(ConnCacheEntry *entry, bool toplevel) { /* Reset state to show we're out of a subtransaction */ entry->xact_depth--; + + /* If in read_only_level, reset xact r/o state */ + if (entry->xact_depth + 1 == read_only_level) + entry->xact_read_only = false; } } diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index ac34a1acacb62..cd22553236f05 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -12575,6 +12575,142 @@ SELECT count(*) FROM remote_application_name DROP FOREIGN TABLE remote_application_name; DROP VIEW my_application_name; -- =================================================================== +-- test read-only and/or deferrable transactions +-- =================================================================== +CREATE TABLE loct (f1 int, f2 text); +CREATE FUNCTION locf() RETURNS SETOF loct LANGUAGE SQL AS + 'UPDATE public.loct SET f2 = f2 || f2 RETURNING *'; +CREATE VIEW locv AS SELECT t.* FROM locf() t; +CREATE FOREIGN TABLE remt (f1 int, f2 text) + SERVER loopback OPTIONS (table_name 'locv'); +CREATE FOREIGN TABLE remt2 (f1 int, f2 text) + SERVER loopback2 OPTIONS (table_name 'locv'); +INSERT INTO loct VALUES (1, 'foo'), (2, 'bar'); +START TRANSACTION READ ONLY; +SAVEPOINT s; +SELECT * FROM remt; -- should fail +ERROR: cannot execute UPDATE in a read-only transaction +CONTEXT: SQL function "locf" statement 1 +remote SQL command: SELECT f1, f2 FROM public.locv +ROLLBACK TO s; +RELEASE SAVEPOINT s; +SELECT * FROM remt; -- should fail +ERROR: cannot execute UPDATE in a read-only transaction +CONTEXT: SQL function "locf" statement 1 +remote SQL command: SELECT f1, f2 FROM public.locv +ROLLBACK; +START TRANSACTION; +SAVEPOINT s; +SET transaction_read_only = on; 
+SELECT * FROM remt; -- should fail +ERROR: cannot execute UPDATE in a read-only transaction +CONTEXT: SQL function "locf" statement 1 +remote SQL command: SELECT f1, f2 FROM public.locv +ROLLBACK TO s; +RELEASE SAVEPOINT s; +SET transaction_read_only = on; +SELECT * FROM remt; -- should fail +ERROR: cannot execute UPDATE in a read-only transaction +CONTEXT: SQL function "locf" statement 1 +remote SQL command: SELECT f1, f2 FROM public.locv +ROLLBACK; +START TRANSACTION; +SAVEPOINT s; +SELECT * FROM remt; -- should work + f1 | f2 +----+-------- + 1 | foofoo + 2 | barbar +(2 rows) + +SET transaction_read_only = on; +SELECT * FROM remt; -- should fail +ERROR: cannot execute UPDATE in a read-only transaction +CONTEXT: SQL function "locf" statement 1 +remote SQL command: SELECT f1, f2 FROM public.locv +ROLLBACK TO s; +RELEASE SAVEPOINT s; +SELECT * FROM remt; -- should work + f1 | f2 +----+-------- + 1 | foofoo + 2 | barbar +(2 rows) + +SET transaction_read_only = on; +SELECT * FROM remt; -- should fail +ERROR: cannot execute UPDATE in a read-only transaction +CONTEXT: SQL function "locf" statement 1 +remote SQL command: SELECT f1, f2 FROM public.locv +ROLLBACK; +-- Exercise abort code paths in pgfdw_xact_callback/pgfdw_subxact_callback +-- in situations where multiple connections are involved +START TRANSACTION; +SAVEPOINT s; +SELECT * FROM remt; -- should work + f1 | f2 +----+-------- + 1 | foofoo + 2 | barbar +(2 rows) + +SET transaction_read_only = on; +SELECT * FROM remt2; -- should fail +ERROR: cannot execute UPDATE in a read-only transaction +CONTEXT: SQL function "locf" statement 1 +remote SQL command: SELECT f1, f2 FROM public.locv +ROLLBACK TO s; +RELEASE SAVEPOINT s; +SELECT * FROM remt; -- should work + f1 | f2 +----+-------- + 1 | foofoo + 2 | barbar +(2 rows) + +SET transaction_read_only = on; +SELECT * FROM remt2; -- should fail +ERROR: cannot execute UPDATE in a read-only transaction +CONTEXT: SQL function "locf" statement 1 +remote SQL command: SELECT 
f1, f2 FROM public.locv +ROLLBACK; +DROP FOREIGN TABLE remt; +CREATE FOREIGN TABLE remt (f1 int, f2 text) + SERVER loopback OPTIONS (table_name 'loct'); +START TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY; +SELECT * FROM remt; + f1 | f2 +----+----- + 1 | foo + 2 | bar +(2 rows) + +COMMIT; +START TRANSACTION ISOLATION LEVEL SERIALIZABLE DEFERRABLE; +SELECT * FROM remt; + f1 | f2 +----+----- + 1 | foo + 2 | bar +(2 rows) + +COMMIT; +START TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; +SELECT * FROM remt; + f1 | f2 +----+----- + 1 | foo + 2 | bar +(2 rows) + +COMMIT; +-- Clean up +DROP FOREIGN TABLE remt; +DROP FOREIGN TABLE remt2; +DROP VIEW locv; +DROP FUNCTION locf(); +DROP TABLE loct; +-- =================================================================== -- test parallel commit and parallel abort -- =================================================================== ALTER SERVER loopback OPTIONS (ADD parallel_commit 'true'); diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index 0e218b29a29ed..59963e298b846 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -4328,6 +4328,86 @@ SELECT count(*) FROM remote_application_name DROP FOREIGN TABLE remote_application_name; DROP VIEW my_application_name; +-- =================================================================== +-- test read-only and/or deferrable transactions +-- =================================================================== +CREATE TABLE loct (f1 int, f2 text); +CREATE FUNCTION locf() RETURNS SETOF loct LANGUAGE SQL AS + 'UPDATE public.loct SET f2 = f2 || f2 RETURNING *'; +CREATE VIEW locv AS SELECT t.* FROM locf() t; +CREATE FOREIGN TABLE remt (f1 int, f2 text) + SERVER loopback OPTIONS (table_name 'locv'); +CREATE FOREIGN TABLE remt2 (f1 int, f2 text) + SERVER loopback2 OPTIONS (table_name 'locv'); +INSERT INTO loct VALUES (1, 'foo'), (2, 'bar'); + +START TRANSACTION READ 
ONLY; +SAVEPOINT s; +SELECT * FROM remt; -- should fail +ROLLBACK TO s; +RELEASE SAVEPOINT s; +SELECT * FROM remt; -- should fail +ROLLBACK; + +START TRANSACTION; +SAVEPOINT s; +SET transaction_read_only = on; +SELECT * FROM remt; -- should fail +ROLLBACK TO s; +RELEASE SAVEPOINT s; +SET transaction_read_only = on; +SELECT * FROM remt; -- should fail +ROLLBACK; + +START TRANSACTION; +SAVEPOINT s; +SELECT * FROM remt; -- should work +SET transaction_read_only = on; +SELECT * FROM remt; -- should fail +ROLLBACK TO s; +RELEASE SAVEPOINT s; +SELECT * FROM remt; -- should work +SET transaction_read_only = on; +SELECT * FROM remt; -- should fail +ROLLBACK; + +-- Exercise abort code paths in pgfdw_xact_callback/pgfdw_subxact_callback +-- in situations where multiple connections are involved +START TRANSACTION; +SAVEPOINT s; +SELECT * FROM remt; -- should work +SET transaction_read_only = on; +SELECT * FROM remt2; -- should fail +ROLLBACK TO s; +RELEASE SAVEPOINT s; +SELECT * FROM remt; -- should work +SET transaction_read_only = on; +SELECT * FROM remt2; -- should fail +ROLLBACK; + +DROP FOREIGN TABLE remt; +CREATE FOREIGN TABLE remt (f1 int, f2 text) + SERVER loopback OPTIONS (table_name 'loct'); + +START TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY; +SELECT * FROM remt; +COMMIT; + +START TRANSACTION ISOLATION LEVEL SERIALIZABLE DEFERRABLE; +SELECT * FROM remt; +COMMIT; + +START TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; +SELECT * FROM remt; +COMMIT; + +-- Clean up +DROP FOREIGN TABLE remt; +DROP FOREIGN TABLE remt2; +DROP VIEW locv; +DROP FUNCTION locf(); +DROP TABLE loct; + -- =================================================================== -- test parallel commit and parallel abort -- =================================================================== diff --git a/doc/src/sgml/postgres-fdw.sgml b/doc/src/sgml/postgres-fdw.sgml index de69ddcdebcc7..9185c76f93290 100644 --- a/doc/src/sgml/postgres-fdw.sgml +++ 
b/doc/src/sgml/postgres-fdw.sgml @@ -1103,6 +1103,23 @@ CREATE SUBSCRIPTION my_subscription SERVER subscription_server PUBLICATION testp PostgreSQL release might modify these rules. + + The remote transaction is opened in the same read/write mode as the local + transaction: if the local transaction is READ ONLY, + the remote transaction is opened in READ ONLY mode, + otherwise it is opened in READ WRITE mode. + (This rule is also applied to remote and local subtransactions.) + Note that this does not prevent login triggers executed on the remote + server from writing. + + + + The remote transaction is also opened in the same deferrable mode as the + local transaction: if the local transaction is DEFERRABLE, + the remote transaction is opened in DEFERRABLE mode, + otherwise it is opened in NOT DEFERRABLE mode. + + Note that it is currently not supported by postgres_fdw to prepare the remote transaction for diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index aafc53e016467..48bc90c967353 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -1046,6 +1046,34 @@ TransactionStartedDuringRecovery(void) return CurrentTransactionState->startedInRecovery; } +/* + * GetTopReadOnlyTransactionNestLevel + * + * Note: this will return zero when not inside any transaction or when neither + * a top-level transaction nor subtransactions are read-only, one when the + * top-level transaction is read-only, two when one level of subtransaction is + * read-only, etc. + * + * Note: subtransactions of the topmost read-only transaction are also + * read-only, because they inherit read-only mode from the transaction, and + * thus can't change to read-write mode (see check_transaction_read_only). 
+ */ +int +GetTopReadOnlyTransactionNestLevel(void) +{ + TransactionState s = CurrentTransactionState; + + if (!XactReadOnly) + return 0; + while (s->nestingLevel > 1) + { + if (!s->prevXactReadOnly) + return s->nestingLevel; + s = s->parent; + } + return s->nestingLevel; +} + /* * EnterParallelMode */ diff --git a/src/include/access/xact.h b/src/include/access/xact.h index f0b4d795071af..a8cbdf247c866 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -459,6 +459,7 @@ extern TimestampTz GetCurrentTransactionStopTimestamp(void); extern void SetCurrentStatementStartTimestamp(void); extern int GetCurrentTransactionNestLevel(void); extern bool TransactionIdIsCurrentTransactionId(TransactionId xid); +extern int GetTopReadOnlyTransactionNestLevel(void); extern void CommandCounterIncrement(void); extern void ForceSyncCommit(void); extern void StartTransactionCommand(void); From caec9d9fadf1b04741ac554470c46bc1f8e89d19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Sun, 5 Apr 2026 13:34:08 +0200 Subject: [PATCH 11/23] Allow index_create to suppress index_build progress reporting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A future REPACK patch wants a way to suppress index_build doing its progress reports when building an index, because that would interfere with repack's own reporting; so add an INDEX_CREATE_SUPPRESS_PROGRESS bit that enables this. Furthermore, change the index_create_copy() API so that it takes flag bits for index_create() and passes them unchanged. This gives its callers more direct control, which eases the interface -- now its callers can pass the INDEX_CREATE_SUPPRESS_PROGRESS bit directly. We use it for the current caller in REINDEX CONCURRENTLY, since it's also not interested in progress reporting, since it doesn't want index_build() to be called at all in the first place. 
One thing to keep in mind, pointed out by Mihail, is that we're not suppressing the index-AM-specific progress report updates which happen during ambuild(). At present this is not a problem, because the values updated by those don't overlap with those used by commands other than CREATE INDEX; but maybe in the future we'll want the ability to suppress them also. (Alternatively we might want to display how each index-build-subcommand progresses during REPACK and others.) Author: Antonin Houska Author: Álvaro Herrera Reviewed-by: Mihail Nikalayeu Discussion: https://postgr.es/m/102906.1773668762@localhost --- src/backend/bootstrap/bootstrap.c | 2 +- src/backend/catalog/heap.c | 3 ++- src/backend/catalog/index.c | 27 ++++++++++++++++----------- src/backend/commands/indexcmds.c | 4 +++- src/include/catalog/index.h | 6 ++++-- 5 files changed, 26 insertions(+), 16 deletions(-) diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index c52c0a6023ddf..ebd41176b9446 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -1184,7 +1184,7 @@ build_indices(void) heap = table_open(ILHead->il_heap, NoLock); ind = index_open(ILHead->il_ind, NoLock); - index_build(heap, ind, ILHead->il_info, false, false); + index_build(heap, ind, ILHead->il_info, false, false, false); index_close(ind, NoLock); table_close(heap, NoLock); diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 5748aa9a1a9af..ae6b7cda3ddfe 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -3570,7 +3570,8 @@ RelationTruncateIndexes(Relation heapRelation) /* Initialize the index and rebuild */ /* Note: we do not need to re-establish pkey setting */ - index_build(heapRelation, currentIndex, indexInfo, true, false); + index_build(heapRelation, currentIndex, indexInfo, true, false, + true); /* We're done with this index */ index_close(currentIndex, NoLock); diff --git a/src/backend/catalog/index.c 
b/src/backend/catalog/index.c index e418d67e8e422..9407c357f2716 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -715,6 +715,9 @@ UpdateIndexRelation(Oid indexoid, * already exists. * INDEX_CREATE_PARTITIONED: * create a partitioned index (table must be partitioned) + * INDEX_CREATE_SUPPRESS_PROGRESS: + * don't report progress during the index build. + * * constr_flags: flags passed to index_constraint_create * (only if INDEX_CREATE_ADD_CONSTRAINT is set) * allow_system_table_mods: allow table to be a system catalog @@ -760,6 +763,7 @@ index_create(Relation heapRelation, bool invalid = (flags & INDEX_CREATE_INVALID) != 0; bool concurrent = (flags & INDEX_CREATE_CONCURRENT) != 0; bool partitioned = (flags & INDEX_CREATE_PARTITIONED) != 0; + bool progress = (flags & INDEX_CREATE_SUPPRESS_PROGRESS) == 0; char relkind; TransactionId relfrozenxid; MultiXactId relminmxid; @@ -1276,7 +1280,8 @@ index_create(Relation heapRelation, } else { - index_build(heapRelation, indexRelation, indexInfo, false, true); + index_build(heapRelation, indexRelation, indexInfo, false, true, + progress); } /* @@ -1292,19 +1297,20 @@ index_create(Relation heapRelation, * index_create_copy * * Create an index based on the definition of the one provided by caller. The - * index is inserted into catalogs. If 'concurrently' is TRUE, it needs to be - * built later on; otherwise it's built immediately. + * index is inserted into catalogs. 'flags' are passed directly to + * index_create. * * "tablespaceOid" is the tablespace to use for this index. 
*/ Oid -index_create_copy(Relation heapRelation, bool concurrently, +index_create_copy(Relation heapRelation, uint16 flags, Oid oldIndexId, Oid tablespaceOid, const char *newName) { Relation indexRelation; IndexInfo *oldInfo, *newInfo; Oid newIndexId = InvalidOid; + bool concurrently = (flags & INDEX_CREATE_CONCURRENT) != 0; HeapTuple indexTuple, classTuple; Datum indclassDatum, @@ -1318,7 +1324,6 @@ index_create_copy(Relation heapRelation, bool concurrently, List *indexColNames = NIL; List *indexExprs = NIL; List *indexPreds = NIL; - int flags = 0; indexRelation = index_open(oldIndexId, RowExclusiveLock); @@ -1448,9 +1453,6 @@ index_create_copy(Relation heapRelation, bool concurrently, stattargets[i].isnull = isnull; } - if (concurrently) - flags = INDEX_CREATE_SKIP_BUILD | INDEX_CREATE_CONCURRENT; - /* * Now create the new index. * @@ -1538,7 +1540,7 @@ index_concurrently_build(Oid heapRelationId, indexInfo->ii_BrokenHotChain = false; /* Now build the index */ - index_build(heapRel, indexRelation, indexInfo, false, true); + index_build(heapRel, indexRelation, indexInfo, false, true, true); /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); @@ -3009,6 +3011,7 @@ index_update_stats(Relation rel, * * isreindex indicates we are recreating a previously-existing index. * parallel indicates if parallelism may be useful. + * progress indicates if the backend should update its progress info. * * Note: before Postgres 8.2, the passed-in heap and index Relations * were automatically closed by this routine. This is no longer the case. 
@@ -3019,7 +3022,8 @@ index_build(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo, bool isreindex, - bool parallel) + bool parallel, + bool progress) { IndexBuildResult *stats; Oid save_userid; @@ -3070,6 +3074,7 @@ index_build(Relation heapRelation, RestrictSearchPath(); /* Set up initial progress report status */ + if (progress) { const int progress_index[] = { PROGRESS_CREATEIDX_PHASE, @@ -3827,7 +3832,7 @@ reindex_index(const ReindexStmt *stmt, Oid indexId, /* Initialize the index and rebuild */ /* Note: we do not need to re-establish pkey setting */ - index_build(heapRelation, iRel, indexInfo, true, true); + index_build(heapRelation, iRel, indexInfo, true, true, progress); /* Re-allow use of target index */ ResetReindexProcessing(); diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index cba379810c779..9ab74c8df0a1b 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -3990,7 +3990,9 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein /* Create new index definition based on given index */ newIndexId = index_create_copy(heapRel, - true, + INDEX_CREATE_CONCURRENT | + INDEX_CREATE_SKIP_BUILD | + INDEX_CREATE_SUPPRESS_PROGRESS, idx->indexId, tablespaceid, concurrentName); diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index ed9e4c37d27a5..9aee822634781 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -71,6 +71,7 @@ extern void index_check_primary_key(Relation heapRel, #define INDEX_CREATE_IF_NOT_EXISTS (1 << 4) #define INDEX_CREATE_PARTITIONED (1 << 5) #define INDEX_CREATE_INVALID (1 << 6) +#define INDEX_CREATE_SUPPRESS_PROGRESS (1 << 7) extern Oid index_create(Relation heapRelation, const char *indexRelationName, @@ -101,7 +102,7 @@ extern Oid index_create(Relation heapRelation, #define INDEX_CONSTR_CREATE_REMOVE_OLD_DEPS (1 << 4) #define INDEX_CONSTR_CREATE_WITHOUT_OVERLAPS (1 << 5) -extern Oid 
index_create_copy(Relation heapRelation, bool concurrently, +extern Oid index_create_copy(Relation heapRelation, uint16 flags, Oid oldIndexId, Oid tablespaceOid, const char *newName); @@ -148,7 +149,8 @@ extern void index_build(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo, bool isreindex, - bool parallel); + bool parallel, + bool progress); extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot); From 9134b8275c44dfdabeee8d08649da1a4b5c75daa Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sat, 1 Mar 2025 19:31:30 -0800 Subject: [PATCH 12/23] instrumentation: Separate trigger logic from other uses Introduce TriggerInstrumentation to capture trigger timing and firings (previously counted in "ntuples"), to aid a future refactoring that splits out all Instrumentation fields beyond timing and WAL/buffers into more specific structs. Author: Lukas Fittl Reviewed-by: Discussion: --- src/backend/commands/explain.c | 19 ++++++++----------- src/backend/commands/trigger.c | 22 +++++++++++----------- src/backend/executor/execMain.c | 2 +- src/backend/executor/instrument.c | 26 ++++++++++++++++++++++++++ src/include/executor/instrument.h | 12 ++++++++++++ src/include/nodes/execnodes.h | 3 ++- src/tools/pgindent/typedefs.list | 1 + 7 files changed, 61 insertions(+), 24 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index e4b70166b0e50..eb6ef23c2d667 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1101,18 +1101,15 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) for (nt = 0; nt < rInfo->ri_TrigDesc->numtriggers; nt++) { Trigger *trig = rInfo->ri_TrigDesc->triggers + nt; - Instrumentation *instr = rInfo->ri_TrigInstrument + nt; + TriggerInstrumentation *tginstr = rInfo->ri_TrigInstrument + nt; char *relname; char *conname = NULL; - /* Must clean up instrumentation state */ - InstrEndLoop(instr); - /* * We ignore triggers that were never 
invoked; they likely aren't * relevant to the current query type. */ - if (instr->ntuples == 0) + if (tginstr->firings == 0) continue; ExplainOpenGroup("Trigger", NULL, true, es); @@ -1137,11 +1134,11 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) if (show_relname) appendStringInfo(es->str, " on %s", relname); if (es->timing) - appendStringInfo(es->str, ": time=%.3f calls=%.0f\n", - INSTR_TIME_GET_MILLISEC(instr->total), - instr->ntuples); + appendStringInfo(es->str, ": time=%.3f calls=%d\n", + INSTR_TIME_GET_MILLISEC(tginstr->instr.total), + tginstr->firings); else - appendStringInfo(es->str, ": calls=%.0f\n", instr->ntuples); + appendStringInfo(es->str, ": calls=%d\n", tginstr->firings); } else { @@ -1151,9 +1148,9 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) ExplainPropertyText("Relation", relname, es); if (es->timing) ExplainPropertyFloat("Time", "ms", - INSTR_TIME_GET_MILLISEC(instr->total), 3, + INSTR_TIME_GET_MILLISEC(tginstr->instr.total), 3, es); - ExplainPropertyFloat("Calls", NULL, instr->ntuples, 0, es); + ExplainPropertyInteger("Calls", NULL, tginstr->firings, es); } if (conname) diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 90e94fb8a5a4b..4d4e96a530236 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -92,7 +92,7 @@ static bool TriggerEnabled(EState *estate, ResultRelInfo *relinfo, static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata, int tgindx, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation *instr, MemoryContext per_tuple_context); static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, @@ -2311,7 +2311,7 @@ static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata, int tgindx, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation *instr, MemoryContext per_tuple_context) { LOCAL_FCINFO(fcinfo, 0); @@ -2346,7 +2346,7 @@ 
ExecCallTriggerFunc(TriggerData *trigdata, * If doing EXPLAIN ANALYZE, start charging time to this trigger. */ if (instr) - InstrStartNode(instr + tgindx); + InstrStartTrigger(instr + tgindx); /* * Do the function evaluation in the per-tuple memory context, so that @@ -2391,10 +2391,10 @@ ExecCallTriggerFunc(TriggerData *trigdata, /* * If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count - * one "tuple returned" (really the number of firings). + * the firing of the trigger. */ if (instr) - InstrStopNode(instr + tgindx, 1); + InstrStopTrigger(instr + tgindx, 1); return (HeapTuple) DatumGetPointer(result); } @@ -3947,7 +3947,7 @@ static void AfterTriggerExecute(EState *estate, ResultRelInfo *dst_relInfo, TriggerDesc *trigdesc, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation *instr, MemoryContext per_tuple_context, TupleTableSlot *trig_tuple_slot1, TupleTableSlot *trig_tuple_slot2); @@ -4342,7 +4342,7 @@ AfterTriggerExecute(EState *estate, ResultRelInfo *src_relInfo, ResultRelInfo *dst_relInfo, TriggerDesc *trigdesc, - FmgrInfo *finfo, Instrumentation *instr, + FmgrInfo *finfo, TriggerInstrumentation *instr, MemoryContext per_tuple_context, TupleTableSlot *trig_tuple_slot1, TupleTableSlot *trig_tuple_slot2) @@ -4383,7 +4383,7 @@ AfterTriggerExecute(EState *estate, * to include time spent re-fetching tuples in the trigger cost. */ if (instr) - InstrStartNode(instr + tgindx); + InstrStartTrigger(instr + tgindx); /* * Fetch the required tuple(s). @@ -4600,10 +4600,10 @@ AfterTriggerExecute(EState *estate, /* * If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count - * one "tuple returned" (really the number of firings). + * the firing of the trigger. 
*/ if (instr) - InstrStopNode(instr + tgindx, 1); + InstrStopTrigger(instr + tgindx, 1); } @@ -4719,7 +4719,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events, Relation rel = NULL; TriggerDesc *trigdesc = NULL; FmgrInfo *finfo = NULL; - Instrumentation *instr = NULL; + TriggerInstrumentation *instr = NULL; TupleTableSlot *slot1 = NULL, *slot2 = NULL; diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 45e00c6af85de..0237d8c3b1d8a 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -1285,7 +1285,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigWhenExprs = (ExprState **) palloc0_array(ExprState *, n); if (instrument_options) - resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options, false); + resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(n, instrument_options); } else { diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index a40610bc2522f..9354ad7be126c 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -196,6 +196,32 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add) WalUsageAdd(&dst->walusage, &add->walusage); } +/* Trigger instrumentation handling */ +TriggerInstrumentation * +InstrAllocTrigger(int n, int instrument_options) +{ + TriggerInstrumentation *tginstr = palloc0(n * sizeof(TriggerInstrumentation)); + int i; + + for (i = 0; i < n; i++) + InstrInit(&tginstr[i].instr, instrument_options); + + return tginstr; +} + +void +InstrStartTrigger(TriggerInstrumentation *tginstr) +{ + InstrStartNode(&tginstr->instr); +} + +void +InstrStopTrigger(TriggerInstrumentation *tginstr, int firings) +{ + InstrStopNode(&tginstr->instr, 0); + tginstr->firings += firings; +} + /* note current values during parallel executor startup */ void InstrStartParallelQuery(void) diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 
9759f3ea5d8d9..a9c2233227fdc 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -100,6 +100,13 @@ typedef struct WorkerInstrumentation Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; } WorkerInstrumentation; +typedef struct TriggerInstrumentation +{ + Instrumentation instr; + int firings; /* number of times the instrumented trigger + * was fired */ +} TriggerInstrumentation; + extern PGDLLIMPORT BufferUsage pgBufferUsage; extern PGDLLIMPORT WalUsage pgWalUsage; @@ -111,6 +118,11 @@ extern void InstrStopNode(Instrumentation *instr, double nTuples); extern void InstrUpdateTupleCount(Instrumentation *instr, double nTuples); extern void InstrEndLoop(Instrumentation *instr); extern void InstrAggNode(Instrumentation *dst, Instrumentation *add); + +extern TriggerInstrumentation *InstrAllocTrigger(int n, int instrument_options); +extern void InstrStartTrigger(TriggerInstrumentation *tginstr); +extern void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings); + extern void InstrStartParallelQuery(void); extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage); extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 090cfccf65fa0..908898aa7c9e5 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -60,6 +60,7 @@ typedef struct ScanKeyData ScanKeyData; typedef struct SnapshotData *Snapshot; typedef struct SortSupportData *SortSupport; typedef struct TIDBitmap TIDBitmap; +typedef struct TriggerInstrumentation TriggerInstrumentation; typedef struct TupleConversionMap TupleConversionMap; typedef struct TupleDescData *TupleDesc; typedef struct Tuplesortstate Tuplesortstate; @@ -552,7 +553,7 @@ typedef struct ResultRelInfo ExprState **ri_TrigWhenExprs; /* optional runtime measurements for triggers */ - Instrumentation *ri_TrigInstrument; + TriggerInstrumentation 
*ri_TrigInstrument; /* On-demand created slots for triggers / returning processing */ TupleTableSlot *ri_ReturningSlot; /* for trigger output tuples */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index c72f6c595730a..7ddf970fb97fc 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3213,6 +3213,7 @@ TriggerDesc TriggerEvent TriggerFlags TriggerInfo +TriggerInstrumentation TriggerTransition TruncateStmt TsmRoutine From bf620643238327dd6aa5192aa4940b5ff5791328 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sat, 7 Mar 2026 01:19:50 -0800 Subject: [PATCH 13/23] instrumentation: Separate per-node logic from other uses Previously different places (e.g. query "total time") were repurposing the Instrumentation struct initially introduced for capturing per-node statistics during execution. This overuse of the same struct is confusing, e.g. by cluttering calls of InstrStartNode/InstrStopNode in unrelated code paths, and prevents future refactorings. Instead, simplify the Instrumentation struct to only track time and WAL/buffer usage. Similarly, drop the use of InstrEndLoop outside of per-node instrumentation - these calls were added without any apparent benefit since the relevant fields were never read. Introduce the NodeInstrumentation struct to carry forward the per-node instrumentation information. WorkerInstrumentation is renamed to WorkerNodeInstrumentation for clarity. In passing, drop the "n" argument to InstrAlloc, as all remaining callers need exactly one Instrumentation struct. Additionally, clarify that InstrAggNode is expected to only run after InstrEndLoop (as it does in practice), and drop unused code. 
Author: Lukas Fittl Reviewed-by: Discussion: --- contrib/auto_explain/auto_explain.c | 8 +- .../pg_stat_statements/pg_stat_statements.c | 8 +- contrib/postgres_fdw/postgres_fdw.c | 2 +- src/backend/commands/explain.c | 20 +-- src/backend/executor/execMain.c | 8 +- src/backend/executor/execParallel.c | 24 +-- src/backend/executor/execProcnode.c | 4 +- src/backend/executor/instrument.c | 158 +++++++++++------- src/include/executor/instrument.h | 60 ++++--- src/include/nodes/execnodes.h | 9 +- src/tools/pgindent/typedefs.list | 3 +- 11 files changed, 179 insertions(+), 125 deletions(-) diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c index e856cd35a6f0f..39bf2543b701d 100644 --- a/contrib/auto_explain/auto_explain.c +++ b/contrib/auto_explain/auto_explain.c @@ -315,7 +315,7 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags) MemoryContext oldcxt; oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false); + queryDesc->totaltime = InstrAlloc(INSTRUMENT_ALL); MemoryContextSwitchTo(oldcxt); } } @@ -381,12 +381,6 @@ explain_ExecutorEnd(QueryDesc *queryDesc) */ oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - /* - * Make sure stats accumulation is done. (Note: it's okay if several - * levels of hook all do this.) - */ - InstrEndLoop(queryDesc->totaltime); - /* Log plan if duration is exceeded. 
*/ msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total); if (msec >= auto_explain_log_min_duration) diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 5494d41dca161..fbf32f0e72c29 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -1025,7 +1025,7 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags) MemoryContext oldcxt; oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false); + queryDesc->totaltime = InstrAlloc(INSTRUMENT_ALL); MemoryContextSwitchTo(oldcxt); } } @@ -1084,12 +1084,6 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) if (queryId != INT64CONST(0) && queryDesc->totaltime && pgss_enabled(nesting_level)) { - /* - * Make sure stats accumulation is done. (Note: it's okay if several - * levels of hook all do this.) - */ - InstrEndLoop(queryDesc->totaltime); - pgss_store(queryDesc->sourceText, queryId, queryDesc->plannedstmt->stmt_location, diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c index 41e47cc795ba8..cc8ec24c30eb0 100644 --- a/contrib/postgres_fdw/postgres_fdw.c +++ b/contrib/postgres_fdw/postgres_fdw.c @@ -2779,7 +2779,7 @@ postgresIterateDirectModify(ForeignScanState *node) if (!resultRelInfo->ri_projectReturning) { TupleTableSlot *slot = node->ss.ss_ScanTupleSlot; - Instrumentation *instr = node->ss.ps.instrument; + NodeInstrumentation *instr = node->ss.ps.instrument; Assert(!dmstate->has_returning); diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index eb6ef23c2d667..e73dc129132be 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1837,7 +1837,7 @@ ExplainNode(PlanState *planstate, List *ancestors, { double nloops = planstate->instrument->nloops; double startup_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->startup) / nloops; - double 
total_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->total) / nloops; + double total_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->instr.total) / nloops; double rows = planstate->instrument->ntuples / nloops; if (es->format == EXPLAIN_FORMAT_TEXT) @@ -1890,11 +1890,11 @@ ExplainNode(PlanState *planstate, List *ancestors, /* prepare per-worker general execution details */ if (es->workers_state && es->verbose) { - WorkerInstrumentation *w = planstate->worker_instrument; + WorkerNodeInstrumentation *w = planstate->worker_instrument; for (int n = 0; n < w->num_workers; n++) { - Instrumentation *instrument = &w->instrument[n]; + NodeInstrumentation *instrument = &w->instrument[n]; double nloops = instrument->nloops; double startup_ms; double total_ms; @@ -1903,7 +1903,7 @@ ExplainNode(PlanState *planstate, List *ancestors, if (nloops <= 0) continue; startup_ms = INSTR_TIME_GET_MILLISEC(instrument->startup) / nloops; - total_ms = INSTR_TIME_GET_MILLISEC(instrument->total) / nloops; + total_ms = INSTR_TIME_GET_MILLISEC(instrument->instr.total) / nloops; rows = instrument->ntuples / nloops; ExplainOpenWorker(n, es); @@ -2290,18 +2290,18 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->bufusage); + show_buffer_usage(es, &planstate->instrument->instr.bufusage); if (es->wal && planstate->instrument) - show_wal_usage(es, &planstate->instrument->walusage); + show_wal_usage(es, &planstate->instrument->instr.walusage); /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) { - WorkerInstrumentation *w = planstate->worker_instrument; + WorkerNodeInstrumentation *w = planstate->worker_instrument; for (int n = 0; n < w->num_workers; n++) { - Instrumentation *instrument = &w->instrument[n]; + NodeInstrumentation *instrument = &w->instrument[n]; double nloops = instrument->nloops; if (nloops <= 0) @@ 
-2309,9 +2309,9 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->bufusage); + show_buffer_usage(es, &instrument->instr.bufusage); if (es->wal) - show_wal_usage(es, &instrument->walusage); + show_wal_usage(es, &instrument->instr.walusage); ExplainCloseWorker(n, es); } } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 0237d8c3b1d8a..b0f636bf8b6c2 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -333,7 +333,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, /* Allow instrumentation of Executor overall runtime */ if (queryDesc->totaltime) - InstrStartNode(queryDesc->totaltime); + InstrStart(queryDesc->totaltime); /* * extract information from the query descriptor and the query feature. @@ -385,7 +385,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, dest->rShutdown(dest); if (queryDesc->totaltime) - InstrStopNode(queryDesc->totaltime, estate->es_processed); + InstrStop(queryDesc->totaltime); MemoryContextSwitchTo(oldcontext); } @@ -435,7 +435,7 @@ standard_ExecutorFinish(QueryDesc *queryDesc) /* Allow instrumentation of Executor overall runtime */ if (queryDesc->totaltime) - InstrStartNode(queryDesc->totaltime); + InstrStart(queryDesc->totaltime); /* Run ModifyTable nodes to completion */ ExecPostprocessPlan(estate); @@ -445,7 +445,7 @@ standard_ExecutorFinish(QueryDesc *queryDesc) AfterTriggerEndQuery(estate); if (queryDesc->totaltime) - InstrStopNode(queryDesc->totaltime, 0); + InstrStop(queryDesc->totaltime); MemoryContextSwitchTo(oldcontext); diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 755191b51ef66..78f60c1530ce1 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -87,7 +87,7 @@ typedef struct FixedParallelExecutorState * instrument_options: Same meaning here as in instrument.c. 
* * instrument_offset: Offset, relative to the start of this structure, - * of the first Instrumentation object. This will depend on the length of + * of the first NodeInstrumentation object. This will depend on the length of * the plan_node_id array. * * num_workers: Number of workers. @@ -104,11 +104,15 @@ struct SharedExecutorInstrumentation int num_workers; int num_plan_nodes; int plan_node_id[FLEXIBLE_ARRAY_MEMBER]; - /* array of num_plan_nodes * num_workers Instrumentation objects follows */ + + /* + * array of num_plan_nodes * num_workers NodeInstrumentation objects + * follows + */ }; #define GetInstrumentationArray(sei) \ (StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \ - (Instrumentation *) (((char *) sei) + sei->instrument_offset)) + (NodeInstrumentation *) (((char *) sei) + sei->instrument_offset)) /* Context object for ExecParallelEstimate. */ typedef struct ExecParallelEstimateContext @@ -731,7 +735,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, instrumentation_len = MAXALIGN(instrumentation_len); instrument_offset = instrumentation_len; instrumentation_len += - mul_size(sizeof(Instrumentation), + mul_size(sizeof(NodeInstrumentation), mul_size(e.nnodes, nworkers)); shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len); shm_toc_estimate_keys(&pcxt->estimator, 1); @@ -817,7 +821,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, */ if (estate->es_instrument) { - Instrumentation *instrument; + NodeInstrumentation *instrument; int i; instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len); @@ -827,7 +831,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, instrumentation->num_plan_nodes = e.nnodes; instrument = GetInstrumentationArray(instrumentation); for (i = 0; i < nworkers * e.nnodes; ++i) - InstrInit(&instrument[i], estate->es_instrument); + InstrInitNode(&instrument[i], estate->es_instrument); shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, 
instrumentation); pei->instrumentation = instrumentation; @@ -1059,7 +1063,7 @@ static bool ExecParallelRetrieveInstrumentation(PlanState *planstate, SharedExecutorInstrumentation *instrumentation) { - Instrumentation *instrument; + NodeInstrumentation *instrument; int i; int n; int ibytes; @@ -1087,9 +1091,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, * Switch into per-query memory context. */ oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt); - ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation)); + ibytes = mul_size(instrumentation->num_workers, sizeof(NodeInstrumentation)); planstate->worker_instrument = - palloc(ibytes + offsetof(WorkerInstrumentation, instrument)); + palloc(ibytes + offsetof(WorkerNodeInstrumentation, instrument)); MemoryContextSwitchTo(oldcontext); planstate->worker_instrument->num_workers = instrumentation->num_workers; @@ -1319,7 +1323,7 @@ ExecParallelReportInstrumentation(PlanState *planstate, { int i; int plan_node_id = planstate->plan->plan_node_id; - Instrumentation *instrument; + NodeInstrumentation *instrument; InstrEndLoop(planstate->instrument); diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index d35976925ae76..132fe37ef60f8 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -414,8 +414,8 @@ ExecInitNode(Plan *node, EState *estate, int eflags) /* Set up instrumentation for this node if requested */ if (estate->es_instrument) - result->instrument = InstrAlloc(1, estate->es_instrument, - result->async_capable); + result->instrument = InstrAllocNode(estate->es_instrument, + result->async_capable); return result; } diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 9354ad7be126c..e3d890a7f98d6 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -26,51 +26,31 @@ static void BufferUsageAdd(BufferUsage *dst, const BufferUsage 
*add); static void WalUsageAdd(WalUsage *dst, WalUsage *add); -/* Allocate new instrumentation structure(s) */ +/* General purpose instrumentation handling */ Instrumentation * -InstrAlloc(int n, int instrument_options, bool async_mode) +InstrAlloc(int instrument_options) { - Instrumentation *instr; - - /* initialize all fields to zeroes, then modify as needed */ - instr = palloc0(n * sizeof(Instrumentation)); - if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL)) - { - bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; - bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; - bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; - int i; - - for (i = 0; i < n; i++) - { - instr[i].need_bufusage = need_buffers; - instr[i].need_walusage = need_wal; - instr[i].need_timer = need_timer; - instr[i].async_mode = async_mode; - } - } + Instrumentation *instr = palloc0(sizeof(Instrumentation)); + InstrInitOptions(instr, instrument_options); return instr; } -/* Initialize a pre-allocated instrumentation structure. 
*/ void -InstrInit(Instrumentation *instr, int instrument_options) +InstrInitOptions(Instrumentation *instr, int instrument_options) { - memset(instr, 0, sizeof(Instrumentation)); instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0; instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0; instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; } -/* Entry to a plan node */ void -InstrStartNode(Instrumentation *instr) +InstrStart(Instrumentation *instr) { if (instr->need_timer) { if (!INSTR_TIME_IS_ZERO(instr->starttime)) - elog(ERROR, "InstrStartNode called twice in a row"); + elog(ERROR, "InstrStart called twice in a row"); else INSTR_TIME_SET_CURRENT(instr->starttime); } @@ -83,24 +63,19 @@ InstrStartNode(Instrumentation *instr) instr->walusage_start = pgWalUsage; } -/* Exit from a plan node */ void -InstrStopNode(Instrumentation *instr, double nTuples) +InstrStop(Instrumentation *instr) { - double save_tuplecount = instr->tuplecount; instr_time endtime; - /* count the returned tuples */ - instr->tuplecount += nTuples; - /* let's update the time only if the timer was requested */ if (instr->need_timer) { if (INSTR_TIME_IS_ZERO(instr->starttime)) - elog(ERROR, "InstrStopNode called without start"); + elog(ERROR, "InstrStop called without start"); INSTR_TIME_SET_CURRENT(endtime); - INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime); + INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime); INSTR_TIME_SET_ZERO(instr->starttime); } @@ -113,6 +88,74 @@ InstrStopNode(Instrumentation *instr, double nTuples) if (instr->need_walusage) WalUsageAccumDiff(&instr->walusage, &pgWalUsage, &instr->walusage_start); +} + +/* Node instrumentation handling */ + +/* Allocate new node instrumentation structure */ +NodeInstrumentation * +InstrAllocNode(int instrument_options, bool async_mode) +{ + NodeInstrumentation *instr = palloc(sizeof(NodeInstrumentation)); + + InstrInitNode(instr, instrument_options); + instr->async_mode 
= async_mode; + + return instr; +} + +/* Initialize a pre-allocated instrumentation structure. */ +void +InstrInitNode(NodeInstrumentation *instr, int instrument_options) +{ + memset(instr, 0, sizeof(NodeInstrumentation)); + InstrInitOptions(&instr->instr, instrument_options); +} + +/* Entry to a plan node */ +void +InstrStartNode(NodeInstrumentation *instr) +{ + InstrStart(&instr->instr); +} + +/* Exit from a plan node */ +void +InstrStopNode(NodeInstrumentation *instr, double nTuples) +{ + double save_tuplecount = instr->tuplecount; + instr_time endtime; + + /* count the returned tuples */ + instr->tuplecount += nTuples; + + /* + * Update the time only if the timer was requested. + * + * Note this is different from InstrStop because total is only updated in + * InstrEndLoop. We need the separate counter variable because we need to + * calculate start-up time for the first tuple in each cycle, and then + * accumulate it together. + */ + if (instr->instr.need_timer) + { + if (INSTR_TIME_IS_ZERO(instr->instr.starttime)) + elog(ERROR, "InstrStopNode called without start"); + + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->instr.starttime); + + INSTR_TIME_SET_ZERO(instr->instr.starttime); + } + + /* Add delta of buffer usage since entry to node's totals */ + if (instr->instr.need_bufusage) + BufferUsageAccumDiff(&instr->instr.bufusage, + &pgBufferUsage, &instr->instr.bufusage_start); + + if (instr->instr.need_walusage) + WalUsageAccumDiff(&instr->instr.walusage, + &pgWalUsage, &instr->instr.walusage_start); /* Is this the first tuple of this cycle? 
*/ if (!instr->running) @@ -133,7 +176,7 @@ InstrStopNode(Instrumentation *instr, double nTuples) /* Update tuple count */ void -InstrUpdateTupleCount(Instrumentation *instr, double nTuples) +InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples) { /* count the returned tuples */ instr->tuplecount += nTuples; @@ -141,47 +184,40 @@ InstrUpdateTupleCount(Instrumentation *instr, double nTuples) /* Finish a run cycle for a plan node */ void -InstrEndLoop(Instrumentation *instr) +InstrEndLoop(NodeInstrumentation *instr) { /* Skip if nothing has happened, or already shut down */ if (!instr->running) return; - if (!INSTR_TIME_IS_ZERO(instr->starttime)) + if (!INSTR_TIME_IS_ZERO(instr->instr.starttime)) elog(ERROR, "InstrEndLoop called on running node"); /* Accumulate per-cycle statistics into totals */ INSTR_TIME_ADD(instr->startup, instr->firsttuple); - INSTR_TIME_ADD(instr->total, instr->counter); + INSTR_TIME_ADD(instr->instr.total, instr->counter); instr->ntuples += instr->tuplecount; instr->nloops += 1; /* Reset for next cycle (if any) */ instr->running = false; - INSTR_TIME_SET_ZERO(instr->starttime); + INSTR_TIME_SET_ZERO(instr->instr.starttime); INSTR_TIME_SET_ZERO(instr->counter); INSTR_TIME_SET_ZERO(instr->firsttuple); instr->tuplecount = 0; } -/* aggregate instrumentation information */ +/* + * Aggregate instrumentation from parallel workers. Must be called after + * InstrEndLoop. 
+ */ void -InstrAggNode(Instrumentation *dst, Instrumentation *add) +InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add) { - if (!dst->running && add->running) - { - dst->running = true; - dst->firsttuple = add->firsttuple; - } - else if (dst->running && add->running && - INSTR_TIME_GT(dst->firsttuple, add->firsttuple)) - dst->firsttuple = add->firsttuple; - - INSTR_TIME_ADD(dst->counter, add->counter); + Assert(!add->running); - dst->tuplecount += add->tuplecount; INSTR_TIME_ADD(dst->startup, add->startup); - INSTR_TIME_ADD(dst->total, add->total); + INSTR_TIME_ADD(dst->instr.total, add->instr.total); dst->ntuples += add->ntuples; dst->ntuples2 += add->ntuples2; dst->nloops += add->nloops; @@ -189,11 +225,11 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add) dst->nfiltered2 += add->nfiltered2; /* Add delta of buffer usage since entry to node's totals */ - if (dst->need_bufusage) - BufferUsageAdd(&dst->bufusage, &add->bufusage); + if (dst->instr.need_bufusage) + BufferUsageAdd(&dst->instr.bufusage, &add->instr.bufusage); - if (dst->need_walusage) - WalUsageAdd(&dst->walusage, &add->walusage); + if (dst->instr.need_walusage) + WalUsageAdd(&dst->instr.walusage, &add->instr.walusage); } /* Trigger instrumentation handling */ @@ -204,7 +240,7 @@ InstrAllocTrigger(int n, int instrument_options) int i; for (i = 0; i < n; i++) - InstrInit(&tginstr[i].instr, instrument_options); + InstrInitOptions(&tginstr[i].instr, instrument_options); return tginstr; } @@ -212,13 +248,13 @@ InstrAllocTrigger(int n, int instrument_options) void InstrStartTrigger(TriggerInstrumentation *tginstr) { - InstrStartNode(&tginstr->instr); + InstrStart(&tginstr->instr); } void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings) { - InstrStopNode(&tginstr->instr, 0); + InstrStop(&tginstr->instr); tginstr->firings += firings; } diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index a9c2233227fdc..b11d64633b5f4 100644 --- 
a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -67,38 +67,55 @@ typedef enum InstrumentOption INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; +/* + * General purpose instrumentation that can capture time and WAL/buffer usage + * + * Initialized through InstrAlloc, followed by one or more calls to a pair of + * InstrStart/InstrStop (activity is measured in between). + */ typedef struct Instrumentation { - /* Parameters set at node creation: */ + /* Parameters set at creation: */ bool need_timer; /* true if we need timer data */ bool need_bufusage; /* true if we need buffer usage data */ bool need_walusage; /* true if we need WAL usage data */ + /* Internal state keeping: */ + instr_time starttime; /* start time of last InstrStart */ + BufferUsage bufusage_start; /* buffer usage at start */ + WalUsage walusage_start; /* WAL usage at start */ + /* Accumulated statistics: */ + instr_time total; /* total runtime */ + BufferUsage bufusage; /* total buffer usage */ + WalUsage walusage; /* total WAL usage */ +} Instrumentation; + +/* + * Specialized instrumentation for per-node execution statistics + */ +typedef struct NodeInstrumentation +{ + Instrumentation instr; + /* Parameters set at node creation: */ bool async_mode; /* true if node is in async mode */ /* Info about current plan cycle: */ bool running; /* true if we've completed first tuple */ - instr_time starttime; /* start time of current iteration of node */ instr_time counter; /* accumulated runtime for this node */ instr_time firsttuple; /* time for first tuple of this cycle */ double tuplecount; /* # of tuples emitted so far this cycle */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics across all completed cycles: */ instr_time startup; /* total startup time */ - instr_time total; /* total time */ double ntuples; /* total tuples produced */ double ntuples2; /* secondary node-specific tuple 
counter */ double nloops; /* # of run cycles for this node */ double nfiltered1; /* # of tuples removed by scanqual or joinqual */ double nfiltered2; /* # of tuples removed by "other" quals */ - BufferUsage bufusage; /* total buffer usage */ - WalUsage walusage; /* total WAL usage */ -} Instrumentation; +} NodeInstrumentation; -typedef struct WorkerInstrumentation +typedef struct WorkerNodeInstrumentation { int num_workers; /* # of structures that follow */ - Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; -} WorkerInstrumentation; + NodeInstrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; +} WorkerNodeInstrumentation; typedef struct TriggerInstrumentation { @@ -110,14 +127,19 @@ typedef struct TriggerInstrumentation extern PGDLLIMPORT BufferUsage pgBufferUsage; extern PGDLLIMPORT WalUsage pgWalUsage; -extern Instrumentation *InstrAlloc(int n, int instrument_options, - bool async_mode); -extern void InstrInit(Instrumentation *instr, int instrument_options); -extern void InstrStartNode(Instrumentation *instr); -extern void InstrStopNode(Instrumentation *instr, double nTuples); -extern void InstrUpdateTupleCount(Instrumentation *instr, double nTuples); -extern void InstrEndLoop(Instrumentation *instr); -extern void InstrAggNode(Instrumentation *dst, Instrumentation *add); +extern Instrumentation *InstrAlloc(int instrument_options); +extern void InstrInitOptions(Instrumentation *instr, int instrument_options); +extern void InstrStart(Instrumentation *instr); +extern void InstrStop(Instrumentation *instr); + +extern NodeInstrumentation *InstrAllocNode(int instrument_options, + bool async_mode); +extern void InstrInitNode(NodeInstrumentation *instr, int instrument_options); +extern void InstrStartNode(NodeInstrumentation *instr); +extern void InstrStopNode(NodeInstrumentation *instr, double nTuples); +extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples); +extern void InstrEndLoop(NodeInstrumentation *instr); +extern void 
InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add); extern TriggerInstrumentation *InstrAllocTrigger(int n, int instrument_options); extern void InstrStartTrigger(TriggerInstrumentation *tginstr); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 908898aa7c9e5..3ecae7552fc71 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -60,6 +60,7 @@ typedef struct ScanKeyData ScanKeyData; typedef struct SnapshotData *Snapshot; typedef struct SortSupportData *SortSupport; typedef struct TIDBitmap TIDBitmap; +typedef struct NodeInstrumentation NodeInstrumentation; typedef struct TriggerInstrumentation TriggerInstrumentation; typedef struct TupleConversionMap TupleConversionMap; typedef struct TupleDescData *TupleDesc; @@ -68,7 +69,7 @@ typedef struct Tuplestorestate Tuplestorestate; typedef struct TupleTableSlot TupleTableSlot; typedef struct TupleTableSlotOps TupleTableSlotOps; typedef struct WalUsage WalUsage; -typedef struct WorkerInstrumentation WorkerInstrumentation; +typedef struct WorkerNodeInstrumentation WorkerNodeInstrumentation; /* ---------------- @@ -1207,8 +1208,10 @@ typedef struct PlanState ExecProcNodeMtd ExecProcNodeReal; /* actual function, if above is a * wrapper */ - Instrumentation *instrument; /* Optional runtime stats for this node */ - WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */ + NodeInstrumentation *instrument; /* Optional runtime stats for this + * node */ + WorkerNodeInstrumentation *worker_instrument; /* per-worker + * instrumentation */ /* Per-worker JIT instrumentation */ struct SharedJitInstrumentation *worker_jit_instrument; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 7ddf970fb97fc..449acca8dc1a4 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1822,6 +1822,7 @@ NextSampleBlock_function NextSampleTuple_function NextValueExpr Node +NodeInstrumentation 
NodeTag NonEmptyRange NoneCompressorState @@ -3436,9 +3437,9 @@ WorkTableScan WorkTableScanState WorkerInfo WorkerInfoData -WorkerInstrumentation WorkerJobDumpPtrType WorkerJobRestorePtrType +WorkerNodeInstrumentation Working_State WriteBufPtrType WriteBytePtrType From 1b84f1215269f91f7eef02ca6237fb8355fba3da Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sun, 5 Apr 2026 05:08:23 -0700 Subject: [PATCH 14/23] instrumentation: Use Instrumentation instead of manual buffer tracking This replaces different repeated code blocks that read pgBufferUsage / pgWalUsage, and may have also been running a timer to measure activity, with the new Instrumentation struct and associated helpers. Author: Lukas Fittl Reviewed-by: Discussion: --- .../pg_stat_statements/pg_stat_statements.c | 67 +++++-------------- src/backend/access/heap/vacuumlazy.c | 15 ++--- src/backend/commands/analyze.c | 31 +++++---- src/backend/commands/explain.c | 44 ++++++------ src/backend/commands/explain_dr.c | 56 +++++++--------- src/backend/commands/prepare.c | 28 +++----- src/include/commands/explain_dr.h | 5 +- 7 files changed, 94 insertions(+), 152 deletions(-) diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index fbf32f0e72c29..63975706b87ab 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -911,22 +911,11 @@ pgss_planner(Query *parse, && pgss_track_planning && query_string && parse->queryId != INT64CONST(0)) { - instr_time start; - instr_time duration; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + Instrumentation instr = {0}; - /* We need to track buffer usage as the planner can access them. */ - bufusage_start = pgBufferUsage; - - /* - * Similarly the planner could write some WAL records in some cases - * (e.g. 
setting a hint bit with those being WAL-logged) - */ - walusage_start = pgWalUsage; - INSTR_TIME_SET_CURRENT(start); + /* Track time and buffer/WAL usage as the planner can access them. */ + InstrInitOptions(&instr, INSTRUMENT_ALL); + InstrStart(&instr); nesting_level++; PG_TRY(); @@ -940,30 +929,20 @@ pgss_planner(Query *parse, } PG_FINALLY(); { + InstrStop(&instr); nesting_level--; } PG_END_TRY(); - INSTR_TIME_SET_CURRENT(duration); - INSTR_TIME_SUBTRACT(duration, start); - - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(query_string, parse->queryId, parse->stmt_location, parse->stmt_len, PGSS_PLAN, - INSTR_TIME_GET_MILLISEC(duration), + INSTR_TIME_GET_MILLISEC(instr.total), 0, - &bufusage, - &walusage, + &instr.bufusage, + &instr.walusage, NULL, NULL, 0, @@ -1156,17 +1135,11 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, !IsA(parsetree, ExecuteStmt) && !IsA(parsetree, PrepareStmt)) { - instr_time start; - instr_time duration; uint64 rows; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + Instrumentation instr = {0}; - bufusage_start = pgBufferUsage; - walusage_start = pgWalUsage; - INSTR_TIME_SET_CURRENT(start); + InstrInitOptions(&instr, INSTRUMENT_ALL); + InstrStart(&instr); nesting_level++; PG_TRY(); @@ -1182,6 +1155,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, } PG_FINALLY(); { + InstrStop(&instr); nesting_level--; } PG_END_TRY(); @@ -1196,9 +1170,6 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, * former value, which'd otherwise be a good idea. 
*/ - INSTR_TIME_SET_CURRENT(duration); - INSTR_TIME_SUBTRACT(duration, start); - /* * Track the total number of rows retrieved or affected by the utility * statements of COPY, FETCH, CREATE TABLE AS, CREATE MATERIALIZED @@ -1210,23 +1181,15 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ? qc->nprocessed : 0; - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(queryString, saved_queryId, saved_stmt_location, saved_stmt_len, PGSS_EXEC, - INSTR_TIME_GET_MILLISEC(duration), + INSTR_TIME_GET_MILLISEC(instr.total), rows, - &bufusage, - &walusage, + &instr.bufusage, + &instr.walusage, NULL, NULL, 0, diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 88c71cd85b60b..30f589c9207dc 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -637,8 +637,7 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params, TimestampTz starttime = 0; PgStat_Counter startreadtime = 0, startwritetime = 0; - WalUsage startwalusage = pgWalUsage; - BufferUsage startbufferusage = pgBufferUsage; + Instrumentation *instr = NULL; ErrorContextCallback errcallback; char **indnames = NULL; Size dead_items_max_bytes = 0; @@ -654,6 +653,8 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params, startreadtime = pgStatBlockReadTime; startwritetime = pgStatBlockWriteTime; } + instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrStart(instr); } /* Used for instrumentation and stats report */ @@ -984,14 +985,14 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params, { TimestampTz endtime = GetCurrentTimestamp(); + InstrStop(instr); + if (verbose || 
params->log_vacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, params->log_vacuum_min_duration)) { long secs_dur; int usecs_dur; - WalUsage walusage; - BufferUsage bufferusage; StringInfoData buf; char *msgfmt; int32 diff; @@ -1000,12 +1001,10 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; + BufferUsage bufferusage = instr->bufusage; + WalUsage walusage = instr->walusage; TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur); - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); - memset(&bufferusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); total_blks_hit = bufferusage.shared_blks_hit + bufferusage.local_blks_hit; diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 49a5cdf579c16..8472fc0c28099 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -309,9 +309,7 @@ do_analyze_rel(Relation onerel, const VacuumParams *params, Oid save_userid; int save_sec_context; int save_nestlevel; - WalUsage startwalusage = pgWalUsage; - BufferUsage startbufferusage = pgBufferUsage; - BufferUsage bufferusage; + Instrumentation *instr = NULL; PgStat_Counter startreadtime = 0; PgStat_Counter startwritetime = 0; @@ -362,6 +360,9 @@ do_analyze_rel(Relation onerel, const VacuumParams *params, } pg_rusage_init(&ru0); + + instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrStart(instr); } /* Used for instrumentation and stats report */ @@ -742,12 +743,13 @@ do_analyze_rel(Relation onerel, const VacuumParams *params, { TimestampTz endtime = GetCurrentTimestamp(); + InstrStop(instr); + if (verbose || params->log_analyze_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, params->log_analyze_min_duration)) { long delay_in_ms; - WalUsage walusage; double read_rate = 0; double 
write_rate = 0; char *msgfmt; @@ -755,18 +757,15 @@ do_analyze_rel(Relation onerel, const VacuumParams *params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; - - memset(&bufferusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); - - total_blks_hit = bufferusage.shared_blks_hit + - bufferusage.local_blks_hit; - total_blks_read = bufferusage.shared_blks_read + - bufferusage.local_blks_read; - total_blks_dirtied = bufferusage.shared_blks_dirtied + - bufferusage.local_blks_dirtied; + BufferUsage bufusage = instr->bufusage; + WalUsage walusage = instr->walusage; + + total_blks_hit = bufusage.shared_blks_hit + + bufusage.local_blks_hit; + total_blks_read = bufusage.shared_blks_read + + bufusage.local_blks_read; + total_blks_dirtied = bufusage.shared_blks_dirtied + + bufusage.local_blks_dirtied; /* * We do not expect an analyze to take > 25 days and it simplifies diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index e73dc129132be..e7550a8ac4640 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -324,13 +324,16 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, QueryEnvironment *queryEnv) { PlannedStmt *plan; - instr_time planstart, - planduration; - BufferUsage bufusage_start, - bufusage; + Instrumentation plan_instr = {0}; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; + int instrument_options = INSTRUMENT_TIMER; + + if (es->buffers) + instrument_options |= INSTRUMENT_BUFFERS; + + InstrInitOptions(&plan_instr, instrument_options); if (es->memory) { @@ -348,15 +351,12 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - if (es->buffers) - bufusage_start = pgBufferUsage; - INSTR_TIME_SET_CURRENT(planstart); + 
InstrStart(&plan_instr); /* plan the query */ plan = pg_plan_query(query, queryString, cursorOptions, params, es); - INSTR_TIME_SET_CURRENT(planduration); - INSTR_TIME_SUBTRACT(planduration, planstart); + InstrStop(&plan_instr); if (es->memory) { @@ -364,16 +364,9 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, MemoryContextMemConsumed(planner_ctx, &mem_counters); } - /* calc differences of buffer counters. */ - if (es->buffers) - { - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - } - /* run it (if needed) and produce output */ ExplainOnePlan(plan, into, es, queryString, params, queryEnv, - &planduration, (es->buffers ? &bufusage : NULL), + &plan_instr.total, (es->buffers ? &plan_instr.bufusage : NULL), es->memory ? &mem_counters : NULL); } @@ -590,7 +583,12 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, /* grab serialization metrics before we destroy the DestReceiver */ if (es->serialize != EXPLAIN_SERIALIZE_NONE) - serializeMetrics = GetSerializationMetrics(dest); + { + SerializeMetrics *metrics = GetSerializationMetrics(dest); + + if (metrics) + memcpy(&serializeMetrics, metrics, sizeof(SerializeMetrics)); + } /* call the DestReceiver's destroy method even during explain */ dest->rDestroy(dest); @@ -1019,7 +1017,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) ExplainIndentText(es); if (es->timing) appendStringInfo(es->str, "Serialization: time=%.3f ms output=" UINT64_FORMAT "kB format=%s\n", - 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->timeSpent), + 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->instr.total), BYTES_TO_KILOBYTES(metrics->bytesSent), format); else @@ -1027,10 +1025,10 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) BYTES_TO_KILOBYTES(metrics->bytesSent), format); - if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage)) + if (es->buffers && peek_buffer_usage(es, &metrics->instr.bufusage)) { 
es->indent++; - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->instr.bufusage); es->indent--; } } @@ -1038,13 +1036,13 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) { if (es->timing) ExplainPropertyFloat("Time", "ms", - 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->timeSpent), + 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->instr.total), 3, es); ExplainPropertyUInteger("Output Volume", "kB", BYTES_TO_KILOBYTES(metrics->bytesSent), es); ExplainPropertyText("Format", format, es); if (es->buffers) - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->instr.bufusage); } ExplainCloseGroup("Serialization", "Serialization", true, es); diff --git a/src/backend/commands/explain_dr.c b/src/backend/commands/explain_dr.c index 3c96061cf32ab..34fe4f8f6dd51 100644 --- a/src/backend/commands/explain_dr.c +++ b/src/backend/commands/explain_dr.c @@ -110,15 +110,11 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) MemoryContext oldcontext; StringInfo buf = &myState->buf; int natts = typeinfo->natts; - instr_time start, - end; - BufferUsage instr_start; + Instrumentation *instr = &myState->metrics.instr; /* only measure time, buffers if requested */ - if (myState->es->timing) - INSTR_TIME_SET_CURRENT(start); - if (myState->es->buffers) - instr_start = pgBufferUsage; + if (instr->need_timer || instr->need_bufusage) + InstrStart(instr); /* Set or update my derived attribute info, if needed */ if (myState->attrinfo != typeinfo || myState->nattrs != natts) @@ -186,18 +182,9 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) MemoryContextSwitchTo(oldcontext); MemoryContextReset(myState->tmpcontext); - /* Update timing data */ - if (myState->es->timing) - { - INSTR_TIME_SET_CURRENT(end); - INSTR_TIME_ACCUM_DIFF(myState->metrics.timeSpent, end, start); - } - - /* Update buffer metrics */ - if (myState->es->buffers) - BufferUsageAccumDiff(&myState->metrics.bufferUsage, - 
&pgBufferUsage, - &instr_start); + /* Stop per-tuple measurement */ + if (instr->need_timer || instr->need_bufusage) + InstrStop(instr); return true; } @@ -233,9 +220,17 @@ serializeAnalyzeStartup(DestReceiver *self, int operation, TupleDesc typeinfo) /* The output buffer is re-used across rows, as in printtup.c */ initStringInfo(&receiver->buf); - /* Initialize results counters */ + /* Initialize metrics and per-tuple instrumentation */ memset(&receiver->metrics, 0, sizeof(SerializeMetrics)); - INSTR_TIME_SET_ZERO(receiver->metrics.timeSpent); + { + int instrument_options = 0; + + if (receiver->es->timing) + instrument_options |= INSTRUMENT_TIMER; + if (receiver->es->buffers) + instrument_options |= INSTRUMENT_BUFFERS; + InstrInitOptions(&receiver->metrics.instr, instrument_options); + } } /* @@ -290,22 +285,17 @@ CreateExplainSerializeDestReceiver(ExplainState *es) } /* - * GetSerializationMetrics - collect metrics + * GetSerializationMetrics - get serialization metrics * - * We have to be careful here since the receiver could be an IntoRel - * receiver if the subject statement is CREATE TABLE AS. In that - * case, return all-zeroes stats. + * Returns a pointer to the SerializeMetrics inside the dest receiver, + * or NULL if the receiver is not a SerializeDestReceiver (e.g. an IntoRel + * receiver for CREATE TABLE AS). 
*/ -SerializeMetrics +SerializeMetrics * GetSerializationMetrics(DestReceiver *dest) { - SerializeMetrics empty; - if (dest->mydest == DestExplainSerialize) - return ((SerializeDestReceiver *) dest)->metrics; - - memset(&empty, 0, sizeof(SerializeMetrics)); - INSTR_TIME_SET_ZERO(empty.timeSpent); + return &((SerializeDestReceiver *) dest)->metrics; - return empty; + return NULL; } diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 876aad2100aeb..bf9f2eb614997 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -22,6 +22,7 @@ #include "catalog/pg_type.h" #include "commands/createas.h" #include "commands/explain.h" +#include "executor/instrument.h" #include "commands/explain_format.h" #include "commands/explain_state.h" #include "commands/prepare.h" @@ -580,14 +581,17 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, ListCell *p; ParamListInfo paramLI = NULL; EState *estate = NULL; - instr_time planstart; - instr_time planduration; - BufferUsage bufusage_start, - bufusage; + Instrumentation plan_instr = {0}; + int instrument_options = INSTRUMENT_TIMER; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; + if (es->buffers) + instrument_options |= INSTRUMENT_BUFFERS; + + InstrInitOptions(&plan_instr, instrument_options); + if (es->memory) { /* See ExplainOneQuery about this */ @@ -598,9 +602,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - if (es->buffers) - bufusage_start = pgBufferUsage; - INSTR_TIME_SET_CURRENT(planstart); + InstrStart(&plan_instr); /* Look it up in the hash table */ entry = FetchPreparedStatement(execstmt->name, true); @@ -635,8 +637,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, cplan = GetCachedPlan(entry->plansource, paramLI, CurrentResourceOwner, pstate->p_queryEnv); - 
INSTR_TIME_SET_CURRENT(planduration); - INSTR_TIME_SUBTRACT(planduration, planstart); + InstrStop(&plan_instr); if (es->memory) { @@ -644,13 +645,6 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, MemoryContextMemConsumed(planner_ctx, &mem_counters); } - /* calc differences of buffer counters. */ - if (es->buffers) - { - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - } - plan_list = cplan->stmt_list; /* Explain each query */ @@ -660,7 +654,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, if (pstmt->commandType != CMD_UTILITY) ExplainOnePlan(pstmt, into, es, query_string, paramLI, pstate->p_queryEnv, - &planduration, (es->buffers ? &bufusage : NULL), + &plan_instr.total, (es->buffers ? &plan_instr.bufusage : NULL), es->memory ? &mem_counters : NULL); else ExplainOneUtility(pstmt->utilityStmt, into, es, pstate, paramLI); diff --git a/src/include/commands/explain_dr.h b/src/include/commands/explain_dr.h index f98eaae186457..ab5c53023e1e6 100644 --- a/src/include/commands/explain_dr.h +++ b/src/include/commands/explain_dr.h @@ -23,11 +23,10 @@ typedef struct ExplainState ExplainState; typedef struct SerializeMetrics { uint64 bytesSent; /* # of bytes serialized */ - instr_time timeSpent; /* time spent serializing */ - BufferUsage bufferUsage; /* buffers accessed during serialization */ + Instrumentation instr; /* time and buffer usage */ } SerializeMetrics; extern DestReceiver *CreateExplainSerializeDestReceiver(ExplainState *es); -extern SerializeMetrics GetSerializationMetrics(DestReceiver *dest); +extern SerializeMetrics *GetSerializationMetrics(DestReceiver *dest); #endif From 5fc2d2519634b6a13658f3ec81c205190dfcfff1 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Thu, 26 Mar 2026 23:31:04 -0700 Subject: [PATCH 15/23] instrumentation: Replace direct changes of pgBufferUsage/pgWalUsage with INSTR_* macros This encapsulates the 
ownership of these globals better, and will allow a subsequent refactoring. Author: Lukas Fittl Reviewed-by: Andres Freund Reviewed-by: Zsolt Parragi Discussion: https://www.postgresql.org/message-id/flat/CAP53PkzZ3UotnRrrnXWAv%3DF4avRq9MQ8zU%2BbxoN9tpovEu6fGQ%40mail.gmail.com#fc7140e8af21e07a90a09d7e76b300c4 --- src/backend/access/transam/xlog.c | 10 +++++----- src/backend/storage/buffer/bufmgr.c | 20 ++++++++++---------- src/backend/storage/buffer/localbuf.c | 6 +++--- src/backend/storage/file/buffile.c | 8 ++++---- src/backend/utils/activity/pgstat_io.c | 8 ++++---- src/include/executor/instrument.h | 19 +++++++++++++++++++ 6 files changed, 45 insertions(+), 26 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 9e8999bbb616f..71c9a26566236 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -1103,10 +1103,10 @@ XLogInsertRecord(XLogRecData *rdata, /* Report WAL traffic to the instrumentation. */ if (inserted) { - pgWalUsage.wal_bytes += rechdr->xl_tot_len; - pgWalUsage.wal_records++; - pgWalUsage.wal_fpi += num_fpi; - pgWalUsage.wal_fpi_bytes += fpi_bytes; + INSTR_WALUSAGE_ADD(wal_bytes, rechdr->xl_tot_len); + INSTR_WALUSAGE_INCR(wal_records); + INSTR_WALUSAGE_ADD(wal_fpi, num_fpi); + INSTR_WALUSAGE_ADD(wal_fpi_bytes, fpi_bytes); /* Required for the flush of pending stats WAL data */ pgstat_report_fixed = true; @@ -2085,7 +2085,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) WriteRqst.Flush = InvalidXLogRecPtr; XLogWrite(WriteRqst, tli, false); LWLockRelease(WALWriteLock); - pgWalUsage.wal_buffers_full++; + INSTR_WALUSAGE_INCR(wal_buffers_full); TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); /* diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 3cc0b0bdd929f..3e1c39160db0d 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -840,7 +840,7 @@ 
ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN { PinLocalBuffer(bufHdr, true); - pgBufferUsage.local_blks_hit++; + INSTR_BUFUSAGE_INCR(local_blks_hit); return true; } @@ -861,7 +861,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN { if (BufferTagsEqual(&tag, &bufHdr->tag)) { - pgBufferUsage.shared_blks_hit++; + INSTR_BUFUSAGE_INCR(shared_blks_hit); return true; } UnpinBuffer(bufHdr); @@ -1684,9 +1684,9 @@ TrackBufferHit(IOObject io_object, IOContext io_context, true); if (persistence == RELPERSISTENCE_TEMP) - pgBufferUsage.local_blks_hit += 1; + INSTR_BUFUSAGE_INCR(local_blks_hit); else - pgBufferUsage.shared_blks_hit += 1; + INSTR_BUFUSAGE_INCR(shared_blks_hit); pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0); @@ -2148,9 +2148,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) io_start, 1, io_buffers_len * BLCKSZ); if (persistence == RELPERSISTENCE_TEMP) - pgBufferUsage.local_blks_read += io_buffers_len; + INSTR_BUFUSAGE_ADD(local_blks_read, io_buffers_len); else - pgBufferUsage.shared_blks_read += io_buffers_len; + INSTR_BUFUSAGE_ADD(shared_blks_read, io_buffers_len); /* * Track vacuum cost when issuing IO, not after waiting for it. 
Otherwise @@ -3043,7 +3043,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, TerminateBufferIO(buf_hdr, false, BM_VALID, true, false); } - pgBufferUsage.shared_blks_written += extend_by; + INSTR_BUFUSAGE_ADD(shared_blks_written, extend_by); *extended_by = extend_by; @@ -3189,7 +3189,7 @@ MarkBufferDirty(Buffer buffer) */ if (!(old_buf_state & BM_DIRTY)) { - pgBufferUsage.shared_blks_dirtied++; + INSTR_BUFUSAGE_INCR(shared_blks_dirtied); if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } @@ -4601,7 +4601,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_WRITE, io_start, 1, BLCKSZ); - pgBufferUsage.shared_blks_written++; + INSTR_BUFUSAGE_INCR(shared_blks_written); /* * Mark the buffer as clean and end the BM_IO_IN_PROGRESS state. @@ -5796,7 +5796,7 @@ MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, UnlockBufHdr(bufHdr); } - pgBufferUsage.shared_blks_dirtied++; + INSTR_BUFUSAGE_INCR(shared_blks_dirtied); if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 396da84b25c55..851b99056d571 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -218,7 +218,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln) /* Mark not-dirty */ TerminateLocalBufferIO(bufHdr, true, 0, false); - pgBufferUsage.local_blks_written++; + INSTR_BUFUSAGE_INCR(local_blks_written); } static Buffer @@ -479,7 +479,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr, *extended_by = extend_by; - pgBufferUsage.local_blks_written += extend_by; + INSTR_BUFUSAGE_ADD(local_blks_written, extend_by); return first_block; } @@ -510,7 +510,7 @@ MarkLocalBufferDirty(Buffer buffer) buf_state = pg_atomic_read_u64(&bufHdr->state); if (!(buf_state & BM_DIRTY)) - pgBufferUsage.local_blks_dirtied++; + 
INSTR_BUFUSAGE_INCR(local_blks_dirtied); buf_state |= BM_DIRTY; diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index c4afe4d368a34..8b501dfcadd02 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -475,13 +475,13 @@ BufFileLoadBuffer(BufFile *file) if (track_io_timing) { INSTR_TIME_SET_CURRENT(io_time); - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_read_time, io_time, io_start); + INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_read_time, io_time, io_start); } /* we choose not to advance curOffset here */ if (file->nbytes > 0) - pgBufferUsage.temp_blks_read++; + INSTR_BUFUSAGE_INCR(temp_blks_read); } /* @@ -549,13 +549,13 @@ BufFileDumpBuffer(BufFile *file) if (track_io_timing) { INSTR_TIME_SET_CURRENT(io_time); - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_write_time, io_time, io_start); + INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_write_time, io_time, io_start); } file->curOffset += bytestowrite; wpos += bytestowrite; - pgBufferUsage.temp_blks_written++; + INSTR_BUFUSAGE_INCR(temp_blks_written); } file->dirty = false; diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index 2be26e9228361..e3829d7fe7cef 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -135,17 +135,17 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, { pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time)); if (io_object == IOOBJECT_RELATION) - INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(shared_blk_write_time, io_time); else if (io_object == IOOBJECT_TEMP_RELATION) - INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(local_blk_write_time, io_time); } else if (io_op == IOOP_READ) { pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time)); if (io_object == IOOBJECT_RELATION) - 
INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(shared_blk_read_time, io_time); else if (io_object == IOOBJECT_TEMP_RELATION) - INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(local_blk_read_time, io_time); } } diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index b11d64633b5f4..d4769f3da7bde 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -153,4 +153,23 @@ extern void BufferUsageAccumDiff(BufferUsage *dst, extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub); +#define INSTR_BUFUSAGE_INCR(fld) do { \ + pgBufferUsage.fld++; \ + } while(0) +#define INSTR_BUFUSAGE_ADD(fld,val) do { \ + pgBufferUsage.fld += (val); \ + } while(0) +#define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ + INSTR_TIME_ADD(pgBufferUsage.fld, val); \ + } while (0) +#define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ + INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \ + } while (0) +#define INSTR_WALUSAGE_INCR(fld) do { \ + pgWalUsage.fld++; \ + } while(0) +#define INSTR_WALUSAGE_ADD(fld,val) do { \ + pgWalUsage.fld += (val); \ + } while(0) + #endif /* INSTRUMENT_H */ From 11c9364bb33d6c6c7a8de9e26bc247e761cb5808 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sun, 5 Apr 2026 03:39:46 -0700 Subject: [PATCH 16/23] Parallel Bitmap Heap Scan: Fix EXPLAIN reporting of "Heap Blocks" Fix the missing accumulation of "Heap Blocks" from parallel query workers to the leader, causing EXPLAIN (ANALYZE) to only show the leader statistics, significantly undercounting the true value. Additionally, add a regression test covering EXPLAIN (ANALYZE) of a Parallel Bitmap Heap Scan, which previously was not tested at all. 
Author: Lukas Fittl Reviewed-by: Discussion --- src/backend/commands/explain.c | 33 +++++++++++++++++++++------ src/test/regress/expected/explain.out | 33 +++++++++++++++++++++++++++ src/test/regress/sql/explain.sql | 31 +++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 7 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index e7550a8ac4640..79bd4d9d69e34 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3919,26 +3919,45 @@ show_indexsearches_info(PlanState *planstate, ExplainState *es) static void show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es) { + uint64 exact_pages; + uint64 lossy_pages; + if (!es->analyze) return; + /* Start with leader's stats */ + exact_pages = planstate->stats.exact_pages; + lossy_pages = planstate->stats.lossy_pages; + + /* Accumulate worker stats into node-level totals */ + if (planstate->sinstrument != NULL) + { + for (int n = 0; n < planstate->sinstrument->num_workers; n++) + { + BitmapHeapScanInstrumentation *si = &planstate->sinstrument->sinstrument[n]; + + exact_pages += si->exact_pages; + lossy_pages += si->lossy_pages; + } + } + if (es->format != EXPLAIN_FORMAT_TEXT) { ExplainPropertyUInteger("Exact Heap Blocks", NULL, - planstate->stats.exact_pages, es); + exact_pages, es); ExplainPropertyUInteger("Lossy Heap Blocks", NULL, - planstate->stats.lossy_pages, es); + lossy_pages, es); } else { - if (planstate->stats.exact_pages > 0 || planstate->stats.lossy_pages > 0) + if (exact_pages > 0 || lossy_pages > 0) { ExplainIndentText(es); appendStringInfoString(es->str, "Heap Blocks:"); - if (planstate->stats.exact_pages > 0) - appendStringInfo(es->str, " exact=" UINT64_FORMAT, planstate->stats.exact_pages); - if (planstate->stats.lossy_pages > 0) - appendStringInfo(es->str, " lossy=" UINT64_FORMAT, planstate->stats.lossy_pages); + if (exact_pages > 0) + appendStringInfo(es->str, " exact=" UINT64_FORMAT, exact_pages); + if (lossy_pages 
> 0) + appendStringInfo(es->str, " lossy=" UINT64_FORMAT, lossy_pages); appendStringInfoChar(es->str, '\n'); } } diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out index 7c1f26b182cb0..58c5a512d74de 100644 --- a/src/test/regress/expected/explain.out +++ b/src/test/regress/expected/explain.out @@ -822,3 +822,36 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove (9 rows) reset work_mem; +-- Test parallel bitmap heap scan reports per-worker heap block stats. +CREATE FUNCTION check_parallel_bitmap_heap_scan() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; +BEGIN + SET LOCAL enable_seqscan = off; + SET LOCAL enable_indexscan = off; + SET LOCAL parallel_setup_cost = 0; + SET LOCAL parallel_tuple_cost = 0; + SET LOCAL min_parallel_table_scan_size = 0; + SET LOCAL min_parallel_index_scan_size = 0; + SET LOCAL max_parallel_workers_per_gather = 2; + SET LOCAL parallel_leader_participation = off; + + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1 WHERE hundred > 1' INTO plan_json; + + node := plan_json->0->'Plan'; + WHILE node->'Plans' IS NOT NULL AND node->>'Node Type' != 'Bitmap Heap Scan' LOOP + node := node->'Plans'->0; + END LOOP; + + RETURN COALESCE((node->>'Exact Heap Blocks')::int, 0) > 0; +END; +$$ LANGUAGE plpgsql; +SELECT check_parallel_bitmap_heap_scan() AS parallel_bitmap_instrumentation; + parallel_bitmap_instrumentation +--------------------------------- + t +(1 row) + +DROP FUNCTION check_parallel_bitmap_heap_scan; diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql index ebdab42604beb..bac97522053f6 100644 --- a/src/test/regress/sql/explain.sql +++ b/src/test/regress/sql/explain.sql @@ -188,3 +188,34 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove -- Test tuplestore storage usage in Window aggregate (memory and disk case, final result is disk) select 
explain_filter('explain (analyze,buffers off,costs off) select sum(n) over(partition by m) from (SELECT n < 3 as m, n from generate_series(1,2500) a(n))'); reset work_mem; + +-- Test parallel bitmap heap scan reports per-worker heap block stats. +CREATE FUNCTION check_parallel_bitmap_heap_scan() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; +BEGIN + SET LOCAL enable_seqscan = off; + SET LOCAL enable_indexscan = off; + SET LOCAL parallel_setup_cost = 0; + SET LOCAL parallel_tuple_cost = 0; + SET LOCAL min_parallel_table_scan_size = 0; + SET LOCAL min_parallel_index_scan_size = 0; + SET LOCAL max_parallel_workers_per_gather = 2; + SET LOCAL parallel_leader_participation = off; + + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1 WHERE hundred > 1' INTO plan_json; + + node := plan_json->0->'Plan'; + WHILE node->'Plans' IS NOT NULL AND node->>'Node Type' != 'Bitmap Heap Scan' LOOP + node := node->'Plans'->0; + END LOOP; + + RETURN COALESCE((node->>'Exact Heap Blocks')::int, 0) > 0; +END; +$$ LANGUAGE plpgsql; + +SELECT check_parallel_bitmap_heap_scan() AS parallel_bitmap_instrumentation; + +DROP FUNCTION check_parallel_bitmap_heap_scan; From eb3be81df13b3b7ded84db0019bb68105ce3163a Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sun, 5 Apr 2026 03:48:22 -0700 Subject: [PATCH 17/23] Add regression test coverage for EXPLAIN of Parallel Index Only Scans The functions dealing with copying back parallel worker instrumentation such as ExecIndexOnlyScanRetrieveInstrumentation were not exercised at all in the regression tests, leading to a gap in coverage. Add a query that verifies we correctly copy back "Index Searches" for EXPLAIN ANALYZE of a Parallel Index Only Scan. 
Reported-by: Andres Freund Author: Lukas Fittl Discussion: --- src/test/regress/expected/explain.out | 34 +++++++++++++++++++++++++++ src/test/regress/sql/explain.sql | 32 +++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out index 58c5a512d74de..b307e810ca561 100644 --- a/src/test/regress/expected/explain.out +++ b/src/test/regress/expected/explain.out @@ -855,3 +855,37 @@ SELECT check_parallel_bitmap_heap_scan() AS parallel_bitmap_instrumentation; (1 row) DROP FUNCTION check_parallel_bitmap_heap_scan; +-- Test parallel index-only scan reports per-worker index search stats. +CREATE FUNCTION check_parallel_indexonly_scan() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; +BEGIN + SET LOCAL enable_seqscan = off; + SET LOCAL enable_bitmapscan = off; + SET LOCAL parallel_setup_cost = 0; + SET LOCAL parallel_tuple_cost = 0; + SET LOCAL min_parallel_index_scan_size = 0; + SET LOCAL min_parallel_table_scan_size = 0; + SET LOCAL max_parallel_workers_per_gather = 2; + SET LOCAL parallel_leader_participation = off; + + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1 WHERE thousand > 95' INTO plan_json; + + -- Drill down to the Index Only Scan node + node := plan_json->0->'Plan'; + WHILE node->'Plans' IS NOT NULL AND node->>'Node Type' != 'Index Only Scan' LOOP + node := node->'Plans'->0; + END LOOP; + + RETURN COALESCE((node->>'Index Searches')::int, 0) > 0; +END; +$$ LANGUAGE plpgsql; +SELECT check_parallel_indexonly_scan() AS parallel_indexonly_instrumentation; + parallel_indexonly_instrumentation +------------------------------------ + t +(1 row) + +DROP FUNCTION check_parallel_indexonly_scan; diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql index bac97522053f6..3a13fa6ca69a4 100644 --- a/src/test/regress/sql/explain.sql +++ b/src/test/regress/sql/explain.sql @@ -219,3 +219,35 @@ $$ LANGUAGE 
plpgsql; SELECT check_parallel_bitmap_heap_scan() AS parallel_bitmap_instrumentation; DROP FUNCTION check_parallel_bitmap_heap_scan; + +-- Test parallel index-only scan reports per-worker index search stats. +CREATE FUNCTION check_parallel_indexonly_scan() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; +BEGIN + SET LOCAL enable_seqscan = off; + SET LOCAL enable_bitmapscan = off; + SET LOCAL parallel_setup_cost = 0; + SET LOCAL parallel_tuple_cost = 0; + SET LOCAL min_parallel_index_scan_size = 0; + SET LOCAL min_parallel_table_scan_size = 0; + SET LOCAL max_parallel_workers_per_gather = 2; + SET LOCAL parallel_leader_participation = off; + + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1 WHERE thousand > 95' INTO plan_json; + + -- Drill down to the Index Only Scan node + node := plan_json->0->'Plan'; + WHILE node->'Plans' IS NOT NULL AND node->>'Node Type' != 'Index Only Scan' LOOP + node := node->'Plans'->0; + END LOOP; + + RETURN COALESCE((node->>'Index Searches')::int, 0) > 0; +END; +$$ LANGUAGE plpgsql; + +SELECT check_parallel_indexonly_scan() AS parallel_indexonly_instrumentation; + +DROP FUNCTION check_parallel_indexonly_scan; From 73d3d43fe305871f62332098e414f9db84fc133c Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sun, 5 Apr 2026 03:48:32 -0700 Subject: [PATCH 18/23] instrumentation: Add additional regression tests covering buffer usage This adds regression tests that cover some of the expected behaviour around the buffer statistics reported in EXPLAIN ANALYZE, specifically how they behave in parallel query, nested function calls and abort situations. Testing this is challenging because there can be different sources of buffer activity, so we rely on temporary tables where we can to prove that activity was captured and not lost. This supports a future commit that will rework some of the instrumentation logic that could cause areas covered by these tests to fail. 
Author: Lukas Fittl Reviewed-by: Discussion: --- .../pg_stat_statements/expected/utility.out | 70 ++++++ contrib/pg_stat_statements/expected/wal.out | 48 ++++ contrib/pg_stat_statements/sql/utility.sql | 56 +++++ contrib/pg_stat_statements/sql/wal.sql | 33 +++ src/test/regress/expected/explain.out | 228 ++++++++++++++++++ src/test/regress/sql/explain.sql | 226 +++++++++++++++++ 6 files changed, 661 insertions(+) diff --git a/contrib/pg_stat_statements/expected/utility.out b/contrib/pg_stat_statements/expected/utility.out index e4d6564ea5b5a..cba487f6be582 100644 --- a/contrib/pg_stat_statements/expected/utility.out +++ b/contrib/pg_stat_statements/expected/utility.out @@ -289,6 +289,76 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; 1 | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (3 rows) +-- Buffer stats should flow through EXPLAIN ANALYZE +CREATE TEMP TABLE flow_through_test (a int, b char(200)); +INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; +CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$ +DECLARE +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test'; +END; +$$ LANGUAGE plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT run_explain_buffers_test(); + run_explain_buffers_test +-------------------------- + +(1 row) + +-- EXPLAIN entries should have non-zero buffer stats +SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats +FROM pg_stat_statements +WHERE query LIKE 'SELECT run_explain_buffers_test%' +ORDER BY query COLLATE "C"; + query | has_buffer_stats +-----------------------------------+------------------ + SELECT run_explain_buffers_test() | t +(1 row) + +DROP FUNCTION run_explain_buffers_test; +DROP TABLE flow_through_test; +-- Validate buffer/WAL counting during abort +SET pg_stat_statements.track = 'all'; +CREATE TEMP TABLE pgss_call_tab (a int, b char(20)); +CREATE 
TEMP TABLE pgss_call_tab2 (a int, b char(20)); +INSERT INTO pgss_call_tab VALUES (0, 'zzz'); +CREATE PROCEDURE pgss_call_rollback_proc() AS $$ +DECLARE + v int; +BEGIN + EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN +END; +$$ LANGUAGE plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +CALL pgss_call_rollback_proc(); +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_call_rollback_proc%' +ORDER BY query COLLATE "C"; + query | calls | local_hitread | wal_bytes_generated | wal_records_generated +--------------------------------+-------+---------------+---------------------+----------------------- + CALL pgss_call_rollback_proc() | 1 | t | t | t +(1 row) + +DROP TABLE pgss_call_tab2; +DROP TABLE pgss_call_tab; +DROP PROCEDURE pgss_call_rollback_proc; +SET pg_stat_statements.track = 'top'; -- CALL CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ DECLARE diff --git a/contrib/pg_stat_statements/expected/wal.out b/contrib/pg_stat_statements/expected/wal.out index 977e382d84894..611213daef6c2 100644 --- a/contrib/pg_stat_statements/expected/wal.out +++ b/contrib/pg_stat_statements/expected/wal.out @@ -28,3 +28,51 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t; t (1 row) +-- +-- Validate buffer/WAL counting with caught exception in PL/pgSQL +-- +CREATE TEMP TABLE pgss_error_tab (a int, b char(20)); +INSERT INTO pgss_error_tab VALUES (0, 'zzz'); +CREATE FUNCTION pgss_error_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- 
+ t +(1 row) + +SELECT pgss_error_func(); + pgss_error_func +----------------- + +(1 row) + +-- Buffer/WAL usage from the wCTE INSERT should survive the exception +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_error_func%' +ORDER BY query COLLATE "C"; + query | calls | local_hitread | wal_bytes_generated | wal_records_generated +--------------------------+-------+---------------+---------------------+----------------------- + SELECT pgss_error_func() | 1 | t | t | t +(1 row) + +DROP TABLE pgss_error_tab; +DROP FUNCTION pgss_error_func; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + diff --git a/contrib/pg_stat_statements/sql/utility.sql b/contrib/pg_stat_statements/sql/utility.sql index dd97203c21025..7540e49c73caf 100644 --- a/contrib/pg_stat_statements/sql/utility.sql +++ b/contrib/pg_stat_statements/sql/utility.sql @@ -152,6 +152,62 @@ EXPLAIN (costs off) SELECT a FROM generate_series(1,10) AS tab(a) WHERE a = 7; SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; +-- Buffer stats should flow through EXPLAIN ANALYZE +CREATE TEMP TABLE flow_through_test (a int, b char(200)); +INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; + +CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$ +DECLARE +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test'; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + +SELECT run_explain_buffers_test(); + +-- EXPLAIN entries should have non-zero buffer stats +SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats +FROM pg_stat_statements +WHERE query LIKE 'SELECT run_explain_buffers_test%' +ORDER BY query COLLATE "C"; + +DROP FUNCTION run_explain_buffers_test; +DROP TABLE flow_through_test; + +-- 
Validate buffer/WAL counting during abort +SET pg_stat_statements.track = 'all'; +CREATE TEMP TABLE pgss_call_tab (a int, b char(20)); +CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20)); +INSERT INTO pgss_call_tab VALUES (0, 'zzz'); + +CREATE PROCEDURE pgss_call_rollback_proc() AS $$ +DECLARE + v int; +BEGIN + EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN +END; +$$ LANGUAGE plpgsql; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +CALL pgss_call_rollback_proc(); + +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_call_rollback_proc%' +ORDER BY query COLLATE "C"; + +DROP TABLE pgss_call_tab2; +DROP TABLE pgss_call_tab; +DROP PROCEDURE pgss_call_rollback_proc; +SET pg_stat_statements.track = 'top'; + -- CALL CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ DECLARE diff --git a/contrib/pg_stat_statements/sql/wal.sql b/contrib/pg_stat_statements/sql/wal.sql index 1dc1552a81ebc..467e321b2062e 100644 --- a/contrib/pg_stat_statements/sql/wal.sql +++ b/contrib/pg_stat_statements/sql/wal.sql @@ -18,3 +18,36 @@ wal_records > 0 as wal_records_generated, wal_records >= rows as wal_records_ge_rows FROM pg_stat_statements ORDER BY query COLLATE "C"; SELECT pg_stat_statements_reset() IS NOT NULL AS t; + +-- +-- Validate buffer/WAL counting with caught exception in PL/pgSQL +-- +CREATE TEMP TABLE pgss_error_tab (a int, b char(20)); +INSERT INTO pgss_error_tab VALUES (0, 'zzz'); + +CREATE FUNCTION pgss_error_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +SELECT 
pgss_error_func(); + +-- Buffer/WAL usage from the wCTE INSERT should survive the exception +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_error_func%' +ORDER BY query COLLATE "C"; + +DROP TABLE pgss_error_tab; +DROP FUNCTION pgss_error_func; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out index b307e810ca561..f630acd5f54fc 100644 --- a/src/test/regress/expected/explain.out +++ b/src/test/regress/expected/explain.out @@ -889,3 +889,231 @@ SELECT check_parallel_indexonly_scan() AS parallel_indexonly_instrumentation; (1 row) DROP FUNCTION check_parallel_indexonly_scan; +-- Test parallel query reports similar buffer stats to a serial run +CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$ +DECLARE + plan_json json; + serial_buffers int; + parallel_buffers int; + node json; +BEGIN + -- Serial -- + SET LOCAL max_parallel_workers_per_gather = 0; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + serial_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + -- Parallel -- + SET LOCAL parallel_setup_cost = 0; + SET LOCAL parallel_tuple_cost = 0; + SET LOCAL min_parallel_table_scan_size = 0; + SET LOCAL max_parallel_workers_per_gather = 2; + SET LOCAL parallel_leader_participation = off; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + parallel_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1)); +END; 
+$$ LANGUAGE plpgsql; +SELECT * FROM check_parallel_explain_buffers(); + ratio +------- + 1 +(1 row) + +DROP FUNCTION check_parallel_explain_buffers; +-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL +-- EXCEPTION blocks, even after subtransaction rollback. +CREATE TEMP TABLE explain_exc_tab (a int, b char(20)); +INSERT INTO explain_exc_tab VALUES (0, 'zzz'); +CREATE FUNCTION explain_exc_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; +CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT explain_exc_func()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; +SELECT check_explain_exception_buffers() AS exception_buffers_visible; + exception_buffers_visible +--------------------------- + t +(1 row) + +-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation) +CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT check_explain_exception_buffers()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; +SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible; + exception_buffers_nested_visible +---------------------------------- + t +(1 row) + +DROP FUNCTION 
check_explain_exception_buffers_nested; +DROP FUNCTION check_explain_exception_buffers; +DROP FUNCTION explain_exc_func; +DROP TABLE explain_exc_tab; +-- Cursor instrumentation test. +-- Verify that buffer usage is correctly tracked through cursor execution paths. +-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun +-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down +-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor +-- scan should be correctly reported. +CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1; +CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$ +DECLARE + cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; +CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$ +DECLARE + cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; +CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$ +DECLARE + plan_json json; + node json; + direct_buf int; + noscroll_buf int; + scroll_buf int; +BEGIN + -- Direct scan: get leaf Seq Scan node buffers as baseline + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT * FROM cursor_buf_test' INTO plan_json; + node := plan_json->0->'Plan'; + WHILE node->'Plans' IS NOT NULL LOOP + node := node->'Plans'->0; + END LOOP; + direct_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + 
SELECT cursor_noscroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + noscroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Scrollable cursor path: ExecShutdownNode is skipped + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_scroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + scroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Both cursor paths should report buffer counts about as high as + -- the direct scan (same data plus minor catalog overhead), and not + -- double-counted (< 2x the direct scan) + RETURN QUERY SELECT + (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2), + (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2); +END; +$$ LANGUAGE plpgsql; +SELECT * FROM check_cursor_explain_buffers(); + noscroll_ok | scroll_ok +-------------+----------- + t | t +(1 row) + +DROP FUNCTION check_cursor_explain_buffers; +DROP FUNCTION cursor_noscroll_scan; +DROP FUNCTION cursor_scroll_scan; +DROP TABLE cursor_buf_test; +-- Test trigger instrumentation. 
+CREATE TEMP TABLE trig_test_tab (a int); +CREATE TEMP TABLE trig_work_tab (a int); +INSERT INTO trig_work_tab VALUES (1); +CREATE FUNCTION trig_test_func() RETURNS trigger AS $$ +BEGIN + PERFORM * FROM trig_work_tab; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; +CREATE TRIGGER trig_test_trig + BEFORE INSERT ON trig_test_tab + FOR EACH ROW EXECUTE FUNCTION trig_test_func(); +CREATE FUNCTION check_trigger_explain_buffers() RETURNS boolean AS $$ +DECLARE + plan_json json; + trig json; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + INSERT INTO trig_test_tab VALUES (1)' INTO plan_json; + trig := plan_json->0->'Triggers'->0; + RETURN COALESCE((trig->>'Calls')::int, 0) > 0; +END; +$$ LANGUAGE plpgsql; +SELECT check_trigger_explain_buffers() AS trigger_buffers_visible; + trigger_buffers_visible +------------------------- + t +(1 row) + +DROP FUNCTION check_trigger_explain_buffers; +DROP TRIGGER trig_test_trig ON trig_test_tab; +DROP FUNCTION trig_test_func; +DROP TABLE trig_test_tab; +DROP TABLE trig_work_tab; diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql index 3a13fa6ca69a4..74f605739f156 100644 --- a/src/test/regress/sql/explain.sql +++ b/src/test/regress/sql/explain.sql @@ -251,3 +251,229 @@ $$ LANGUAGE plpgsql; SELECT check_parallel_indexonly_scan() AS parallel_indexonly_instrumentation; DROP FUNCTION check_parallel_indexonly_scan; + +-- Test parallel query reports similar buffer stats to a serial run +CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$ +DECLARE + plan_json json; + serial_buffers int; + parallel_buffers int; + node json; +BEGIN + -- Serial -- + SET LOCAL max_parallel_workers_per_gather = 0; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + serial_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + -- Parallel 
-- + SET LOCAL parallel_setup_cost = 0; + SET LOCAL parallel_tuple_cost = 0; + SET LOCAL min_parallel_table_scan_size = 0; + SET LOCAL max_parallel_workers_per_gather = 2; + SET LOCAL parallel_leader_participation = off; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + parallel_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1)); +END; +$$ LANGUAGE plpgsql; + +SELECT * FROM check_parallel_explain_buffers(); + +DROP FUNCTION check_parallel_explain_buffers; + +-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL +-- EXCEPTION blocks, even after subtransaction rollback. +CREATE TEMP TABLE explain_exc_tab (a int, b char(20)); +INSERT INTO explain_exc_tab VALUES (0, 'zzz'); + +CREATE FUNCTION explain_exc_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; + +CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT explain_exc_func()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; + +SELECT check_explain_exception_buffers() AS exception_buffers_visible; + +-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation) +CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS 
OFF, FORMAT JSON) + SELECT check_explain_exception_buffers()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; + +SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible; + +DROP FUNCTION check_explain_exception_buffers_nested; +DROP FUNCTION check_explain_exception_buffers; +DROP FUNCTION explain_exc_func; +DROP TABLE explain_exc_tab; + +-- Cursor instrumentation test. +-- Verify that buffer usage is correctly tracked through cursor execution paths. +-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun +-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down +-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor +-- scan should be correctly reported. + +CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1; + +CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$ +DECLARE + cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; + +CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$ +DECLARE + cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; + +CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$ +DECLARE + plan_json json; + node json; + direct_buf int; + noscroll_buf int; + scroll_buf int; +BEGIN + -- Direct scan: get leaf Seq Scan node buffers as baseline + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT * FROM 
cursor_buf_test' INTO plan_json; + node := plan_json->0->'Plan'; + WHILE node->'Plans' IS NOT NULL LOOP + node := node->'Plans'->0; + END LOOP; + direct_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_noscroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + noscroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Scrollable cursor path: ExecShutdownNode is skipped + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_scroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + scroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Both cursor paths should report buffer counts about as high as + -- the direct scan (same data plus minor catalog overhead), and not + -- double-counted (< 2x the direct scan) + RETURN QUERY SELECT + (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2), + (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2); +END; +$$ LANGUAGE plpgsql; + +SELECT * FROM check_cursor_explain_buffers(); + +DROP FUNCTION check_cursor_explain_buffers; +DROP FUNCTION cursor_noscroll_scan; +DROP FUNCTION cursor_scroll_scan; +DROP TABLE cursor_buf_test; + +-- Test trigger instrumentation. 
+CREATE TEMP TABLE trig_test_tab (a int); +CREATE TEMP TABLE trig_work_tab (a int); +INSERT INTO trig_work_tab VALUES (1); + +CREATE FUNCTION trig_test_func() RETURNS trigger AS $$ +BEGIN + PERFORM * FROM trig_work_tab; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +CREATE TRIGGER trig_test_trig + BEFORE INSERT ON trig_test_tab + FOR EACH ROW EXECUTE FUNCTION trig_test_func(); + +CREATE FUNCTION check_trigger_explain_buffers() RETURNS boolean AS $$ +DECLARE + plan_json json; + trig json; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + INSERT INTO trig_test_tab VALUES (1)' INTO plan_json; + trig := plan_json->0->'Triggers'->0; + RETURN COALESCE((trig->>'Calls')::int, 0) > 0; +END; +$$ LANGUAGE plpgsql; + +SELECT check_trigger_explain_buffers() AS trigger_buffers_visible; + +DROP FUNCTION check_trigger_explain_buffers; +DROP TRIGGER trig_test_trig ON trig_test_tab; +DROP FUNCTION trig_test_func; +DROP TABLE trig_test_tab; +DROP TABLE trig_work_tab; From 9386bc74a560dd979b1d2b4484cdc1420ab86b39 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 9 Sep 2025 02:16:59 -0700 Subject: [PATCH 19/23] Optimize measuring WAL/buffer usage through stack-based instrumentation Previously, in order to determine the buffer/WAL usage of a given code section, we utilized continuously incrementing global counters that get updated when the actual activity (e.g. shared block read) occurred, and then calculated a diff when the code section ended. This resulted in a bottleneck for executor node instrumentation specifically, with the function BufferUsageAccumDiff showing up in profiles and in some cases adding up to 10% overhead to an EXPLAIN (ANALYZE, BUFFERS) run. Instead, introduce a stack-based mechanism, where the actual activity writes into the current stack entry. In the case of executor nodes, this means that each node gets its own stack entry that is pushed at InstrStartNode, and popped at InstrEndNode. 
Stack entries are zero initialized (avoiding the diff mechanism) and get added to their parent entry when they are finalized, i.e. no more modifications can occur. To correctly handle abort situations, any use of instrumentation stacks must involve either a top-level QueryInstrumentation struct and its associated InstrQueryStart/InstrQueryStop helpers (which use resource owners to handle aborts), or the Instrumentation struct itself with dedicated PG_TRY/PG_FINALLY calls that ensure the stack is in a consistent state after an abort. In tests, the stack-based instrumentation mechanism reduces the overhead of EXPLAIN (ANALYZE, BUFFERS ON, TIMING OFF) for a large COUNT(*) query from about 50% to 22% on top of the actual runtime. This also drops the global pgBufferUsage; any callers interested in measuring buffer activity should instead utilize InstrStart/InstrStop. The related global pgWalUsage is kept for now due to its use in pgstat to track aggregate WAL activity and heap_page_prune_and_freeze for measuring FPIs. 
Author: Lukas Fittl Reviewed-by: Zsolt Parragi Reviewed-by: Heikki Linnakangas Discussion: https://www.postgresql.org/message-id/flat/CAP53PkxrmpECzVFpeeEEHDGe6u625s%2BYkmVv5-gw3L_NDSfbiA%40mail.gmail.com#cb583a08e8e096aa1f093bb178906173 --- contrib/auto_explain/auto_explain.c | 16 +- .../pg_stat_statements/pg_stat_statements.c | 24 +- src/backend/access/brin/brin.c | 10 +- src/backend/access/gin/gininsert.c | 10 +- src/backend/access/heap/vacuumlazy.c | 12 +- src/backend/access/nbtree/nbtsort.c | 10 +- src/backend/commands/analyze.c | 12 +- src/backend/commands/explain.c | 10 +- src/backend/commands/explain_dr.c | 6 +- src/backend/commands/prepare.c | 10 +- src/backend/commands/tablecmds.c | 2 +- src/backend/commands/trigger.c | 17 +- src/backend/commands/vacuumparallel.c | 10 +- src/backend/executor/README.instrument | 237 +++++++++ src/backend/executor/execMain.c | 84 +++- src/backend/executor/execParallel.c | 36 +- src/backend/executor/execPartition.c | 2 +- src/backend/executor/execProcnode.c | 103 +++- src/backend/executor/execUtils.c | 11 +- src/backend/executor/instrument.c | 468 ++++++++++++++---- src/backend/replication/logical/worker.c | 2 +- src/backend/storage/buffer/bufmgr.c | 6 +- src/backend/utils/activity/pgstat_io.c | 6 +- src/include/executor/execdesc.h | 4 +- src/include/executor/executor.h | 5 +- src/include/executor/instrument.h | 201 +++++++- src/include/nodes/execnodes.h | 3 +- src/include/utils/resowner.h | 1 + src/tools/pgindent/typedefs.list | 2 + 29 files changed, 1084 insertions(+), 236 deletions(-) create mode 100644 src/backend/executor/README.instrument diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c index 39bf2543b701d..4be81489ff4fb 100644 --- a/contrib/auto_explain/auto_explain.c +++ b/contrib/auto_explain/auto_explain.c @@ -305,19 +305,9 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags) if (auto_explain_enabled()) { - /* - * Set up to track total elapsed time in ExecutorRun. 
Make sure the - * space is allocated in the per-query context so it will go away at - * ExecutorEnd. - */ + /* Set up to track total elapsed time in ExecutorRun. */ if (queryDesc->totaltime == NULL) - { - MemoryContext oldcxt; - - oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(INSTRUMENT_ALL); - MemoryContextSwitchTo(oldcxt); - } + queryDesc->totaltime = InstrQueryAlloc(INSTRUMENT_ALL); } } @@ -382,7 +372,7 @@ explain_ExecutorEnd(QueryDesc *queryDesc) oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); /* Log plan if duration is exceeded. */ - msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total); + msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->instr.total); if (msec >= auto_explain_log_min_duration) { ExplainState *es = NewExplainState(); diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 63975706b87ab..78f1518c94084 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -929,7 +929,7 @@ pgss_planner(Query *parse, } PG_FINALLY(); { - InstrStop(&instr); + InstrStopFinalize(&instr); nesting_level--; } PG_END_TRY(); @@ -994,19 +994,9 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags) */ if (pgss_enabled(nesting_level) && queryDesc->plannedstmt->queryId != INT64CONST(0)) { - /* - * Set up to track total elapsed time in ExecutorRun. Make sure the - * space is allocated in the per-query context so it will go away at - * ExecutorEnd. - */ + /* Set up to track total elapsed time in ExecutorRun. 
*/ if (queryDesc->totaltime == NULL) - { - MemoryContext oldcxt; - - oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(INSTRUMENT_ALL); - MemoryContextSwitchTo(oldcxt); - } + queryDesc->totaltime = InstrQueryAlloc(INSTRUMENT_ALL); } } @@ -1068,10 +1058,10 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) queryDesc->plannedstmt->stmt_location, queryDesc->plannedstmt->stmt_len, PGSS_EXEC, - INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total), + INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->instr.total), queryDesc->estate->es_total_processed, - &queryDesc->totaltime->bufusage, - &queryDesc->totaltime->walusage, + &queryDesc->totaltime->instr.bufusage, + &queryDesc->totaltime->instr.walusage, queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL, NULL, queryDesc->estate->es_parallel_workers_to_launch, @@ -1155,7 +1145,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, } PG_FINALLY(); { - InstrStop(&instr); + InstrStopFinalize(&instr); nesting_level--; } PG_END_TRY(); diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index bdb30752e098c..3a5176c76c765 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -2434,8 +2434,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * and PARALLEL_KEY_BUFFER_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgWalUsage or - * pgBufferUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. 
*/ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(WalUsage), pcxt->nworkers)); @@ -2887,6 +2887,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + QueryInstrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2936,7 +2937,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2951,7 +2952,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 9d83a4957757b..0d80f72a0b085 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -991,8 +991,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * and PARALLEL_KEY_BUFFER_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgWalUsage or - * pgBufferUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. 
*/ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(WalUsage), pcxt->nworkers)); @@ -2118,6 +2118,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + QueryInstrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2186,7 +2187,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2201,7 +2202,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 30f589c9207dc..291d9d67bc295 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -637,7 +637,7 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params, TimestampTz starttime = 0; PgStat_Counter startreadtime = 0, startwritetime = 0; - Instrumentation *instr = NULL; + QueryInstrumentation *instr = NULL; ErrorContextCallback errcallback; char **indnames = NULL; Size dead_items_max_bytes = 0; @@ -653,8 +653,8 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params, startreadtime = pgStatBlockReadTime; startwritetime = pgStatBlockWriteTime; } - instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); - InstrStart(instr); + instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrQueryStart(instr); } /* Used for 
instrumentation and stats report */ @@ -985,7 +985,7 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params, { TimestampTz endtime = GetCurrentTimestamp(); - InstrStop(instr); + InstrQueryStopFinalize(instr); if (verbose || params->log_vacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, @@ -1001,8 +1001,8 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; - BufferUsage bufferusage = instr->bufusage; - WalUsage walusage = instr->walusage; + BufferUsage bufferusage = instr->instr.bufusage; + WalUsage walusage = instr->instr.walusage; TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur); diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 756dfa3dcf47e..2d7b7cef91202 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1466,8 +1466,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * and PARALLEL_KEY_BUFFER_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgWalUsage or - * pgBufferUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. 
*/ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(WalUsage), pcxt->nworkers)); @@ -1753,6 +1753,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + QueryInstrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -1828,7 +1829,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) } /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Perform sorting of spool, and possibly a spool2 */ sortmem = maintenance_work_mem / btshared->scantuplesortstates; @@ -1838,7 +1839,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); #ifdef BTREE_BUILD_STATS diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 8472fc0c28099..10f8a2dc81cd5 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -309,7 +309,7 @@ do_analyze_rel(Relation onerel, const VacuumParams *params, Oid save_userid; int save_sec_context; int save_nestlevel; - Instrumentation *instr = NULL; + QueryInstrumentation *instr = NULL; PgStat_Counter startreadtime = 0; PgStat_Counter startwritetime = 0; @@ -361,8 +361,8 @@ do_analyze_rel(Relation onerel, const VacuumParams *params, pg_rusage_init(&ru0); - instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); - InstrStart(instr); + instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrQueryStart(instr); } /* Used for instrumentation and stats report */ @@ -743,7 +743,7 @@ do_analyze_rel(Relation onerel, const VacuumParams *params, { TimestampTz endtime 
= GetCurrentTimestamp(); - InstrStop(instr); + InstrQueryStopFinalize(instr); if (verbose || params->log_analyze_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, @@ -757,8 +757,8 @@ do_analyze_rel(Relation onerel, const VacuumParams *params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; - BufferUsage bufusage = instr->bufusage; - WalUsage walusage = instr->walusage; + BufferUsage bufusage = instr->instr.bufusage; + WalUsage walusage = instr->instr.walusage; total_blks_hit = bufusage.shared_blks_hit + bufusage.local_blks_hit; diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 79bd4d9d69e34..9fc39cabdf815 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -324,7 +324,7 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, QueryEnvironment *queryEnv) { PlannedStmt *plan; - Instrumentation plan_instr = {0}; + QueryInstrumentation *plan_instr = NULL; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; @@ -333,7 +333,7 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, if (es->buffers) instrument_options |= INSTRUMENT_BUFFERS; - InstrInitOptions(&plan_instr, instrument_options); + plan_instr = InstrQueryAlloc(instrument_options); if (es->memory) { @@ -351,12 +351,12 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - InstrStart(&plan_instr); + InstrQueryStart(plan_instr); /* plan the query */ plan = pg_plan_query(query, queryString, cursorOptions, params, es); - InstrStop(&plan_instr); + InstrQueryStopFinalize(plan_instr); if (es->memory) { @@ -366,7 +366,7 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, /* run it (if needed) and produce output */ ExplainOnePlan(plan, into, es, queryString, params, queryEnv, - &plan_instr.total, (es->buffers ? &plan_instr.bufusage : NULL), + &plan_instr->instr.total, (es->buffers ? 
&plan_instr->instr.bufusage : NULL), es->memory ? &mem_counters : NULL); } diff --git a/src/backend/commands/explain_dr.c b/src/backend/commands/explain_dr.c index 34fe4f8f6dd51..9c1b30fb75b73 100644 --- a/src/backend/commands/explain_dr.c +++ b/src/backend/commands/explain_dr.c @@ -113,7 +113,7 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) Instrumentation *instr = &myState->metrics.instr; /* only measure time, buffers if requested */ - if (instr->need_timer || instr->need_bufusage) + if (instr->need_timer || instr->need_stack) InstrStart(instr); /* Set or update my derived attribute info, if needed */ @@ -183,7 +183,7 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) MemoryContextReset(myState->tmpcontext); /* Stop per-tuple measurement */ - if (instr->need_timer || instr->need_bufusage) + if (instr->need_timer || instr->need_stack) InstrStop(instr); return true; @@ -241,6 +241,8 @@ serializeAnalyzeShutdown(DestReceiver *self) { SerializeDestReceiver *receiver = (SerializeDestReceiver *) self; + InstrFinalizeChild(&receiver->metrics.instr, instr_stack.current); + if (receiver->finfos) pfree(receiver->finfos); receiver->finfos = NULL; diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index bf9f2eb614997..ee8113575882e 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -581,7 +581,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, ListCell *p; ParamListInfo paramLI = NULL; EState *estate = NULL; - Instrumentation plan_instr = {0}; + QueryInstrumentation *plan_instr = NULL; int instrument_options = INSTRUMENT_TIMER; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; @@ -590,7 +590,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, if (es->buffers) instrument_options |= INSTRUMENT_BUFFERS; - InstrInitOptions(&plan_instr, instrument_options); + plan_instr = 
InstrQueryAlloc(instrument_options); if (es->memory) { @@ -602,7 +602,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - InstrStart(&plan_instr); + InstrQueryStart(plan_instr); /* Look it up in the hash table */ entry = FetchPreparedStatement(execstmt->name, true); @@ -637,7 +637,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, cplan = GetCachedPlan(entry->plansource, paramLI, CurrentResourceOwner, pstate->p_queryEnv); - InstrStop(&plan_instr); + InstrQueryStopFinalize(plan_instr); if (es->memory) { @@ -654,7 +654,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, if (pstmt->commandType != CMD_UTILITY) ExplainOnePlan(pstmt, into, es, query_string, paramLI, pstate->p_queryEnv, - &plan_instr.total, (es->buffers ? &plan_instr.bufusage : NULL), + &plan_instr->instr.total, (es->buffers ? &plan_instr->instr.bufusage : NULL), es->memory ? &mem_counters : NULL); else ExplainOneUtility(pstmt->utilityStmt, into, es, pstate, paramLI); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 0ce2e81f9c2f2..f72c1ac521a3e 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -2139,7 +2139,7 @@ ExecuteTruncateGuts(List *explicit_rels, rel, 0, /* dummy rangetable index */ NULL, - 0); + NULL); estate->es_opened_result_relations = lappend(estate->es_opened_result_relations, resultRelInfo); resultRelInfo++; diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 4d4e96a530236..b8b8840345bd2 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -93,6 +93,7 @@ static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata, int tgindx, FmgrInfo *finfo, TriggerInstrumentation *instr, + QueryInstrumentation *qinstr, MemoryContext per_tuple_context); static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, 
ResultRelInfo *src_partinfo, @@ -2312,6 +2313,7 @@ ExecCallTriggerFunc(TriggerData *trigdata, int tgindx, FmgrInfo *finfo, TriggerInstrumentation *instr, + QueryInstrumentation *qinstr, MemoryContext per_tuple_context) { LOCAL_FCINFO(fcinfo, 0); @@ -2346,7 +2348,7 @@ ExecCallTriggerFunc(TriggerData *trigdata, * If doing EXPLAIN ANALYZE, start charging time to this trigger. */ if (instr) - InstrStartTrigger(instr + tgindx); + InstrStartTrigger(qinstr, instr + tgindx); /* * Do the function evaluation in the per-tuple memory context, so that @@ -2441,6 +2443,7 @@ ExecBSInsertTriggers(EState *estate, ResultRelInfo *relinfo) i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple) @@ -2502,6 +2505,7 @@ ExecBRInsertTriggers(EState *estate, ResultRelInfo *relinfo, i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple == NULL) { @@ -2606,6 +2610,7 @@ ExecIRInsertTriggers(EState *estate, ResultRelInfo *relinfo, i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple == NULL) { @@ -2670,6 +2675,7 @@ ExecBSDeleteTriggers(EState *estate, ResultRelInfo *relinfo) i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple) @@ -2780,6 +2786,7 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple == NULL) { @@ -2884,6 +2891,7 @@ ExecIRDeleteTriggers(EState *estate, ResultRelInfo *relinfo, i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (rettuple == NULL) return false; /* Delete was suppressed */ @@ -2942,6 +2950,7 @@ ExecBSUpdateTriggers(EState *estate, ResultRelInfo *relinfo) i, 
relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple) @@ -3094,6 +3103,7 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple == NULL) @@ -3258,6 +3268,7 @@ ExecIRUpdateTriggers(EState *estate, ResultRelInfo *relinfo, i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple == NULL) { @@ -3316,6 +3327,7 @@ ExecBSTruncateTriggers(EState *estate, ResultRelInfo *relinfo) i, relinfo->ri_TrigFunctions, relinfo->ri_TrigInstrument, + estate->es_instrument, GetPerTupleMemoryContext(estate)); if (newtuple) @@ -4383,7 +4395,7 @@ AfterTriggerExecute(EState *estate, * to include time spent re-fetching tuples in the trigger cost. */ if (instr) - InstrStartTrigger(instr + tgindx); + InstrStartTrigger(estate->es_instrument, instr + tgindx); /* * Fetch the required tuple(s). @@ -4571,6 +4583,7 @@ AfterTriggerExecute(EState *estate, tgindx, finfo, NULL, + NULL, per_tuple_context); if (rettuple != NULL && rettuple != LocTriggerData.tg_trigtuple && diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index 77834b96a21c1..c330c891c03e3 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -308,8 +308,8 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE. * * If there are no extensions loaded that care, we could skip this. We - * have no way of knowing whether anyone's looking at pgBufferUsage or - * pgWalUsage, so do it unconditionally. + * have no way of knowing whether anyone's looking at instrumentation, so + * do it unconditionally. 
*/ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(BufferUsage), pcxt->nworkers)); @@ -1006,6 +1006,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) PVIndStats *indstats; PVShared *shared; TidStore *dead_items; + QueryInstrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; int nindexes; @@ -1095,7 +1096,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) error_context_stack = &errcallback; /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Process indexes to perform vacuum/cleanup */ parallel_vacuum_process_safe_indexes(&pvs); @@ -1103,7 +1104,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution */ buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report any remaining cost-based vacuum delay time */ diff --git a/src/backend/executor/README.instrument b/src/backend/executor/README.instrument new file mode 100644 index 0000000000000..7df837dbc77e8 --- /dev/null +++ b/src/backend/executor/README.instrument @@ -0,0 +1,237 @@ +src/backend/executor/README.instrument + +Instrumentation +=============== + +The instrumentation subsystem measures time, buffer usage and WAL activity +during query execution and other similar activities. It is used by +EXPLAIN ANALYZE, pg_stat_statements, and other consumers that need +activity and/or timing metrics over a section of code. 
+ +The design has two central goals: + +* Make it cheap to measure activity in a section of code, even when + that section is called many times and the aggregate is what is used + (as is the case with per-node instrumentation in the executor) + +* Ensure nested instrumentation accurately measures activity/timing, + even when execution is aborted due to errors being thrown. + +The key data structures are defined in src/include/executor/instrument.h +and the implementation lives in src/backend/executor/instrument.c. + + +Instrumentation Options +----------------------- + +Callers specify what to measure with a bitmask of InstrumentOption flags: + + INSTRUMENT_ROWS -- row counts only (used with NodeInstrumentation) + INSTRUMENT_TIMER -- wall-clock timing and row counts + INSTRUMENT_BUFFERS -- buffer hit/read/dirtied/written counts and I/O time + INSTRUMENT_WAL -- WAL records, FPI, bytes + +INSTRUMENT_BUFFERS and INSTRUMENT_WAL utilize the instrumentation stack +(described below) for efficient handling of counter values. + + +Struct Hierarchy +---------------- + +There are the following instrumentation structs, each specialized for a +different scope: + +Instrumentation Base struct. Holds timing and buffer/WAL counters. + +QueryInstrumentation Extends Instrumentation for query-level tracking. When + stack-based tracking is enabled, it owns a dedicated + MemoryContext and uses the ResourceOwner mechanism for + abort cleanup. + +NodeInstrumentation Extends Instrumentation for per-plan-node statistics + (startup time, tuple counts, loop counts, etc). + +TriggerInstrumentation Extends Instrumentation with a firing count. + + +Stack-based instrumentation +=========================== + +For tracking WAL or buffer usage counters, the specialized stack-based +instrumentation is used. + +A simple approach to measuring buffer/WAL activity in a code section could be +to have a set of global counters, snapshot all the counters at the start, and +diff them at the end. 
But this is expensive in practice: BufferUsage alone +has many fields, and the diff must be computed for every InstrStartNode / +InstrStopNode cycle. + +An alternative is to write counter updates directly into the struct that +should receive them, avoiding the diff. But that has two complexities: Low-level +code, such as the buffer manager, has no direct pointers to higher level +structs, such as plan nodes tracking buffer usage. And instrumentation is often +nested: We might both be interested in the aggregate buffer usage of a query, and +the individual per-node details. Stack-based instrumentation solves that: + +At all times, there is a stack that tracks which Instrumentation is currently +active. The stack is represented by instr_stack, a per-backend global +that holds a dynamic array of Instrumentation pointers. The field +instr_stack.current always points to the current stack entry that should +be updated when activity occurs. When the stack array is empty, +instr_stack.current points to instr_top. + +For example, if a backend has two portals open, the overall nesting of +Instrumentation and their respective InstrStart/InstrStop calls creates a +tree-like structure like this: + + Session (instr_top) + | + +-- Query A (QueryInstrumentation) + | | + | +-- NestLoop (NodeInstrumentation) + | | + | +-- Seq Scan A (NodeInstrumentation) + | +-- Seq Scan B (NodeInstrumentation) + | + +-- Query B (QueryInstrumentation) + | + +-- Seq Scan C (NodeInstrumentation) + +While executing Seq Scan B, the stack looks like: + + instr_top (implicit bottom, not in the entries array) + 0: Query A + 1: NestLoop + 2: Seq Scan B <-- instr_stack.current + +When no query is running, the stack is empty (stack_size == 0) and +instr_stack.current points to instr_top. + +Any buffer or WAL counter update (via the INSTR_BUFUSAGE_* and +INSTR_WALUSAGE_* macros in the buffer manager, WAL insertion code, etc.) +writes directly into instr_stack.current. 
Each instrumentation node starts +zeroed, so the values it accumulates while on top of the stack represent +exactly the activity that occurred during that time. + +Every Instrumentation node (except for instr_top) has a target, or parent, it +will be accumulated into, which is typically the Instrumentation that was the +current stack entry when it was created. + +For example, when Seq Scan A gets finalized in regular execution via ExecutorFinish, +its instrumentation data gets added to the immediate parent in +the execution tree, the NestLoop, which will then get added to Query A's +QueryInstrumentation, which then accumulates to the parent. + +While we can typically think of this as a tree, the NodeInstrumentation +underneath a particular QueryInstrumentation could behave differently -- +for example, it could propagate directly to the QueryInstrumentation, in +order to not show cumulative numbers in EXPLAIN ANALYZE. + +Note these relationships are partially implicit, especially when it comes +to NodeInstrumentation. Each QueryInstrumentation maintains a list of its +unfinalized child nodes. The parent of a QueryInstrumentation itself is +determined by the stack (see below): when a query is finalized or cleaned +up on abort, its counters are accumulated to whatever entry is then current +on the stack, which is typically instr_top. + + +Finalization and Abort Safety +============================= + +Finalization is the process of rolling up a node's buffer/WAL counters to +its parent. In normal execution, nodes are pushed onto the stack when they +start and popped when they stop; at finalization time their accumulated +counters are added to the parent. + +Due to the use of longjmp for error handling, functions can exit abruptly +without executing their normal cleanup code. On abort, two things need +to happen: + +1. The stack is reset to the level saved at the start of the aborting + (sub-)transaction level. 
This ensures that we don't later try to update + counters on a freed stack entry. We also need to ensure that the stack + entry that was current before a particular Instrumentation started, is + current again after it stops. + +2. Finalize all affected Instrumentation nodes, rolling up their counters + to the innermost surviving Instrumentation, so that data is not lost. + +For example, if Seq Scan B aborts while the stack is: + + instr_top (implicit bottom) + 0: Query A + 1: NestLoop + 2: Seq Scan B + +The abort handler for Query A accumulates all unfinalized children (Seq +Scan A, Seq Scan B, NestLoop) directly into Query A's counters, then +unwinds the instrumentation stack and accumulates Query A's counters to +instr_top. + +Note that on abort the children do not accumulate through each other (Seq +Scan B -> NestLoop -> Query A); they all accumulate directly to their +parent QueryInstrumentation. This means the order in which children are +released does not matter -- this is important because ResourceOwner cleanup +does not guarantee a particular release order. The per-node breakdown is lost, +but the instrumentation active when the query was started (instr_top in the +above example) survives the abort, and its counters include the activity. + +If multiple QueryInstrumentations are active on the stack (e.g. nested +portals), the abort handler of each uses InstrStopFinalize() to accumulate +the statistics to the parent entry of either the entry being released, or a +previously released entry if it was higher up in the stack, so they compose +correctly regardless of release order. + +There are two mechanisms for achieving abort safety: + +* Resource Owner (QueryInstrumentation): registers with the current + ResourceOwner at start. 
On transaction abort, the resource owner system + calls the release callback, which walks unfinalized child entries, + accumulates their data, unwinds the stack, and destroys the dedicated + memory context (freeing the QueryInstrumentation and all child + allocations as a unit). This is the recommended approach when the + instrumented code already has an appropriate resource owner (e.g. it + runs inside a portal). The query executor uses this path. + +* PG_FINALLY (base Instrumentation): when no suitable resource owner + exists, or when the caller wants to inspect the instrumentation data + even after an error, the base Instrumentation can be used with a + PG_TRY/PG_FINALLY block that calls InstrStopFinalize(). + +Both mechanisms add overhead, so neither is suitable for high-frequency +instrumentation like per-node measurements in the executor. Instead, +plan node and trigger children rely on their parent QueryInstrumentation +for abort safety: they are allocated in the parent's memory context and +registered in its unfinalized-entries list, so the parent's abort handler +recovers their data automatically. In normal execution, children are +finalized explicitly by the caller. + +Parallel Query +-------------- + +Parallel workers get their own QueryInstrumentation so they can measure +buffer and WAL activity independently, then copy the totals into dynamic +shared memory at worker shutdown. The leader accumulates these into its +own stack. + +When per-node instrumentation is active, parallel workers skip per-node +finalization at shutdown to avoid double-counting; the per-node data is +aggregated separately through InstrAggNode(). + + +Memory Handling +=============== + +Instrumentation objects that use the stack must survive until finalization +runs, including the abort case. To ensure this, QueryInstrumentation +creates a dedicated "Instrumentation" MemoryContext (instr_cxt) as a child +of TopMemoryContext. 
All child instrumentation (nodes, triggers) should be +allocated in this context. + +On successful completion, instr_cxt is reparented to CurrentMemoryContext +so its lifetime is tied to the caller's context. On abort, the +ResourceOwner cleanup frees it after accumulating the instrumentation data +to the current stack entry after resetting the stack. + +When the stack is not needed (timer/rows only), Instrumentation allocations +happen in CurrentMemoryContext instead of TopMemoryContext. diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index b0f636bf8b6c2..d0cd34d286c28 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -78,6 +78,7 @@ ExecutorCheckPerms_hook_type ExecutorCheckPerms_hook = NULL; /* decls for local routines only used within this module */ static void InitPlan(QueryDesc *queryDesc, int eflags); static void CheckValidRowMarkRel(Relation rel, RowMarkType markType); +static void ExecFinalizeTriggerInstrumentation(EState *estate); static void ExecPostprocessPlan(EState *estate); static void ExecEndPlan(PlanState *planstate, EState *estate); static void ExecutePlan(QueryDesc *queryDesc, @@ -247,9 +248,16 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags) estate->es_snapshot = RegisterSnapshot(queryDesc->snapshot); estate->es_crosscheck_snapshot = RegisterSnapshot(queryDesc->crosscheck_snapshot); estate->es_top_eflags = eflags; - estate->es_instrument = queryDesc->instrument_options; estate->es_jit_flags = queryDesc->plannedstmt->jitFlags; + /* + * Set up per-node instrumentation if needed. We do this before InitPlan + * so that node and trigger instrumentation can be allocated within the + * query's dedicated instrumentation memory context. 
+ */ + if (!estate->es_instrument && queryDesc->instrument_options) + estate->es_instrument = InstrQueryAlloc(queryDesc->instrument_options); + /* * Set up an AFTER-trigger statement context, unless told not to, or * unless it's EXPLAIN-only mode (when ExecutorFinish won't be called). @@ -331,9 +339,11 @@ standard_ExecutorRun(QueryDesc *queryDesc, */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - /* Allow instrumentation of Executor overall runtime */ + /* Start up instrumentation for this execution run */ if (queryDesc->totaltime) - InstrStart(queryDesc->totaltime); + InstrQueryStart(queryDesc->totaltime); + if (estate->es_instrument) + InstrQueryStart(estate->es_instrument); /* * extract information from the query descriptor and the query feature. @@ -384,8 +394,10 @@ standard_ExecutorRun(QueryDesc *queryDesc, if (sendTuples) dest->rShutdown(dest); + if (estate->es_instrument) + InstrQueryStop(estate->es_instrument); if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime); + InstrQueryStop(queryDesc->totaltime); MemoryContextSwitchTo(oldcontext); } @@ -435,7 +447,9 @@ standard_ExecutorFinish(QueryDesc *queryDesc) /* Allow instrumentation of Executor overall runtime */ if (queryDesc->totaltime) - InstrStart(queryDesc->totaltime); + InstrQueryStart(queryDesc->totaltime); + if (estate->es_instrument) + InstrQueryStart(estate->es_instrument); /* Run ModifyTable nodes to completion */ ExecPostprocessPlan(estate); @@ -444,8 +458,32 @@ standard_ExecutorFinish(QueryDesc *queryDesc) if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS)) AfterTriggerEndQuery(estate); + if (estate->es_instrument) + { + /* + * Accumulate per-node and trigger statistics to their respective + * parent instrumentation stacks. + * + * We skip this in parallel workers because their per-node stats are + * reported individually via ExecParallelReportInstrumentation, and + * the leader's own ExecFinalizeNodeInstrumentation handles + * propagation. 
If we accumulated here, the leader would + * double-count: worker parent nodes would already include their + * children's stats, and then the leader's accumulation would add the + * children again. + */ + if (!IsParallelWorker()) + { + ExecFinalizeNodeInstrumentation(queryDesc->planstate); + + ExecFinalizeTriggerInstrumentation(estate); + } + + InstrQueryStopFinalize(estate->es_instrument); + } + if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime); + InstrQueryStopFinalize(queryDesc->totaltime); MemoryContextSwitchTo(oldcontext); @@ -1263,7 +1301,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, Relation resultRelationDesc, Index resultRelationIndex, ResultRelInfo *partition_root_rri, - int instrument_options) + QueryInstrumentation *qinstr) { MemSet(resultRelInfo, 0, sizeof(ResultRelInfo)); resultRelInfo->type = T_ResultRelInfo; @@ -1284,8 +1322,8 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, palloc0_array(FmgrInfo, n); resultRelInfo->ri_TrigWhenExprs = (ExprState **) palloc0_array(ExprState *, n); - if (instrument_options) - resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(n, instrument_options); + if (qinstr) + resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(qinstr, n); } else { @@ -1358,6 +1396,10 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, * also provides a way for EXPLAIN ANALYZE to report the runtimes of such * triggers.) So we make additional ResultRelInfo's as needed, and save them * in es_trig_target_relations. + * + * Note: if new relation lists are searched here, they must also be added to + * ExecFinalizeTriggerInstrumentation so that trigger instrumentation data + * is properly accumulated. 
*/ ResultRelInfo * ExecGetTriggerResultRel(EState *estate, Oid relid, @@ -1500,6 +1542,30 @@ ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo) return resultRelInfo->ri_ancestorResultRels; } +static void +ExecFinalizeTriggerInstrumentation(EState *estate) +{ + List *rels = NIL; + + rels = list_concat(rels, estate->es_tuple_routing_result_relations); + rels = list_concat(rels, estate->es_opened_result_relations); + rels = list_concat(rels, estate->es_trig_target_relations); + + foreach_node(ResultRelInfo, rInfo, rels) + { + TriggerInstrumentation *ti = rInfo->ri_TrigInstrument; + + if (ti == NULL || rInfo->ri_TrigDesc == NULL) + continue; + + for (int nt = 0; nt < rInfo->ri_TrigDesc->numtriggers; nt++) + { + if (ti[nt].instr.need_stack) + InstrAccumStack(&estate->es_instrument->instr, &ti[nt].instr); + } + } +} + /* ---------------------------------------------------------------- * ExecPostprocessPlan * diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 78f60c1530ce1..c01e780f918e1 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -700,7 +700,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, * * If EXPLAIN is not in use and there are no extensions loaded that care, * we could skip this. But we have no way of knowing whether anyone's - * looking at pgBufferUsage, so do it unconditionally. + * looking at instrumentation, so do it unconditionally. 
*/ shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(BufferUsage), pcxt->nworkers)); @@ -825,13 +825,13 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, int i; instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len); - instrumentation->instrument_options = estate->es_instrument; + instrumentation->instrument_options = estate->es_instrument->instrument_options; instrumentation->instrument_offset = instrument_offset; instrumentation->num_workers = nworkers; instrumentation->num_plan_nodes = e.nnodes; instrument = GetInstrumentationArray(instrumentation); for (i = 0; i < nworkers * e.nnodes; ++i) - InstrInitNode(&instrument[i], estate->es_instrument); + InstrInitNode(&instrument[i], estate->es_instrument->instrument_options); shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instrumentation); pei->instrumentation = instrumentation; @@ -1081,14 +1081,28 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, instrument = GetInstrumentationArray(instrumentation); instrument += i * instrumentation->num_workers; for (n = 0; n < instrumentation->num_workers; ++n) + { InstrAggNode(planstate->instrument, &instrument[n]); + /* + * Also add worker WAL usage to the global pgWalUsage counter. + * + * When per-node instrumentation is active, parallel workers skip + * ExecFinalizeNodeInstrumentation (to avoid double-counting in + * EXPLAIN), so per-node WAL activity is not rolled up into the + * query-level stats that InstrAccumParallelQuery receives. Without + * this, pgWalUsage would under-report WAL generated by parallel + * workers when instrumentation is active. + */ + WalUsageAdd(&pgWalUsage, &instrument[n].instr.walusage); + } + /* * Also store the per-worker detail. * - * Worker instrumentation should be allocated in the same context as the - * regular instrumentation information, which is the per-query context. - * Switch into per-query memory context. + * Ensure worker instrumentation is allocated in the per-query context. 
We + * don't need to place this in the instrumentation context since no more + * stack-based instrumentation work is being done. */ oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt); ibytes = mul_size(instrumentation->num_workers, sizeof(NodeInstrumentation)); @@ -1238,9 +1252,13 @@ ExecParallelCleanup(ParallelExecutorInfo *pei) { /* Accumulate instrumentation, if any. */ if (pei->instrumentation) + { ExecParallelRetrieveInstrumentation(pei->planstate, pei->instrumentation); + ExecFinalizeWorkerInstrumentation(pei->planstate); + } + /* Accumulate JIT instrumentation, if any. */ if (pei->jit_instrumentation) ExecParallelRetrieveJitInstrumentation(pei->planstate, @@ -1462,6 +1480,7 @@ void ParallelQueryMain(dsm_segment *seg, shm_toc *toc) { FixedParallelExecutorState *fpes; + QueryInstrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; DestReceiver *receiver; @@ -1522,7 +1541,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) * leader, which also doesn't count buffer accesses and WAL activity that * occur during executor startup. */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Run the plan. If we specified a tuple bound, be careful not to demand @@ -1538,7 +1557,8 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution. */ buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report instrumentation data if any instrumentation options are set. 
*/ diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index d96d4f9947b79..6f2909a1bc35a 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -1381,7 +1381,7 @@ ExecInitPartitionDispatchInfo(EState *estate, { ResultRelInfo *rri = makeNode(ResultRelInfo); - InitResultRelInfo(rri, rel, 0, rootResultRelInfo, 0); + InitResultRelInfo(rri, rel, 0, rootResultRelInfo, NULL); proute->nonleaf_partitions[dispatchidx] = rri; } else diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 132fe37ef60f8..3b3ec9850e8e1 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -123,6 +123,8 @@ static TupleTableSlot *ExecProcNodeFirst(PlanState *node); static TupleTableSlot *ExecProcNodeInstr(PlanState *node); static bool ExecShutdownNode_walker(PlanState *node, void *context); +static bool ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context); +static bool ExecFinalizeWorkerInstrumentation_walker(PlanState *node, void *context); /* ------------------------------------------------------------------------ @@ -788,10 +790,10 @@ ExecShutdownNode_walker(PlanState *node, void *context) * at least once already. We don't expect much CPU consumption during * node shutdown, but in the case of Gather or Gather Merge, we may shut * down workers at this stage. If so, their buffer usage will get - * propagated into pgBufferUsage at this point, and we want to make sure - * that it gets associated with the Gather node. We skip this if the node - * has never been executed, so as to avoid incorrectly making it appear - * that it has. + * propagated into the current instrumentation stack entry at this point, + * and we want to make sure that it gets associated with the Gather node. + * We skip this if the node has never been executed, so as to avoid + * incorrectly making it appear that it has. 
*/ if (node->instrument && node->instrument->running) InstrStartNode(node->instrument); @@ -829,6 +831,99 @@ return false; } +/* + * ExecFinalizeNodeInstrumentation + * + * Accumulate instrumentation stats from all execution nodes to their respective + * parents (or the original parent instrumentation). + * + * This must run after the cleanup done by ExecShutdownNode, and not rely on any + * resources cleaned up by it. We also expect shutdown actions to have occurred, + * e.g. parallel worker instrumentation to have been added to the leader. + */ +void +ExecFinalizeNodeInstrumentation(PlanState *node) +{ + (void) ExecFinalizeNodeInstrumentation_walker(node, instr_stack.current); +} + +static bool +ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context) +{ + Instrumentation *parent = (Instrumentation *) context; + + Assert(parent != NULL); + + if (node == NULL) + return false; + + Assert(node->instrument != NULL); + + /* + * Recurse into children first (bottom-up accumulation), and accumulate + * to this node's instrumentation as the parent context. + */ + planstate_tree_walker(node, ExecFinalizeNodeInstrumentation_walker, + &node->instrument->instr); + + InstrFinalizeChild(&node->instrument->instr, parent); + + return false; +} + +/* + * ExecFinalizeWorkerInstrumentation + * + * Accumulate per-worker instrumentation stats from child nodes into their + * parents, mirroring what ExecFinalizeNodeInstrumentation does for the + * leader's own stats. Without this, per-worker buffer/WAL stats shown by + * EXPLAIN (ANALYZE, VERBOSE) would only reflect each node's own direct + * activity, not including children. + * + * This must run after ExecParallelRetrieveInstrumentation has populated + * worker_instrument for all nodes in the parallel subtree. 
+ */ +void +ExecFinalizeWorkerInstrumentation(PlanState *node) +{ + (void) ExecFinalizeWorkerInstrumentation_walker(node, NULL); +} + +static bool +ExecFinalizeWorkerInstrumentation_walker(PlanState *node, void *context) +{ + PlanState *parent = (PlanState *) context; + int num_workers; + + if (node == NULL) + return false; + + /* + * Recurse into children first (bottom-up accumulation), passing this node + * as parent context if it has worker_instrument, otherwise pass through + * the previous parent. + */ + planstate_tree_walker(node, ExecFinalizeWorkerInstrumentation_walker, + node->worker_instrument ? (void *) node : context); + + if (!node->worker_instrument) + return false; + + num_workers = node->worker_instrument->num_workers; + + /* Accumulate this node's per-worker stats to parent's per-worker stats */ + if (parent && parent->worker_instrument) + { + int parent_workers = parent->worker_instrument->num_workers; + + for (int n = 0; n < Min(num_workers, parent_workers); n++) + InstrAccumStack(&parent->worker_instrument->instrument[n].instr, + &node->worker_instrument->instrument[n].instr); + } + + return false; +} + /* * ExecSetTupleBound * diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 1eb6b9f1f4068..700764daf45d3 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -150,7 +150,7 @@ CreateExecutorState(void) estate->es_total_processed = 0; estate->es_top_eflags = 0; - estate->es_instrument = 0; + estate->es_instrument = NULL; estate->es_finished = false; estate->es_exprcontexts = NIL; @@ -227,6 +227,15 @@ FreeExecutorState(EState *estate) estate->es_partition_directory = NULL; } + /* + * Make sure the instrumentation context gets freed. This usually gets + * re-parented under the per-query context in InstrQueryStopFinalize, but + * that won't happen during EXPLAIN (BUFFERS) since ExecutorFinish never + * gets called, so we would otherwise leak it in TopMemoryContext. 
+ */ + if (estate->es_instrument && estate->es_instrument->instr.need_stack) + MemoryContextDelete(estate->es_instrument->instr_cxt); + /* * Free the per-query memory context, thereby releasing all working * memory, including the EState node itself. diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index e3d890a7f98d6..f9202b558d656 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -16,31 +16,53 @@ #include #include "executor/instrument.h" +#include "utils/memutils.h" +#include "utils/resowner.h" -BufferUsage pgBufferUsage; -static BufferUsage save_pgBufferUsage; WalUsage pgWalUsage; -static WalUsage save_pgWalUsage; +Instrumentation instr_top; +InstrStackState instr_stack = { + .stack_space = 0, + .stack_size = 0, + .entries = NULL, + .current = &instr_top, +}; -static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); -static void WalUsageAdd(WalUsage *dst, WalUsage *add); +void +InstrStackGrow(void) +{ + int space = instr_stack.stack_space; + Assert(instr_stack.stack_size >= instr_stack.stack_space); + + if (instr_stack.entries == NULL) + { + space = 10; /* Allocate sufficient initial space for + * typical activity */ + instr_stack.entries = MemoryContextAlloc(TopMemoryContext, + sizeof(Instrumentation *) * space); + } + else + { + space *= 2; + instr_stack.entries = repalloc_array(instr_stack.entries, Instrumentation *, space); + } + + /* Update stack space after allocation succeeded to protect against OOMs */ + instr_stack.stack_space = space; +} /* General purpose instrumentation handling */ -Instrumentation * -InstrAlloc(int instrument_options) +static inline bool +InstrNeedStack(int instrument_options) { - Instrumentation *instr = palloc0(sizeof(Instrumentation)); - - InstrInitOptions(instr, instrument_options); - return instr; + return (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL)) != 0; } void InstrInitOptions(Instrumentation *instr, int instrument_options) 
{ - instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0; - instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0; + instr->need_stack = InstrNeedStack(instrument_options); instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; } @@ -55,52 +77,309 @@ InstrStart(Instrumentation *instr) INSTR_TIME_SET_CURRENT(instr->starttime); } - /* save buffer usage totals at node entry, if needed */ - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; + if (instr->need_stack) + InstrPushStack(instr); +} + +static void +InstrStopTimer(Instrumentation *instr) +{ + instr_time endtime; - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + /* let's update the time only if the timer was requested */ + if (INSTR_TIME_IS_ZERO(instr->starttime)) + elog(ERROR, "InstrStop called without start"); + + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime); + + INSTR_TIME_SET_ZERO(instr->starttime); } void InstrStop(Instrumentation *instr) { - instr_time endtime; + if (instr->need_timer) + InstrStopTimer(instr); + + if (instr->need_stack) + InstrPopStack(instr); +} + +/* + * Stops instrumentation, finalizes the stack entry and accumulates to its parent. + * + * Note that this intentionally allows passing a stack that is not the current + * top, as can happen with PG_FINALLY, or resource owners, which don't have a + * guaranteed cleanup order. 
+ * + * We are careful here to achieve two goals: + * + * 1) Reset the stack to the parent of whichever of the released stack entries + * has the lowest index + * 2) Accumulate all instrumentation to the currently active instrumentation, + * so that callers get a complete picture of activity, even after an abort + */ +void +InstrStopFinalize(Instrumentation *instr) +{ + if (instr->on_stack) + { + int idx = -1; + + for (int i = instr_stack.stack_size - 1; i >= 0; i--) + { + if (instr_stack.entries[i] == instr) + { + idx = i; + break; + } + } + + if (idx < 0) + elog(ERROR, "instrumentation entry not found on stack"); + + /* Clear on_stack for any intermediate entries we're skipping over */ + for (int i = instr_stack.stack_size - 1; i > idx; i--) + instr_stack.entries[i]->on_stack = false; + + while (instr_stack.stack_size > idx + 1) + instr_stack.stack_size--; + + InstrPopStack(instr); + } - /* let's update the time only if the timer was requested */ if (instr->need_timer) + InstrStopTimer(instr); + + InstrAccumStack(instr_stack.current, instr); +} + +/* + * Finalize child instrumentation by accumulating buffer/WAL usage to the + * provided instrumentation, which may be the current entry, or one the caller + * treats as a parent and will add to the totals later. + * + * Also deletes the unfinalized entry to avoid double counting in an abort + * situation, e.g. during executor finish. + */ +void +InstrFinalizeChild(Instrumentation *instr, Instrumentation *parent) +{ + if (instr->need_stack) { - if (INSTR_TIME_IS_ZERO(instr->starttime)) - elog(ERROR, "InstrStop called without start"); + if (!dlist_node_is_detached(&instr->unfinalized_entry)) + dlist_delete_thoroughly(&instr->unfinalized_entry); - INSTR_TIME_SET_CURRENT(endtime); - INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime); + InstrAccumStack(parent, instr); + } +} + + +/* Query instrumentation handling */ + +/* + * Use ResourceOwner mechanism to correctly reset instr_stack on abort. 
+ */ +static void ResOwnerReleaseInstrumentation(Datum res); +static const ResourceOwnerDesc instrumentation_resowner_desc = +{ + .name = "instrumentation", + .release_phase = RESOURCE_RELEASE_AFTER_LOCKS, + .release_priority = RELEASE_PRIO_INSTRUMENTATION, + .ReleaseResource = ResOwnerReleaseInstrumentation, + .DebugPrint = NULL, /* default message is fine */ +}; + +static inline void +ResourceOwnerRememberInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr) +{ + ResourceOwnerRemember(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc); +} + +static inline void +ResourceOwnerForgetInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr) +{ + ResourceOwnerForget(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc); +} + +static void +ResOwnerReleaseInstrumentation(Datum res) +{ + QueryInstrumentation *qinstr = (QueryInstrumentation *) DatumGetPointer(res); + MemoryContext instr_cxt = qinstr->instr_cxt; + dlist_mutable_iter iter; + + /* Accumulate data from all unfinalized child entries (nodes, triggers) */ + dlist_foreach_modify(iter, &qinstr->unfinalized_entries) + { + Instrumentation *child = dlist_container(Instrumentation, unfinalized_entry, iter.cur); - INSTR_TIME_SET_ZERO(instr->starttime); + InstrAccumStack(&qinstr->instr, child); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); + /* Ensure the stack is reset as expected, and we accumulate to the parent */ + InstrStopFinalize(&qinstr->instr); - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + /* + * Destroy the dedicated instrumentation context, which frees the + * QueryInstrumentation and all child allocations. 
+ */ + MemoryContextDelete(instr_cxt); +} + +QueryInstrumentation * +InstrQueryAlloc(int instrument_options) +{ + QueryInstrumentation *instr; + MemoryContext instr_cxt; + + /* + * When the instrumentation stack is used, create a dedicated memory + * context for this query's instrumentation allocations. This context is a + * child of TopMemoryContext so it survives transaction abort — + * ResourceOwner release needs to access it. + * + * For simpler cases (timer/rows only), use the current memory context. + * + * All child instrumentation allocations (nodes, triggers, etc) must be + * allocated within this context to ensure correct clean up on abort. + */ + if (InstrNeedStack(instrument_options)) + instr_cxt = AllocSetContextCreate(TopMemoryContext, + "Instrumentation", + ALLOCSET_SMALL_SIZES); + else + instr_cxt = CurrentMemoryContext; + + instr = MemoryContextAllocZero(instr_cxt, sizeof(QueryInstrumentation)); + instr->instrument_options = instrument_options; + instr->instr_cxt = instr_cxt; + + InstrInitOptions(&instr->instr, instrument_options); + dlist_init(&instr->unfinalized_entries); + + return instr; +} + +void +InstrQueryStart(QueryInstrumentation *qinstr) +{ + InstrStart(&qinstr->instr); + + if (qinstr->instr.need_stack) + { + Assert(CurrentResourceOwner != NULL); + qinstr->owner = CurrentResourceOwner; + + ResourceOwnerEnlarge(qinstr->owner); + ResourceOwnerRememberInstrumentation(qinstr->owner, qinstr); + } +} + +void +InstrQueryStop(QueryInstrumentation *qinstr) +{ + InstrStop(&qinstr->instr); + + if (qinstr->instr.need_stack) + { + Assert(qinstr->owner != NULL); + ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr); + qinstr->owner = NULL; + } +} + +void +InstrQueryStopFinalize(QueryInstrumentation *qinstr) +{ + InstrStopFinalize(&qinstr->instr); + + if (!qinstr->instr.need_stack) + { + Assert(qinstr->owner == NULL); + return; + } + + Assert(qinstr->owner != NULL); + ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr); + qinstr->owner = 
NULL; + + /* + * Reparent the dedicated instrumentation context under the current memory + * context, so that its lifetime is now tied to the caller's context + * rather than TopMemoryContext. + */ + MemoryContextSetParent(qinstr->instr_cxt, CurrentMemoryContext); +} + +/* + * Register a child Instrumentation entry for abort processing. + * + * On abort, ResOwnerReleaseInstrumentation will walk the parent's list to + * recover buffer/WAL data from entries that were never finalized, in order for + * aggregate totals to be accurate despite the query erroring out. + */ +void +InstrQueryRememberChild(QueryInstrumentation *parent, Instrumentation *child) +{ + if (child->need_stack) + dlist_push_head(&parent->unfinalized_entries, &child->unfinalized_entry); +} + +/* start instrumentation during parallel executor startup */ +QueryInstrumentation * +InstrStartParallelQuery(void) +{ + QueryInstrumentation *qinstr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + + InstrQueryStart(qinstr); + return qinstr; +} + +/* report usage after parallel executor shutdown */ +void +InstrEndParallelQuery(QueryInstrumentation *qinstr, BufferUsage *bufusage, WalUsage *walusage) +{ + InstrQueryStopFinalize(qinstr); + memcpy(bufusage, &qinstr->instr.bufusage, sizeof(BufferUsage)); + memcpy(walusage, &qinstr->instr.walusage, sizeof(WalUsage)); +} + +/* + * Accumulate work done by parallel workers in the leader's stats. + * + * Note that what gets added here effectively depends on whether per-node + * instrumentation is active. If it's active the parallel worker intentionally + * skips ExecFinalizeNodeInstrumentation on executor shutdown, because it would + * cause double counting. Instead, this only accumulates any extra activity + * outside of nodes. + * + * Otherwise this is responsible for making sure that the complete query + * activity is accumulated. 
+ */ +void +InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +{ + BufferUsageAdd(&instr_stack.current->bufusage, bufusage); + WalUsageAdd(&instr_stack.current->walusage, walusage); + + WalUsageAdd(&pgWalUsage, walusage); } /* Node instrumentation handling */ /* Allocate new node instrumentation structure */ NodeInstrumentation * -InstrAllocNode(int instrument_options, bool async_mode) +InstrAllocNode(QueryInstrumentation *qinstr, bool async_mode) { - NodeInstrumentation *instr = palloc(sizeof(NodeInstrumentation)); + NodeInstrumentation *instr = MemoryContextAlloc(qinstr->instr_cxt, sizeof(NodeInstrumentation)); - InstrInitNode(instr, instrument_options); + InstrInitNode(instr, qinstr->instrument_options); instr->async_mode = async_mode; + InstrQueryRememberChild(qinstr, &instr->instr); + return instr; } @@ -119,6 +398,7 @@ InstrStartNode(NodeInstrumentation *instr) InstrStart(&instr->instr); } + /* Exit from a plan node */ void InstrStopNode(NodeInstrumentation *instr, double nTuples) @@ -148,14 +428,12 @@ InstrStopNode(NodeInstrumentation *instr, double nTuples) INSTR_TIME_SET_ZERO(instr->instr.starttime); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->instr.need_bufusage) - BufferUsageAccumDiff(&instr->instr.bufusage, - &pgBufferUsage, &instr->instr.bufusage_start); - - if (instr->instr.need_walusage) - WalUsageAccumDiff(&instr->instr.walusage, - &pgWalUsage, &instr->instr.walusage_start); + /* + * Only pop the stack, accumulation runs in + * ExecFinalizeNodeInstrumentation + */ + if (instr->instr.need_stack) + InstrPopStack(&instr->instr); /* Is this the first tuple of this cycle? 
*/ if (!instr->running) @@ -190,8 +468,8 @@ InstrEndLoop(NodeInstrumentation *instr) if (!instr->running) return; - if (!INSTR_TIME_IS_ZERO(instr->instr.starttime)) - elog(ERROR, "InstrEndLoop called on running node"); + /* Ensure InstrNodeStop was called */ + Assert(INSTR_TIME_IS_ZERO(instr->instr.starttime)); /* Accumulate per-cycle statistics into totals */ INSTR_TIME_ADD(instr->startup, instr->firsttuple); @@ -225,67 +503,73 @@ InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add) dst->nfiltered2 += add->nfiltered2; /* Add delta of buffer usage since entry to node's totals */ - if (dst->instr.need_bufusage) - BufferUsageAdd(&dst->instr.bufusage, &add->instr.bufusage); - - if (dst->instr.need_walusage) - WalUsageAdd(&dst->instr.walusage, &add->instr.walusage); + if (dst->instr.need_stack) + InstrAccumStack(&dst->instr, &add->instr); } /* Trigger instrumentation handling */ TriggerInstrumentation * -InstrAllocTrigger(int n, int instrument_options) +InstrAllocTrigger(QueryInstrumentation *qinstr, int n) { - TriggerInstrumentation *tginstr = palloc0(n * sizeof(TriggerInstrumentation)); + TriggerInstrumentation *tginstr; int i; + /* + * Allocate in the query's dedicated instrumentation context so all + * instrumentation data is grouped together and cleaned up as a unit. + */ + Assert(qinstr != NULL && qinstr->instr_cxt != NULL); + tginstr = MemoryContextAllocZero(qinstr->instr_cxt, + n * sizeof(TriggerInstrumentation)); + for (i = 0; i < n; i++) - InstrInitOptions(&tginstr[i].instr, instrument_options); + InstrInitOptions(&tginstr[i].instr, qinstr->instrument_options); return tginstr; } void -InstrStartTrigger(TriggerInstrumentation *tginstr) +InstrStartTrigger(QueryInstrumentation *qinstr, TriggerInstrumentation *tginstr) { InstrStart(&tginstr->instr); + + /* + * On first call, register with the parent QueryInstrumentation for abort + * recovery. 
+ */ + if (qinstr && tginstr->instr.need_stack && + dlist_node_is_detached(&tginstr->instr.unfinalized_entry)) + dlist_push_head(&qinstr->unfinalized_entries, + &tginstr->instr.unfinalized_entry); } void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings) { + /* + * This trigger may be called again, so we don't finalize instrumentation + * here. Accumulation to the parent happens at ExecutorFinish through + * ExecFinalizeTriggerInstrumentation. + */ InstrStop(&tginstr->instr); tginstr->firings += firings; } -/* note current values during parallel executor startup */ void -InstrStartParallelQuery(void) +InstrAccumStack(Instrumentation *dst, Instrumentation *add) { - save_pgBufferUsage = pgBufferUsage; - save_pgWalUsage = pgWalUsage; -} + Assert(dst != NULL); + Assert(add != NULL); -/* report usage after parallel executor shutdown */ -void -InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage) -{ - memset(bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage); - memset(walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage); -} + if (!add->need_stack) + return; -/* accumulate work done by workers in leader's stats */ -void -InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) -{ - BufferUsageAdd(&pgBufferUsage, bufusage); - WalUsageAdd(&pgWalUsage, walusage); + BufferUsageAdd(&dst->bufusage, &add->bufusage); + WalUsageAdd(&dst->walusage, &add->walusage); } /* dst += add */ -static void +void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) { dst->shared_blks_hit += add->shared_blks_hit; @@ -306,39 +590,9 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time); } -/* dst += add - sub */ +/* dst += add */ void -BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, - const BufferUsage *sub) -{ - dst->shared_blks_hit += add->shared_blks_hit - 
sub->shared_blks_hit; - dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read; - dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied; - dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written; - dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit; - dst->local_blks_read += add->local_blks_read - sub->local_blks_read; - dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied; - dst->local_blks_written += add->local_blks_written - sub->local_blks_written; - dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; - dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time, - add->shared_blk_read_time, sub->shared_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time, - add->shared_blk_write_time, sub->shared_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_read_time, - add->local_blk_read_time, sub->local_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_write_time, - add->local_blk_write_time, sub->local_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time, - add->temp_blk_read_time, sub->temp_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time, - add->temp_blk_write_time, sub->temp_blk_write_time); -} - -/* helper functions for WAL usage accumulation */ -static void -WalUsageAdd(WalUsage *dst, WalUsage *add) +WalUsageAdd(WalUsage *dst, const WalUsage *add) { dst->wal_bytes += add->wal_bytes; dst->wal_records += add->wal_records; diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index b38170f0fbe99..3ca0a7a635dba 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -904,7 +904,7 @@ create_edata_for_relation(LogicalRepRelMapEntry *rel) * Use Relation opened by logicalrep_rel_open() instead of opening it * again. 
*/ - InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, 0); + InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, NULL); /* * We put the ResultRelInfo in the es_opened_result_relations list, even diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 3e1c39160db0d..cf4f4246ca2b4 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1266,9 +1266,9 @@ PinBufferForBlock(Relation rel, if (rel) { /* - * While pgBufferUsage's "read" counter isn't bumped unless we reach - * WaitReadBuffers() (so, not for hits, and not for buffers that are - * zeroed instead), the per-relation stats always count them. + * While the current buffer usage "read" counter isn't bumped unless + * we reach WaitReadBuffers() (so, not for hits, and not for buffers + * that are zeroed instead), the per-relation stats always count them. */ pgstat_count_buffer_read(rel); } diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index e3829d7fe7cef..e7fc7f071d84e 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -114,9 +114,9 @@ pgstat_prepare_io_time(bool track_io_guc) * pg_stat_database only counts block read and write times, these are done for * IOOP_READ, IOOP_WRITE and IOOP_EXTEND. * - * pgBufferUsage is used for EXPLAIN. pgBufferUsage has write and read stats - * for shared, local and temporary blocks. pg_stat_io does not track the - * activity of temporary blocks, so these are ignored here. + * Executor instrumentation is used for EXPLAIN. Buffer usage tracked there has + * write and read stats for shared, local and temporary blocks. pg_stat_io + * does not track the activity of temporary blocks, so these are ignored here. 
*/ void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h index d3a572428449d..340029a203422 100644 --- a/src/include/executor/execdesc.h +++ b/src/include/executor/execdesc.h @@ -51,8 +51,8 @@ typedef struct QueryDesc /* This field is set by ExecutePlan */ bool already_executed; /* true if previously executed */ - /* This is always set NULL by the core system, but plugins can change it */ - struct Instrumentation *totaltime; /* total time spent in ExecutorRun */ + /* This field is set by ExecutorRun, or plugins */ + struct QueryInstrumentation *totaltime; /* total time spent in ExecutorRun */ } QueryDesc; /* in pquery.c */ diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 491c48865066a..03f0e864176eb 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -233,6 +233,7 @@ ExecGetJunkAttribute(TupleTableSlot *slot, AttrNumber attno, bool *isNull) /* * prototypes from functions in execMain.c */ +typedef struct QueryInstrumentation QueryInstrumentation; extern void ExecutorStart(QueryDesc *queryDesc, int eflags); extern void standard_ExecutorStart(QueryDesc *queryDesc, int eflags); extern void ExecutorRun(QueryDesc *queryDesc, @@ -254,7 +255,7 @@ extern void InitResultRelInfo(ResultRelInfo *resultRelInfo, Relation resultRelationDesc, Index resultRelationIndex, ResultRelInfo *partition_root_rri, - int instrument_options); + QueryInstrumentation *qinstr); extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid, ResultRelInfo *rootRelInfo); extern List *ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo); @@ -301,6 +302,8 @@ extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function); extern Node *MultiExecProcNode(PlanState *node); extern void ExecEndNode(PlanState *node); extern void ExecShutdownNode(PlanState *node); +extern void 
ExecFinalizeNodeInstrumentation(PlanState *node); +extern void ExecFinalizeWorkerInstrumentation(PlanState *node); extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node); diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index d4769f3da7bde..d2f0191af27a4 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -13,6 +13,7 @@ #ifndef INSTRUMENT_H #define INSTRUMENT_H +#include "lib/ilist.h" #include "portability/instr_time.h" @@ -68,29 +69,92 @@ typedef enum InstrumentOption } InstrumentOption; /* - * General purpose instrumentation that can capture time and WAL/buffer usage + * Instrumentation base class for capturing time and WAL/buffer usage * - * Initialized through InstrAlloc, followed by one or more calls to a pair of - * InstrStart/InstrStop (activity is measured inbetween). + * If used directly: + * - Allocate on the stack and zero initialize the struct + * - Call InstrInitOptions to set instrumentation options + * - Call InstrStart before the activity you want to measure + * - Call InstrStop / InstrStopFinalize after the activity to capture totals + * + * InstrStart/InstrStop may be called multiple times. The last stop call must + * be to InstrStopFinalize to ensure parent stack entries get the accumulated + * totals. If there is risk of transaction aborts you must call + * InstrStopFinalize in a PG_TRY/PG_FINALLY block to avoid corrupting the + * instrumentation stack. + * + * In a query context use QueryInstrumentation instead, which handles aborts + * using the resource owner logic. 
*/ typedef struct Instrumentation { /* Parameters set at creation: */ bool need_timer; /* true if we need timer data */ - bool need_bufusage; /* true if we need buffer usage data */ - bool need_walusage; /* true if we need WAL usage data */ + bool need_stack; /* true if we need WAL/buffer usage data */ /* Internal state keeping: */ + bool on_stack; /* true if currently on instr_stack */ instr_time starttime; /* start time of last InstrStart */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics: */ instr_time total; /* total runtime */ BufferUsage bufusage; /* total buffer usage */ WalUsage walusage; /* total WAL usage */ + /* Abort handling: link in parent QueryInstrumentation's unfinalized list */ + dlist_node unfinalized_entry; } Instrumentation; +/* + * Query-related instrumentation tracking. + * + * Usage: + * - Allocate on the heap using InstrQueryAlloc (required for abort handling) + * - Call InstrQueryStart before the activity you want to measure + * - Call InstrQueryStop / InstrQueryStopFinalize afterwards to capture totals + * + * InstrQueryStart/InstrQueryStop may be called multiple times. The last stop + * call must be to InstrQueryStopFinalize to ensure parent stack entries get + * the accumulated totals. + * + * Uses resource owner mechanism for handling aborts, as such, the caller + * *must* not exit out of the top level transaction after having called + * InstrQueryStart, without first calling InstrQueryStop or + * InstrQueryStopFinalize. In the case of a transaction abort, logic equivalent + * to InstrQueryStopFinalize will be called automatically. 
+ */ +struct ResourceOwnerData; +typedef struct QueryInstrumentation +{ + Instrumentation instr; + + /* Original instrument_options flags used to create this instrumentation */ + int instrument_options; + + /* Resource owner used for cleanup for aborts between InstrStart/InstrStop */ + struct ResourceOwnerData *owner; + + /* + * Dedicated memory context for all instrumentation allocations belonging + * to this query (node instrumentation, trigger instrumentation, etc.). + * Initially a child of TopMemoryContext so it survives transaction abort + * for ResourceOwner cleanup, which is then reassigned to the current + * memory context on InstrQueryStopFinalize. + */ + MemoryContext instr_cxt; + + /* + * Child entries that need to be cleaned up on abort, since they are not + * registered as a resource owner themselves. Contains both node and + * trigger instrumentation entries linked via instr.unfinalized_entry. + */ + dlist_head unfinalized_entries; +} QueryInstrumentation; + /* * Specialized instrumentation for per-node execution statistics + * + * Relies on an outer QueryInstrumentation having been set up to handle the + * stack used for WAL/buffer usage statistics, and relies on it for managing + * aborts. Solely intended for the executor and anyone reporting about its + * activities (e.g. EXPLAIN ANALYZE). */ typedef struct NodeInstrumentation { @@ -111,6 +175,10 @@ typedef struct NodeInstrumentation double nfiltered2; /* # of tuples removed by "other" quals */ } NodeInstrumentation; +/* + * Care must be taken with any pointers contained within this struct, as this + * gets copied across processes during parallel query execution. + */ typedef struct WorkerNodeInstrumentation { int num_workers; /* # of structures that follow */ @@ -124,16 +192,104 @@ typedef struct TriggerInstrumentation * was fired */ } TriggerInstrumentation; -extern PGDLLIMPORT BufferUsage pgBufferUsage; +/* + * Dynamic array-based stack for tracking current WAL/buffer usage context. 
+ * + * When the stack is empty, 'current' points to instr_top which accumulates + * session-level totals. + */ +typedef struct InstrStackState +{ + int stack_space; /* allocated capacity of entries array */ + int stack_size; /* current number of entries */ + + Instrumentation **entries; /* dynamic array of pointers */ + Instrumentation *current; /* top of stack, or &instr_top when empty */ +} InstrStackState; + extern PGDLLIMPORT WalUsage pgWalUsage; -extern Instrumentation *InstrAlloc(int instrument_options); +/* + * The top instrumentation represents a running total of the current backend + * WAL/buffer usage information. This will not be updated immediately, but + * rather when the current stack entry gets accumulated which typically happens + * at query end. + * + * Care must be taken when utilizing this in the parallel worker context: + * Parallel workers will report back their instrumentation to the caller, + * and this gets added to the caller's stack. If this were to be used in the + * shared memory stats infrastructure it would need to be skipped on parallel + * workers to avoid double counting. + */ +extern PGDLLIMPORT Instrumentation instr_top; + +/* + * The instrumentation stack state. The 'current' field points to the + * currently active stack entry that is getting updated as activity happens, + * and will be accumulated to parent stacks when it gets finalized by + * InstrStop (for non-executor use cases), ExecFinalizeNodeInstrumentation + * (executor finish) or ResOwnerReleaseInstrumentation on abort. + */ +extern PGDLLIMPORT InstrStackState instr_stack; + +extern void InstrStackGrow(void); + +/* + * Pushes the stack so that all WAL/buffer usage updates go to the passed in + * instrumentation entry. + * + * See note on InstrPopStack regarding safe use of these functions. 
+ */ +static inline void +InstrPushStack(Instrumentation *instr) +{ + if (unlikely(instr_stack.stack_size == instr_stack.stack_space)) + InstrStackGrow(); + + instr_stack.entries[instr_stack.stack_size++] = instr; + instr_stack.current = instr; + instr->on_stack = true; +} + +/* + * Pops the stack entry back to the previous one that was effective at + * InstrPushStack. + * + * Callers must ensure that no intermediate stack entries are skipped, to + * handle aborts correctly. If you're thinking of calling this in a PG_FINALLY + * block, consider instead using InstrStart + InstrStopFinalize which can skip + * intermediate stack entries. + */ +static inline void +InstrPopStack(Instrumentation *instr) +{ + Assert(instr_stack.stack_size > 0); + Assert(instr_stack.entries[instr_stack.stack_size - 1] == instr); + instr_stack.stack_size--; + instr_stack.current = instr_stack.stack_size > 0 + ? instr_stack.entries[instr_stack.stack_size - 1] + : &instr_top; + instr->on_stack = false; +} + extern void InstrInitOptions(Instrumentation *instr, int instrument_options); extern void InstrStart(Instrumentation *instr); extern void InstrStop(Instrumentation *instr); +extern void InstrStopFinalize(Instrumentation *instr); +extern void InstrFinalizeChild(Instrumentation *instr, Instrumentation *parent); +extern void InstrAccumStack(Instrumentation *dst, Instrumentation *add); -extern NodeInstrumentation *InstrAllocNode(int instrument_options, - bool async_mode); +extern QueryInstrumentation *InstrQueryAlloc(int instrument_options); +extern void InstrQueryStart(QueryInstrumentation *instr); +extern void InstrQueryStop(QueryInstrumentation *instr); +extern void InstrQueryStopFinalize(QueryInstrumentation *instr); +extern void InstrQueryRememberChild(QueryInstrumentation *parent, Instrumentation *instr); + +pg_nodiscard extern QueryInstrumentation *InstrStartParallelQuery(void); +extern void InstrEndParallelQuery(QueryInstrumentation *qinstr, BufferUsage *bufusage, WalUsage *walusage); 
+extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); + +extern NodeInstrumentation *InstrAllocNode(QueryInstrumentation *qinstr, bool async_mode); extern void InstrInitNode(NodeInstrumentation *instr, int instrument_options); extern void InstrStartNode(NodeInstrumentation *instr); extern void InstrStopNode(NodeInstrumentation *instr, double nTuples); @@ -141,35 +297,36 @@ extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples); extern void InstrEndLoop(NodeInstrumentation *instr); extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add); -extern TriggerInstrumentation *InstrAllocTrigger(int n, int instrument_options); -extern void InstrStartTrigger(TriggerInstrumentation *tginstr); +extern TriggerInstrumentation *InstrAllocTrigger(QueryInstrumentation *qinstr, int n); +extern void InstrStartTrigger(QueryInstrumentation *qinstr, + TriggerInstrumentation *tginstr); extern void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings); -extern void InstrStartParallelQuery(void); -extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage); -extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); -extern void BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, const BufferUsage *sub); +extern void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); +extern void WalUsageAdd(WalUsage *dst, const WalUsage *add); extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub); #define INSTR_BUFUSAGE_INCR(fld) do { \ - pgBufferUsage.fld++; \ + instr_stack.current->bufusage.fld++; \ } while(0) #define INSTR_BUFUSAGE_ADD(fld,val) do { \ - pgBufferUsage.fld += (val); \ + instr_stack.current->bufusage.fld += (val); \ } while(0) #define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ - INSTR_TIME_ADD(pgBufferUsage.fld, val); \ + INSTR_TIME_ADD(instr_stack.current->bufusage.fld, val); \ } while (0) #define 
INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \ + INSTR_TIME_ACCUM_DIFF(instr_stack.current->bufusage.fld, endval, startval); \ } while (0) + #define INSTR_WALUSAGE_INCR(fld) do { \ pgWalUsage.fld++; \ + instr_stack.current->walusage.fld++; \ } while(0) #define INSTR_WALUSAGE_ADD(fld,val) do { \ pgWalUsage.fld += (val); \ + instr_stack.current->walusage.fld += (val); \ } while(0) #endif /* INSTRUMENT_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 3ecae7552fc71..b28288aa1e8db 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -54,6 +54,7 @@ typedef struct Instrumentation Instrumentation; typedef struct pairingheap pairingheap; typedef struct PlanState PlanState; typedef struct QueryEnvironment QueryEnvironment; +typedef struct QueryInstrumentation QueryInstrumentation; typedef struct RelationData *Relation; typedef Relation *RelationPtr; typedef struct ScanKeyData ScanKeyData; @@ -753,7 +754,7 @@ typedef struct EState * ExecutorRun() calls. 
*/ int es_top_eflags; /* eflags passed to ExecutorStart */ - int es_instrument; /* OR of InstrumentOption flags */ + QueryInstrumentation *es_instrument; /* query-level instrumentation */ bool es_finished; /* true when ExecutorFinish is done */ List *es_exprcontexts; /* List of ExprContexts within EState */ diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h index eb6033b4fdb65..5463bc921f06e 100644 --- a/src/include/utils/resowner.h +++ b/src/include/utils/resowner.h @@ -75,6 +75,7 @@ typedef uint32 ResourceReleasePriority; #define RELEASE_PRIO_SNAPSHOT_REFS 500 #define RELEASE_PRIO_FILES 600 #define RELEASE_PRIO_WAITEVENTSETS 700 +#define RELEASE_PRIO_INSTRUMENTATION 800 /* 0 is considered invalid */ #define RELEASE_PRIO_FIRST 1 diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 449acca8dc1a4..7393926e34d97 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1355,6 +1355,7 @@ InjectionPointSharedState InjectionPointsCtl InlineCodeBlock InsertStmt +InstrStackState Instrumentation Int128AggState Int8TransTypeData @@ -2477,6 +2478,7 @@ QueryCompletion QueryDesc QueryEnvironment QueryInfo +QueryInstrumentation QueryItem QueryItemType QueryMode From 9365bbd06e42fc452d400b870e2334ccc32ade8e Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sun, 15 Mar 2026 21:44:58 -0700 Subject: [PATCH 20/23] instrumentation: Use Instrumentation struct for parallel workers This simplifies the DSM allocations a bit since we don't need to separately allocate WAL and buffer usage, and allows the easier future addition of a third stack-based struct being discussed. 
Author: Lukas Fittl Reviewed-by: Discussion: --- src/backend/access/brin/brin.c | 43 ++++++----------- src/backend/access/gin/gininsert.c | 43 ++++++----------- src/backend/access/nbtree/nbtsort.c | 43 ++++++----------- src/backend/commands/vacuumparallel.c | 52 ++++++++------------- src/backend/executor/execParallel.c | 66 ++++++++++++--------------- src/backend/executor/instrument.c | 14 +++--- src/include/executor/execParallel.h | 5 +- src/include/executor/instrument.h | 4 +- 8 files changed, 99 insertions(+), 171 deletions(-) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 3a5176c76c765..9e545b4ef0e66 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -51,8 +51,7 @@ #define PARALLEL_KEY_BRIN_SHARED UINT64CONST(0xB000000000000001) #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002) #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003) -#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004) -#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005) +#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xB000000000000004) /* * Status for index builds performed in parallel. 
This is allocated in a @@ -148,8 +147,7 @@ typedef struct BrinLeader BrinShared *brinshared; Sharedsort *sharedsort; Snapshot snapshot; - WalUsage *walusage; - BufferUsage *bufferusage; + Instrumentation *instr; } BrinLeader; /* @@ -2387,8 +2385,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, BrinShared *brinshared; Sharedsort *sharedsort; BrinLeader *brinleader = palloc0_object(BrinLeader); - WalUsage *walusage; - BufferUsage *bufferusage; + Instrumentation *instr; bool leaderparticipates = true; int querylen; @@ -2430,18 +2427,14 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, shm_toc_estimate_keys(&pcxt->estimator, 2); /* - * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE - * and PARALLEL_KEY_BUFFER_USAGE. + * Estimate space for Instrumentation -- PARALLEL_KEY_INSTRUMENTATION. * * If there are no extensions loaded that care, we could skip this. We * have no way of knowing whether anyone's looking at instrumentation, so * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_estimate_keys(&pcxt->estimator, 1); - shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); + mul_size(sizeof(Instrumentation), pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */ @@ -2514,15 +2507,12 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, } /* - * Allocate space for each worker's WalUsage and BufferUsage; no need to + * Allocate space for each worker's Instrumentation; no need to * initialize. 
*/ - walusage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage); - bufferusage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage); + instr = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(Instrumentation), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr); /* Launch workers, saving status for leader/caller */ LaunchParallelWorkers(pcxt); @@ -2533,8 +2523,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, brinleader->brinshared = brinshared; brinleader->sharedsort = sharedsort; brinleader->snapshot = snapshot; - brinleader->walusage = walusage; - brinleader->bufferusage = bufferusage; + brinleader->instr = instr; /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) @@ -2573,7 +2562,7 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) * or we might get incomplete data.) 
*/ for (i = 0; i < brinleader->pcxt->nworkers_launched; i++) - InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]); + InstrAccumParallelQuery(&brinleader->instr[i]); /* Free last reference to MVCC snapshot, if one was used */ if (IsMVCCSnapshot(brinleader->snapshot)) @@ -2888,8 +2877,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) LOCKMODE heapLockmode; LOCKMODE indexLockmode; QueryInstrumentation *instr; - WalUsage *walusage; - BufferUsage *bufferusage; + Instrumentation *worker_instr; int sortmem; /* @@ -2950,11 +2938,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) heapRel, indexRel, sortmem, false); /* Report WAL/buffer usage during parallel execution */ - bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); - walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(instr, - &bufferusage[ParallelWorkerNumber], - &walusage[ParallelWorkerNumber]); + worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false); + InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); table_close(heapRel, heapLockmode); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 0d80f72a0b085..f3de62ce7f339 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -45,8 +45,7 @@ #define PARALLEL_KEY_GIN_SHARED UINT64CONST(0xB000000000000001) #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002) #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003) -#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004) -#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005) +#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xB000000000000004) /* * Status for index builds performed in parallel. 
This is allocated in a @@ -138,8 +137,7 @@ typedef struct GinLeader GinBuildShared *ginshared; Sharedsort *sharedsort; Snapshot snapshot; - WalUsage *walusage; - BufferUsage *bufferusage; + Instrumentation *instr; } GinLeader; typedef struct @@ -945,8 +943,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, GinBuildShared *ginshared; Sharedsort *sharedsort; GinLeader *ginleader = palloc0_object(GinLeader); - WalUsage *walusage; - BufferUsage *bufferusage; + Instrumentation *instr; bool leaderparticipates = true; int querylen; @@ -987,18 +984,14 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, shm_toc_estimate_keys(&pcxt->estimator, 2); /* - * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE - * and PARALLEL_KEY_BUFFER_USAGE. + * Estimate space for Instrumentation -- PARALLEL_KEY_INSTRUMENTATION. * * If there are no extensions loaded that care, we could skip this. We * have no way of knowing whether anyone's looking at instrumentation, so * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_estimate_keys(&pcxt->estimator, 1); - shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); + mul_size(sizeof(Instrumentation), pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */ @@ -1066,15 +1059,12 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, } /* - * Allocate space for each worker's WalUsage and BufferUsage; no need to + * Allocate space for each worker's Instrumentation; no need to * initialize. 
*/ - walusage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage); - bufferusage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage); + instr = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(Instrumentation), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr); /* Launch workers, saving status for leader/caller */ LaunchParallelWorkers(pcxt); @@ -1085,8 +1075,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, ginleader->ginshared = ginshared; ginleader->sharedsort = sharedsort; ginleader->snapshot = snapshot; - ginleader->walusage = walusage; - ginleader->bufferusage = bufferusage; + ginleader->instr = instr; /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) @@ -1125,7 +1114,7 @@ _gin_end_parallel(GinLeader *ginleader, GinBuildState *state) * or we might get incomplete data.) 
*/ for (i = 0; i < ginleader->pcxt->nworkers_launched; i++) - InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]); + InstrAccumParallelQuery(&ginleader->instr[i]); /* Free last reference to MVCC snapshot, if one was used */ if (IsMVCCSnapshot(ginleader->snapshot)) @@ -2119,8 +2108,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) LOCKMODE heapLockmode; LOCKMODE indexLockmode; QueryInstrumentation *instr; - WalUsage *walusage; - BufferUsage *bufferusage; + Instrumentation *worker_instr; int sortmem; /* @@ -2200,11 +2188,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) heapRel, indexRel, sortmem, false); /* Report WAL/buffer usage during parallel execution */ - bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); - walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(instr, - &bufferusage[ParallelWorkerNumber], - &walusage[ParallelWorkerNumber]); + worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false); + InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); table_close(heapRel, heapLockmode); diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 2d7b7cef91202..cb238f862a7c8 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -66,8 +66,7 @@ #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002) #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003) #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004) -#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005) -#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006) +#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xA000000000000005) /* * DISABLE_LEADER_PARTICIPATION disables the leader's participation in @@ -195,8 +194,7 @@ typedef struct BTLeader Sharedsort *sharedsort; Sharedsort *sharedsort2; Snapshot snapshot; - WalUsage 
*walusage; - BufferUsage *bufferusage; + Instrumentation *instr; } BTLeader; /* @@ -1408,8 +1406,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) Sharedsort *sharedsort2; BTSpool *btspool = buildstate->spool; BTLeader *btleader = palloc0_object(BTLeader); - WalUsage *walusage; - BufferUsage *bufferusage; + Instrumentation *instr; bool leaderparticipates = true; int querylen; @@ -1462,18 +1459,14 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) } /* - * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE - * and PARALLEL_KEY_BUFFER_USAGE. + * Estimate space for Instrumentation -- PARALLEL_KEY_INSTRUMENTATION. * * If there are no extensions loaded that care, we could skip this. We * have no way of knowing whether anyone's looking at instrumentation, so * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_estimate_keys(&pcxt->estimator, 1); - shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); + mul_size(sizeof(Instrumentation), pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */ @@ -1560,15 +1553,12 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) } /* - * Allocate space for each worker's WalUsage and BufferUsage; no need to + * Allocate space for each worker's Instrumentation; no need to * initialize. 
*/ - walusage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage); - bufferusage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage); + instr = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(Instrumentation), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr); /* Launch workers, saving status for leader/caller */ LaunchParallelWorkers(pcxt); @@ -1580,8 +1570,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btleader->sharedsort = sharedsort; btleader->sharedsort2 = sharedsort2; btleader->snapshot = snapshot; - btleader->walusage = walusage; - btleader->bufferusage = bufferusage; + btleader->instr = instr; /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) @@ -1620,7 +1609,7 @@ _bt_end_parallel(BTLeader *btleader) * or we might get incomplete data.) 
*/ for (i = 0; i < btleader->pcxt->nworkers_launched; i++) - InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]); + InstrAccumParallelQuery(&btleader->instr[i]); /* Free last reference to MVCC snapshot, if one was used */ if (IsMVCCSnapshot(btleader->snapshot)) @@ -1754,8 +1743,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) LOCKMODE heapLockmode; LOCKMODE indexLockmode; QueryInstrumentation *instr; - WalUsage *walusage; - BufferUsage *bufferusage; + Instrumentation *worker_instr; int sortmem; #ifdef BTREE_BUILD_STATS @@ -1837,11 +1825,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) sharedsort2, sortmem, false); /* Report WAL/buffer usage during parallel execution */ - bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); - walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(instr, - &bufferusage[ParallelWorkerNumber], - &walusage[ParallelWorkerNumber]); + worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false); + InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]); #ifdef BTREE_BUILD_STATS if (log_btree_build_stats) diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index c330c891c03e3..b5fed54fb85c3 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -47,9 +47,8 @@ */ #define PARALLEL_VACUUM_KEY_SHARED 1 #define PARALLEL_VACUUM_KEY_QUERY_TEXT 2 -#define PARALLEL_VACUUM_KEY_BUFFER_USAGE 3 -#define PARALLEL_VACUUM_KEY_WAL_USAGE 4 -#define PARALLEL_VACUUM_KEY_INDEX_STATS 5 +#define PARALLEL_VACUUM_KEY_INSTRUMENTATION 3 +#define PARALLEL_VACUUM_KEY_INDEX_STATS 4 /* * Shared information among parallel workers. 
So this is allocated in the DSM @@ -188,11 +187,8 @@ struct ParallelVacuumState /* Shared dead items space among parallel vacuum workers */ TidStore *dead_items; - /* Points to buffer usage area in DSM */ - BufferUsage *buffer_usage; - - /* Points to WAL usage area in DSM */ - WalUsage *wal_usage; + /* Points to instrumentation area in DSM */ + Instrumentation *instr; /* * False if the index is totally unsuitable target for all parallel @@ -250,8 +246,7 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, PVShared *shared; TidStore *dead_items; PVIndStats *indstats; - BufferUsage *buffer_usage; - WalUsage *wal_usage; + Instrumentation *instr; bool *will_parallel_vacuum; Size est_indstats_len; Size est_shared_len; @@ -304,18 +299,15 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, shm_toc_estimate_keys(&pcxt->estimator, 1); /* - * Estimate space for BufferUsage and WalUsage -- - * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE. + * Estimate space for Instrumentation -- + * PARALLEL_VACUUM_KEY_INSTRUMENTATION. * * If there are no extensions loaded that care, we could skip this. We * have no way of knowing whether anyone's looking at instrumentation, so * do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); - shm_toc_estimate_keys(&pcxt->estimator, 1); - shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(WalUsage), pcxt->nworkers)); + mul_size(sizeof(Instrumentation), pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); /* Finally, estimate PARALLEL_VACUUM_KEY_QUERY_TEXT space */ @@ -396,17 +388,13 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, pvs->shared = shared; /* - * Allocate space for each worker's BufferUsage and WalUsage; no need to - * initialize + * Allocate space for each worker's Instrumentation; no need to + * initialize. 
*/ - buffer_usage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, buffer_usage); - pvs->buffer_usage = buffer_usage; - wal_usage = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_WAL_USAGE, wal_usage); - pvs->wal_usage = wal_usage; + instr = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(Instrumentation), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_INSTRUMENTATION, instr); + pvs->instr = instr; /* Store query string for workers */ if (debug_query_string) @@ -749,7 +737,7 @@ parallel_vacuum_process_all_indexes(ParallelVacuumState *pvs, int num_index_scan WaitForParallelWorkersToFinish(pvs->pcxt); for (int i = 0; i < pvs->pcxt->nworkers_launched; i++) - InstrAccumParallelQuery(&pvs->buffer_usage[i], &pvs->wal_usage[i]); + InstrAccumParallelQuery(&pvs->instr[i]); } /* @@ -1007,8 +995,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) PVShared *shared; TidStore *dead_items; QueryInstrumentation *instr; - BufferUsage *buffer_usage; - WalUsage *wal_usage; + Instrumentation *worker_instr; int nindexes; char *sharedquery; ErrorContextCallback errcallback; @@ -1102,11 +1089,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) parallel_vacuum_process_safe_indexes(&pvs); /* Report buffer/WAL usage during parallel execution */ - buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false); - wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false); - InstrEndParallelQuery(instr, - &buffer_usage[ParallelWorkerNumber], - &wal_usage[ParallelWorkerNumber]); + worker_instr = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_INSTRUMENTATION, false); + InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]); /* Report any remaining cost-based vacuum delay time */ if (track_cost_delay_timing) diff --git a/src/backend/executor/execParallel.c 
b/src/backend/executor/execParallel.c index c01e780f918e1..2e57136edfd9f 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -60,13 +60,12 @@ #define PARALLEL_KEY_EXECUTOR_FIXED UINT64CONST(0xE000000000000001) #define PARALLEL_KEY_PLANNEDSTMT UINT64CONST(0xE000000000000002) #define PARALLEL_KEY_PARAMLISTINFO UINT64CONST(0xE000000000000003) -#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xE000000000000004) +#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xE000000000000004) #define PARALLEL_KEY_TUPLE_QUEUE UINT64CONST(0xE000000000000005) -#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xE000000000000006) +#define PARALLEL_KEY_NODE_INSTRUMENTATION UINT64CONST(0xE000000000000006) #define PARALLEL_KEY_DSA UINT64CONST(0xE000000000000007) #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xE000000000000008) #define PARALLEL_KEY_JIT_INSTRUMENTATION UINT64CONST(0xE000000000000009) -#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xE00000000000000A) #define PARALLEL_TUPLE_QUEUE_SIZE 65536 @@ -631,8 +630,6 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, char *pstmt_data; char *pstmt_space; char *paramlistinfo_space; - BufferUsage *bufusage_space; - WalUsage *walusage_space; SharedExecutorInstrumentation *instrumentation = NULL; SharedJitInstrumentation *jit_instrumentation = NULL; int pstmt_len; @@ -696,21 +693,14 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, shm_toc_estimate_keys(&pcxt->estimator, 1); /* - * Estimate space for BufferUsage. + * Estimate space for Instrumentation. * * If EXPLAIN is not in use and there are no extensions loaded that care, * we could skip this. But we have no way of knowing whether anyone's * looking at instrumentation, so do it unconditionally. */ shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); - shm_toc_estimate_keys(&pcxt->estimator, 1); - - /* - * Same thing for WalUsage. 
- */ - shm_toc_estimate_chunk(&pcxt->estimator, - mul_size(sizeof(WalUsage), pcxt->nworkers)); + mul_size(sizeof(Instrumentation), pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); /* Estimate space for tuple queues. */ @@ -796,17 +786,18 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, shm_toc_insert(pcxt->toc, PARALLEL_KEY_PARAMLISTINFO, paramlistinfo_space); SerializeParamList(estate->es_param_list_info, ¶mlistinfo_space); - /* Allocate space for each worker's BufferUsage; no need to initialize. */ - bufusage_space = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(BufferUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufusage_space); - pei->buffer_usage = bufusage_space; + /* + * Allocate space for each worker's Instrumentation; no need to + * initialize. + */ + { + Instrumentation *instr; - /* Same for WalUsage. */ - walusage_space = shm_toc_allocate(pcxt->toc, - mul_size(sizeof(WalUsage), pcxt->nworkers)); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage_space); - pei->wal_usage = walusage_space; + instr = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(Instrumentation), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr); + pei->instrumentation = instr; + } /* Set up the tuple queues that the workers will write into. 
*/ pei->tqueue = ExecParallelSetupTupleQueues(pcxt, false); @@ -832,9 +823,9 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, instrument = GetInstrumentationArray(instrumentation); for (i = 0; i < nworkers * e.nnodes; ++i) InstrInitNode(&instrument[i], estate->es_instrument->instrument_options); - shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, + shm_toc_insert(pcxt->toc, PARALLEL_KEY_NODE_INSTRUMENTATION, instrumentation); - pei->instrumentation = instrumentation; + pei->node_instrumentation = instrumentation; if (estate->es_jit_flags != PGJIT_NONE) { @@ -1236,7 +1227,7 @@ ExecParallelFinish(ParallelExecutorInfo *pei) * finish, or we might get incomplete data.) */ for (i = 0; i < nworkers; i++) - InstrAccumParallelQuery(&pei->buffer_usage[i], &pei->wal_usage[i]); + InstrAccumParallelQuery(&pei->instrumentation[i]); pei->finished = true; } @@ -1250,11 +1241,11 @@ ExecParallelFinish(ParallelExecutorInfo *pei) void ExecParallelCleanup(ParallelExecutorInfo *pei) { - /* Accumulate instrumentation, if any. */ - if (pei->instrumentation) + /* Accumulate node instrumentation, if any. */ + if (pei->node_instrumentation) { ExecParallelRetrieveInstrumentation(pei->planstate, - pei->instrumentation); + pei->node_instrumentation); ExecFinalizeWorkerInstrumentation(pei->planstate); } @@ -1481,8 +1472,6 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) { FixedParallelExecutorState *fpes; QueryInstrumentation *instr; - BufferUsage *buffer_usage; - WalUsage *wal_usage; DestReceiver *receiver; QueryDesc *queryDesc; SharedExecutorInstrumentation *instrumentation; @@ -1497,7 +1486,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) /* Set up DestReceiver, SharedExecutorInstrumentation, and QueryDesc. 
*/ receiver = ExecParallelGetReceiver(seg, toc); - instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, true); + instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_NODE_INSTRUMENTATION, true); if (instrumentation != NULL) instrument_options = instrumentation->instrument_options; jit_instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_JIT_INSTRUMENTATION, @@ -1555,11 +1544,12 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) ExecutorFinish(queryDesc); /* Report buffer/WAL usage during parallel execution. */ - buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); - wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(instr, - &buffer_usage[ParallelWorkerNumber], - &wal_usage[ParallelWorkerNumber]); + { + Instrumentation *worker_instr; + + worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false); + InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]); + } /* Report instrumentation data if any instrumentation options are set. 
*/ if (instrumentation != NULL) diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index f9202b558d656..af64aa145ebea 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -339,11 +339,12 @@ InstrStartParallelQuery(void) /* report usage after parallel executor shutdown */ void -InstrEndParallelQuery(QueryInstrumentation *qinstr, BufferUsage *bufusage, WalUsage *walusage) +InstrEndParallelQuery(QueryInstrumentation *qinstr, Instrumentation *dst) { InstrQueryStopFinalize(qinstr); - memcpy(bufusage, &qinstr->instr.bufusage, sizeof(BufferUsage)); - memcpy(walusage, &qinstr->instr.walusage, sizeof(WalUsage)); + dst->need_stack = qinstr->instr.need_stack; + memcpy(&dst->bufusage, &qinstr->instr.bufusage, sizeof(BufferUsage)); + memcpy(&dst->walusage, &qinstr->instr.walusage, sizeof(WalUsage)); } /* @@ -359,12 +360,11 @@ InstrEndParallelQuery(QueryInstrumentation *qinstr, BufferUsage *bufusage, WalUs * activity is accumulated. 
*/ void -InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +InstrAccumParallelQuery(Instrumentation *instr) { - BufferUsageAdd(&instr_stack.current->bufusage, bufusage); - WalUsageAdd(&instr_stack.current->walusage, walusage); + InstrAccumStack(instr_stack.current, instr); - WalUsageAdd(&pgWalUsage, walusage); + WalUsageAdd(&pgWalUsage, &instr->walusage); } /* Node instrumentation handling */ diff --git a/src/include/executor/execParallel.h b/src/include/executor/execParallel.h index 5a2034811d563..6c8b602d07f98 100644 --- a/src/include/executor/execParallel.h +++ b/src/include/executor/execParallel.h @@ -25,9 +25,8 @@ typedef struct ParallelExecutorInfo { PlanState *planstate; /* plan subtree we're running in parallel */ ParallelContext *pcxt; /* parallel context we're using */ - BufferUsage *buffer_usage; /* points to bufusage area in DSM */ - WalUsage *wal_usage; /* walusage area in DSM */ - SharedExecutorInstrumentation *instrumentation; /* optional */ + Instrumentation *instrumentation; /* instrumentation area in DSM */ + SharedExecutorInstrumentation *node_instrumentation; /* optional */ struct SharedJitInstrumentation *jit_instrumentation; /* optional */ dsa_area *area; /* points to DSA area in DSM */ dsa_pointer param_exec; /* serialized PARAM_EXEC parameters */ diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index d2f0191af27a4..b62619412a0ad 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -286,8 +286,8 @@ extern void InstrQueryStopFinalize(QueryInstrumentation *instr); extern void InstrQueryRememberChild(QueryInstrumentation *parent, Instrumentation *instr); pg_nodiscard extern QueryInstrumentation *InstrStartParallelQuery(void); -extern void InstrEndParallelQuery(QueryInstrumentation *qinstr, BufferUsage *bufusage, WalUsage *walusage); -extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); +extern void 
InstrEndParallelQuery(QueryInstrumentation *qinstr, Instrumentation *dst); +extern void InstrAccumParallelQuery(Instrumentation *instr); extern NodeInstrumentation *InstrAllocNode(QueryInstrumentation *qinstr, bool async_mode); extern void InstrInitNode(NodeInstrumentation *instr, int instrument_options); From d51ed5a5ebfe83116a4a740ba3b9d3f49687f226 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sat, 7 Mar 2026 17:52:24 -0800 Subject: [PATCH 21/23] instrumentation: Optimize ExecProcNodeInstr instructions by inlining For most queries, the bulk of the overhead of EXPLAIN ANALYZE happens in ExecProcNodeInstr when starting/stopping instrumentation for that node. Previously each ExecProcNodeInstr would check which instrumentation options are active in the InstrStartNode/InstrStopNode calls, and do the corresponding work (timers, instrumentation stack, etc.). These conditionals being checked for each tuple being emitted add up, and cause a non-optimal set of instructions to be generated by the compiler. Because we already have an existing mechanism to specify a function pointer when instrumentation is enabled, we can instead create specialized functions that are tailored to the instrumentation options enabled, and avoid conditionals on subsequent ExecProcNodeInstr calls. This results in the overhead for EXPLAIN (ANALYZE, TIMING OFF, BUFFERS OFF) for a stress test with a large COUNT(*) that does many ExecProcNode calls dropping from ~ 20% on top of actual runtime to ~ 3%. When using BUFFERS ON the same query goes from ~ 20% to ~ 10% on top of actual runtime. 
Author: Lukas Fittl Reviewed-by: Zsolt Parragi Discussion: https://www.postgresql.org/message-id/flat/CAP53PkxFP7i7-wy98ZmEJ11edYq-RrPvJoa4kzGhBBjERA4Nyw%40mail.gmail.com#e8dfd018a07d7f8d41565a079d40c564 --- src/backend/executor/execProcnode.c | 22 +-- src/backend/executor/instrument.c | 199 ++++++++++++++++++++-------- src/include/executor/instrument.h | 5 + 3 files changed, 149 insertions(+), 77 deletions(-) diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 3b3ec9850e8e1..6e8cbaeccf7aa 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -121,7 +121,6 @@ #include "nodes/nodeFuncs.h" static TupleTableSlot *ExecProcNodeFirst(PlanState *node); -static TupleTableSlot *ExecProcNodeInstr(PlanState *node); static bool ExecShutdownNode_walker(PlanState *node, void *context); static bool ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context); static bool ExecFinalizeWorkerInstrumentation_walker(PlanState *node, void *context); @@ -465,7 +464,7 @@ ExecProcNodeFirst(PlanState *node) * have ExecProcNode() directly call the relevant function from now on. */ if (node->instrument) - node->ExecProcNode = ExecProcNodeInstr; + node->ExecProcNode = InstrNodeSetupExecProcNode(node->instrument); else node->ExecProcNode = node->ExecProcNodeReal; @@ -473,25 +472,6 @@ ExecProcNodeFirst(PlanState *node) } -/* - * ExecProcNode wrapper that performs instrumentation calls. By keeping - * this a separate function, we avoid overhead in the normal case where - * no instrumentation is wanted. - */ -static TupleTableSlot * -ExecProcNodeInstr(PlanState *node) -{ - TupleTableSlot *result; - - InstrStartNode(node->instrument); - - result = node->ExecProcNodeReal(node); - - InstrStopNode(node->instrument, TupIsNull(result) ? 
0.0 : 1.0); - - return result; -} - /* ---------------------------------------------------------------- * MultiExecProcNode diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index af64aa145ebea..3183f00d6930a 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -66,29 +66,20 @@ InstrInitOptions(Instrumentation *instr, int instrument_options) instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; } -void -InstrStart(Instrumentation *instr) +static inline void +InstrStartTimer(Instrumentation *instr) { - if (instr->need_timer) - { - if (!INSTR_TIME_IS_ZERO(instr->starttime)) - elog(ERROR, "InstrStart called twice in a row"); - else - INSTR_TIME_SET_CURRENT(instr->starttime); - } + Assert(INSTR_TIME_IS_ZERO(instr->starttime)); - if (instr->need_stack) - InstrPushStack(instr); + INSTR_TIME_SET_CURRENT(instr->starttime); } -static void +static inline void InstrStopTimer(Instrumentation *instr) { instr_time endtime; - /* let's update the time only if the timer was requested */ - if (INSTR_TIME_IS_ZERO(instr->starttime)) - elog(ERROR, "InstrStop called without start"); + Assert(!INSTR_TIME_IS_ZERO(instr->starttime)); INSTR_TIME_SET_CURRENT(endtime); INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime); @@ -96,6 +87,16 @@ InstrStopTimer(Instrumentation *instr) INSTR_TIME_SET_ZERO(instr->starttime); } +void +InstrStart(Instrumentation *instr) +{ + if (instr->need_timer) + InstrStartTimer(instr); + + if (instr->need_stack) + InstrPushStack(instr); +} + void InstrStop(Instrumentation *instr) { @@ -391,65 +392,57 @@ InstrInitNode(NodeInstrumentation *instr, int instrument_options) InstrInitOptions(&instr->instr, instrument_options); } -/* Entry to a plan node */ +/* Entry to a plan node. If you modify this, check InstrNodeSetupExecProcNode. 
*/ void InstrStartNode(NodeInstrumentation *instr) { InstrStart(&instr->instr); } - -/* Exit from a plan node */ -void -InstrStopNode(NodeInstrumentation *instr, double nTuples) +/* + * Updates the node instrumentation time counter. + * + * Note this is different from InstrStop because total is only updated in + * InstrEndLoop. We need the separate counter variable because we need to + * calculate start-up time for the first tuple in each cycle, and then + * accumulate it together. + */ +static inline void +InstrStopNodeTimer(NodeInstrumentation *instr) { - double save_tuplecount = instr->tuplecount; instr_time endtime; - /* count the returned tuples */ - instr->tuplecount += nTuples; + Assert(!INSTR_TIME_IS_ZERO(instr->instr.starttime)); + + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->instr.starttime); + INSTR_TIME_SET_ZERO(instr->instr.starttime); /* - * Update the time only if the timer was requested. + * Is this the first tuple of this cycle? * - * Note this is different from InstrStop because total is only updated in - * InstrEndLoop. We need the separate counter variable because we need to - * calculate start-up time for the first tuple in each cycle, and then - * accumulate it together. + * In async mode, if the plan node hadn't emitted any tuples before, this + * might be the first tuple */ - if (instr->instr.need_timer) - { - if (INSTR_TIME_IS_ZERO(instr->instr.starttime)) - elog(ERROR, "InstrStopNode called without start"); - - INSTR_TIME_SET_CURRENT(endtime); - INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->instr.starttime); + if (!instr->running || (instr->async_mode && instr->tuplecount < 1.0)) + instr->firsttuple = instr->counter; +} - INSTR_TIME_SET_ZERO(instr->instr.starttime); - } +/* Exit from a plan node. If you modify this, check InstrNodeSetupExecProcNode. 
*/ +void +InstrStopNode(NodeInstrumentation *instr, double nTuples) +{ + if (instr->instr.need_timer) + InstrStopNodeTimer(instr); - /* - * Only pop the stack, accumulation runs in - * ExecFinalizeNodeInstrumentation - */ + /* Only pop the stack, accumulation runs in InstrFinalizeNode */ if (instr->instr.need_stack) InstrPopStack(&instr->instr); - /* Is this the first tuple of this cycle? */ - if (!instr->running) - { - instr->running = true; - instr->firsttuple = instr->counter; - } - else - { - /* - * In async mode, if the plan node hadn't emitted any tuples before, - * this might be the first tuple - */ - if (instr->async_mode && save_tuplecount < 1.0) - instr->firsttuple = instr->counter; - } + instr->running = true; + + /* count the returned tuples */ + instr->tuplecount += nTuples; } /* Update tuple count */ @@ -507,6 +500,100 @@ InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add) InstrAccumStack(&dst->instr, &add->instr); } +/* + * Specialized handling of instrumented ExecProcNode + * + * These functions are equivalent to running ExecProcNodeReal wrapped in + * InstrStartNode and InstrStopNode, but avoid the conditionals in the hot path + * by checking the instrumentation options when the ExecProcNode pointer gets + * first set, and then using a special-purpose function for each. This results + * in a more optimized set of compiled instructions. + */ + +#include "executor/tuptable.h" +#include "nodes/execnodes.h" + +/* Simplified pop: restore saved state instead of re-deriving from array */ +static inline void +InstrPopStackTo(Instrumentation *prev) +{ + Assert(instr_stack.stack_size > 0); + Assert(instr_stack.stack_size > 1 ? 
instr_stack.entries[instr_stack.stack_size - 2] == prev : &instr_top == prev); + instr_stack.entries[instr_stack.stack_size - 1]->on_stack = false; + instr_stack.stack_size--; + instr_stack.current = prev; +} + +static pg_attribute_always_inline TupleTableSlot * +ExecProcNodeInstr(PlanState *node, bool need_timer, bool need_stack) +{ + NodeInstrumentation *instr = node->instrument; + Instrumentation *prev = instr_stack.current; + TupleTableSlot *result; + + if (need_stack) + InstrPushStack(&instr->instr); + if (need_timer) + InstrStartTimer(&instr->instr); + + result = node->ExecProcNodeReal(node); + + if (need_timer) + InstrStopNodeTimer(instr); + if (need_stack) + InstrPopStackTo(prev); + + instr->running = true; + if (!TupIsNull(result)) + instr->tuplecount += 1.0; + + return result; +} + +static TupleTableSlot * +ExecProcNodeInstrFull(PlanState *node) +{ + return ExecProcNodeInstr(node, true, true); +} + +static TupleTableSlot * +ExecProcNodeInstrRowsStackOnly(PlanState *node) +{ + return ExecProcNodeInstr(node, false, true); +} + +static TupleTableSlot * +ExecProcNodeInstrRowsTimerOnly(PlanState *node) +{ + return ExecProcNodeInstr(node, true, false); +} + +static TupleTableSlot * +ExecProcNodeInstrRowsOnly(PlanState *node) +{ + return ExecProcNodeInstr(node, false, false); +} + +/* + * Returns an ExecProcNode wrapper that performs instrumentation calls, + * tailored to the instrumentation options enabled for the node. 
+ */ +ExecProcNodeMtd +InstrNodeSetupExecProcNode(NodeInstrumentation *instr) +{ + bool need_timer = instr->instr.need_timer; + bool need_stack = instr->instr.need_stack; + + if (need_timer && need_stack) + return ExecProcNodeInstrFull; + else if (need_stack) + return ExecProcNodeInstrRowsStackOnly; + else if (need_timer) + return ExecProcNodeInstrRowsTimerOnly; + else + return ExecProcNodeInstrRowsOnly; +} + /* Trigger instrumentation handling */ TriggerInstrumentation * InstrAllocTrigger(QueryInstrumentation *qinstr, int n) diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index b62619412a0ad..bae8a9b0e62ed 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -297,6 +297,11 @@ extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples); extern void InstrEndLoop(NodeInstrumentation *instr); extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add); +typedef struct TupleTableSlot TupleTableSlot; +typedef struct PlanState PlanState; +typedef TupleTableSlot *(*ExecProcNodeMtd) (PlanState *pstate); +extern ExecProcNodeMtd InstrNodeSetupExecProcNode(NodeInstrumentation *instr); + extern TriggerInstrumentation *InstrAllocTrigger(QueryInstrumentation *qinstr, int n); extern void InstrStartTrigger(QueryInstrumentation *qinstr, TriggerInstrumentation *tginstr); From b57c4118984bd46b848607680afff11b7960f1bd Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sat, 7 Mar 2026 11:46:19 -0800 Subject: [PATCH 22/23] Index scans: Show table buffer accesses separately in EXPLAIN ANALYZE This sets up a separate instrumentation stack that is used whilst an Index Scan or Index Only Scan does scanning on the table, for example due to additional data being needed. EXPLAIN ANALYZE will now show "Table Buffers" that represent such activity. The activity is also included in regular "Buffers" together with index activity and that of any child nodes. 
Author: Lukas Fittl Suggested-by: Andres Freund Reviewed-by: Zsolt Parragi Reviewed-by: Tomas Vondra Discussion: https://www.postgresql.org/message-id/flat/CAP53PkxrmpECzVFpeeEEHDGe6u625s%2BYkmVv5-gw3L_NDSfbiA%40mail.gmail.com#cb583a08e8e096aa1f093bb178906173 --- doc/src/sgml/perform.sgml | 13 ++- doc/src/sgml/ref/explain.sgml | 1 + src/backend/commands/explain.c | 47 ++++++-- src/backend/executor/execProcnode.c | 46 ++++++++ src/backend/executor/nodeBitmapIndexscan.c | 2 +- src/backend/executor/nodeIndexonlyscan.c | 41 ++++++- src/backend/executor/nodeIndexscan.c | 127 +++++++++++++++++---- src/include/executor/instrument_node.h | 5 + 8 files changed, 244 insertions(+), 38 deletions(-) diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 604e8578a8dcd..d28f4f22535b9 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -734,6 +734,7 @@ WHERE t1.unique1 < 10 AND t1.unique2 = t2.unique2; -> Index Scan using tenk2_unique2 on tenk2 t2 (cost=0.29..7.90 rows=1 width=244) (actual time=0.003..0.003 rows=1.00 loops=10) Index Cond: (unique2 = t1.unique2) Index Searches: 10 + Table Buffers: shared hit=10 Buffers: shared hit=24 read=6 Planning: Buffers: shared hit=15 dirtied=9 @@ -1005,7 +1006,8 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; Index Cond: (f1 @> '((0.5,2))'::polygon) Rows Removed by Index Recheck: 1 Index Searches: 1 - Buffers: shared hit=1 + Table Buffers: shared hit=1 + Buffers: shared hit=2 Planning Time: 0.039 ms Execution Time: 0.098 ms @@ -1014,7 +1016,9 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; then rejected by a recheck of the index condition. This happens because a GiST index is lossy for polygon containment tests: it actually returns the rows with polygons that overlap the target, and then we have - to do the exact containment test on those rows. + to do the exact containment test on those rows. 
The Table Buffers + counts indicate how many operations were performed on the table instead of + the index. This number is included in the Buffers counts. @@ -1203,13 +1207,14 @@ EXPLAIN ANALYZE SELECT * FROM tenk1 WHERE unique1 < 100 AND unique2 > 9000 QUERY PLAN -------------------------------------------------------------------&zwsp;------------------------------------------------------------ Limit (cost=0.29..14.33 rows=2 width=244) (actual time=0.051..0.071 rows=2.00 loops=1) - Buffers: shared hit=16 + Buffers: shared hit=14 -> Index Scan using tenk1_unique2 on tenk1 (cost=0.29..70.50 rows=10 width=244) (actual time=0.051..0.070 rows=2.00 loops=1) Index Cond: (unique2 > 9000) Filter: (unique1 < 100) Rows Removed by Filter: 287 Index Searches: 1 - Buffers: shared hit=16 + Table Buffers: shared hit=11 + Buffers: shared hit=14 Planning Time: 0.077 ms Execution Time: 0.086 ms diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml index 5b8b521802ee5..71070736acb98 100644 --- a/doc/src/sgml/ref/explain.sgml +++ b/doc/src/sgml/ref/explain.sgml @@ -509,6 +509,7 @@ EXPLAIN ANALYZE EXECUTE query(100, 200); -> Index Scan using test_pkey on test (cost=0.29..10.27 rows=99 width=8) (actual time=0.009..0.025 rows=99.00 loops=1) Index Cond: ((id > 100) AND (id < 200)) Index Searches: 1 + Table Buffers: shared hit=1 Buffers: shared hit=4 Planning Time: 0.244 ms Execution Time: 0.073 ms diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 9fc39cabdf815..42fc00cbd34db 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -144,7 +144,7 @@ static void show_instrumentation_count(const char *qlabel, int which, static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage); -static void show_buffer_usage(ExplainState *es, const BufferUsage *usage); +static 
void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void show_memory_counters(ExplainState *es, const MemoryContextCounters *mem_counters); @@ -611,7 +611,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, } if (bufusage) - show_buffer_usage(es, bufusage); + show_buffer_usage(es, bufusage, NULL); if (mem_counters) show_memory_counters(es, mem_counters); @@ -1028,7 +1028,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) if (es->buffers && peek_buffer_usage(es, &metrics->instr.bufusage)) { es->indent++; - show_buffer_usage(es, &metrics->instr.bufusage); + show_buffer_usage(es, &metrics->instr.bufusage, NULL); es->indent--; } } @@ -1042,7 +1042,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) BYTES_TO_KILOBYTES(metrics->bytesSent), es); ExplainPropertyText("Format", format, es); if (es->buffers) - show_buffer_usage(es, &metrics->instr.bufusage); + show_buffer_usage(es, &metrics->instr.bufusage, NULL); } ExplainCloseGroup("Serialization", "Serialization", true, es); @@ -1970,6 +1970,9 @@ ExplainNode(PlanState *planstate, List *ancestors, show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); show_indexsearches_info(planstate, es); + + if (es->buffers && planstate->instrument) + show_buffer_usage(es, &((IndexScanState *) planstate)->iss_Instrument->table_instr.bufusage, "Table"); break; case T_IndexOnlyScan: show_scan_qual(((IndexOnlyScan *) plan)->indexqual, @@ -1987,6 +1990,9 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainPropertyFloat("Heap Fetches", NULL, planstate->instrument->ntuples2, 0, es); show_indexsearches_info(planstate, es); + + if (es->buffers && planstate->instrument) + show_buffer_usage(es, &((IndexOnlyScanState *) planstate)->ioss_Instrument->table_instr.bufusage, "Table"); break; case T_BitmapIndexScan: show_scan_qual(((BitmapIndexScan 
*) plan)->indexqualorig, @@ -2288,7 +2294,7 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->instr.bufusage); + show_buffer_usage(es, &planstate->instrument->instr.bufusage, NULL); if (es->wal && planstate->instrument) show_wal_usage(es, &planstate->instrument->instr.walusage); @@ -2307,7 +2313,7 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->instr.bufusage); + show_buffer_usage(es, &instrument->instr.bufusage, NULL); if (es->wal) show_wal_usage(es, &instrument->instr.walusage); ExplainCloseWorker(n, es); @@ -4126,7 +4132,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage) * Show buffer usage details. This better be sync with peek_buffer_usage. */ static void -show_buffer_usage(ExplainState *es, const BufferUsage *usage) +show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title) { if (es->format == EXPLAIN_FORMAT_TEXT) { @@ -4151,6 +4157,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared || has_local || has_temp) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "Buffers:"); if (has_shared) @@ -4206,6 +4214,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared_timing || has_local_timing || has_temp_timing) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "I/O Timings:"); if (has_shared_timing) @@ -4247,6 +4257,14 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) } else { + char *buffers_title = NULL; + + if (title) + { + buffers_title = psprintf("%s Buffers", title); + ExplainOpenGroup(buffers_title, buffers_title, true, es); + } + ExplainPropertyInteger("Shared Hit Blocks", NULL, usage->shared_blks_hit, es); 
ExplainPropertyInteger("Shared Read Blocks", NULL, @@ -4267,8 +4285,20 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) usage->temp_blks_read, es); ExplainPropertyInteger("Temp Written Blocks", NULL, usage->temp_blks_written, es); + + if (buffers_title) + ExplainCloseGroup(buffers_title, buffers_title, true, es); + if (track_io_timing) { + char *timings_title = NULL; + + if (title) + { + timings_title = psprintf("%s I/O Timings", title); + ExplainOpenGroup(timings_title, timings_title, true, es); + } + ExplainPropertyFloat("Shared I/O Read Time", "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time), 3, es); @@ -4287,6 +4317,9 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) ExplainPropertyFloat("Temp I/O Write Time", "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time), 3, es); + + if (timings_title) + ExplainCloseGroup(timings_title, timings_title, true, es); } } } diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 6e8cbaeccf7aa..a59de0ef22b29 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -846,6 +846,20 @@ ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context) planstate_tree_walker(node, ExecFinalizeNodeInstrumentation_walker, &node->instrument->instr); + /* IndexScan/IndexOnlyScan have a separate entry to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + InstrFinalizeChild(&iss->iss_Instrument->table_instr, &node->instrument->instr); + } + else if (IsA(node, IndexOnlyScanState)) + { + IndexOnlyScanState *ioss = castNode(IndexOnlyScanState, node); + + InstrFinalizeChild(&ioss->ioss_Instrument->table_instr, &node->instrument->instr); + } + InstrFinalizeChild(&node->instrument->instr, parent); return false; @@ -891,6 +905,38 @@ ExecFinalizeWorkerInstrumentation_walker(PlanState *node, void *context) num_workers = node->worker_instrument->num_workers; + 
/* + * Fold per-worker IndexScan/IndexOnlyScan table buffer stats into the + * per-worker node stats, matching what ExecFinalizeNodeInstrumentation + * does for the leader. + */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + if (iss->iss_SharedInfo) + { + int nworkers = Min(num_workers, iss->iss_SharedInfo->num_workers); + + for (int n = 0; n < nworkers; n++) + InstrAccumStack(&node->worker_instrument->instrument[n].instr, + &iss->iss_SharedInfo->winstrument[n].table_instr); + } + } + else if (IsA(node, IndexOnlyScanState)) + { + IndexOnlyScanState *ioss = castNode(IndexOnlyScanState, node); + + if (ioss->ioss_SharedInfo) + { + int nworkers = Min(num_workers, ioss->ioss_SharedInfo->num_workers); + + for (int n = 0; n < nworkers; n++) + InstrAccumStack(&node->worker_instrument->instrument[n].instr, + &ioss->ioss_SharedInfo->winstrument[n].table_instr); + } + } + /* Accumulate this node's per-worker stats to parent's per-worker stats */ if (parent && parent->worker_instrument) { diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c index 70c55ee6d614d..63e24a0bcd4cf 100644 --- a/src/backend/executor/nodeBitmapIndexscan.c +++ b/src/backend/executor/nodeBitmapIndexscan.c @@ -276,7 +276,7 @@ ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags) /* Set up instrumentation of bitmap index scans if requested */ if (estate->es_instrument) - indexstate->biss_Instrument = palloc0_object(IndexScanInstrumentation); + indexstate->biss_Instrument = MemoryContextAllocZero(estate->es_instrument->instr_cxt, sizeof(IndexScanInstrumentation)); /* Open the index relation. 
*/ lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode; diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index de6154fd54139..9e64ce2bd2da7 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -67,6 +67,7 @@ IndexOnlyNext(IndexOnlyScanState *node) IndexScanDesc scandesc; TupleTableSlot *slot; ItemPointer tid; + Instrumentation *table_instr = NULL; /* * extract necessary information from index scan node @@ -83,6 +84,9 @@ IndexOnlyNext(IndexOnlyScanState *node) econtext = node->ss.ps.ps_ExprContext; slot = node->ss.ss_ScanTupleSlot; + if (node->ioss_Instrument && node->ioss_Instrument->table_instr.need_stack) + table_instr = &node->ioss_Instrument->table_instr; + if (scandesc == NULL) { /* @@ -165,11 +169,22 @@ IndexOnlyNext(IndexOnlyScanState *node) ItemPointerGetBlockNumber(tid), &node->ioss_VMBuffer)) { + bool found; + /* * Rats, we have to visit the heap to check visibility. */ InstrCountTuples2(node, 1); - if (!index_fetch_heap(scandesc, node->ioss_TableSlot)) + + if (table_instr) + InstrPushStack(table_instr); + + found = index_fetch_heap(scandesc, node->ioss_TableSlot); + + if (table_instr) + InstrPopStack(table_instr); + + if (!found) continue; /* no visible tuple, try next index entry */ ExecClearTuple(node->ioss_TableSlot); @@ -436,6 +451,7 @@ ExecEndIndexOnlyScan(IndexOnlyScanState *node) * which will have a new IndexOnlyScanState and zeroed stats. 
*/ winstrument->nsearches += node->ioss_Instrument->nsearches; + InstrAccumStack(&winstrument->table_instr, &node->ioss_Instrument->table_instr); } /* @@ -610,7 +626,21 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) /* Set up instrumentation of index-only scans if requested */ if (estate->es_instrument) - indexstate->ioss_Instrument = palloc0_object(IndexScanInstrumentation); + { + indexstate->ioss_Instrument = MemoryContextAllocZero(estate->es_instrument->instr_cxt, sizeof(IndexScanInstrumentation)); + + /* + * Track table and index access separately. We intentionally don't + * collect timing (even if enabled), since we don't need it, and + * IndexOnlyNext calls InstrPushStack / InstrPopStack (instead of the + * full InstrNode*) to reduce overhead. + */ + if ((estate->es_instrument->instrument_options & INSTRUMENT_BUFFERS) != 0) + { + InstrInitOptions(&indexstate->ioss_Instrument->table_instr, INSTRUMENT_BUFFERS); + InstrQueryRememberChild(estate->es_instrument, &indexstate->ioss_Instrument->table_instr); + } + } /* Open the index relation. 
*/ lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode; @@ -899,4 +929,11 @@ ExecIndexOnlyScanRetrieveInstrumentation(IndexOnlyScanState *node) SharedInfo->num_workers * sizeof(IndexScanInstrumentation); node->ioss_SharedInfo = palloc(size); memcpy(node->ioss_SharedInfo, SharedInfo, size); + + /* Aggregate workers' table buffer/WAL usage into leader's entry */ + for (int i = 0; i < node->ioss_SharedInfo->num_workers; i++) + { + InstrAccumStack(&node->ioss_Instrument->table_instr, + &node->ioss_SharedInfo->winstrument[i].table_instr); + } } diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 1620d14607173..02ef9d124a368 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -85,7 +85,10 @@ IndexNext(IndexScanState *node) ExprContext *econtext; ScanDirection direction; IndexScanDesc scandesc; + ItemPointer tid; TupleTableSlot *slot; + bool found; + Instrumentation *table_instr = NULL; /* * extract necessary information from index scan node @@ -102,6 +105,9 @@ IndexNext(IndexScanState *node) econtext = node->ss.ps.ps_ExprContext; slot = node->ss.ss_ScanTupleSlot; + if (node->iss_Instrument && node->iss_Instrument->table_instr.need_stack) + table_instr = &node->iss_Instrument->table_instr; + if (scandesc == NULL) { /* @@ -132,8 +138,24 @@ IndexNext(IndexScanState *node) /* * ok, now that we have what we need, fetch the next tuple. 
*/ - while (index_getnext_slot(scandesc, direction, slot)) + while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { + if (table_instr) + InstrPushStack(table_instr); + + for (;;) + { + found = index_fetch_heap(scandesc, slot); + if (found || !scandesc->xs_heap_continue) + break; + } + + if (table_instr) + InstrPopStack(table_instr); + + if (unlikely(!found)) + continue; + CHECK_FOR_INTERRUPTS(); /* @@ -181,6 +203,7 @@ IndexNextWithReorder(IndexScanState *node) Datum *lastfetched_vals; bool *lastfetched_nulls; int cmp; + Instrumentation *table_instr = NULL; estate = node->ss.ps.state; @@ -200,6 +223,9 @@ IndexNextWithReorder(IndexScanState *node) econtext = node->ss.ps.ps_ExprContext; slot = node->ss.ss_ScanTupleSlot; + if (node->iss_Instrument && node->iss_Instrument->table_instr.need_stack) + table_instr = &node->iss_Instrument->table_instr; + if (scandesc == NULL) { /* @@ -263,36 +289,67 @@ IndexNextWithReorder(IndexScanState *node) } /* - * Fetch next tuple from the index. + * Fetch next valid tuple from the index. */ -next_indextuple: - if (!index_getnext_slot(scandesc, ForwardScanDirection, slot)) + for (;;) { + ItemPointer tid; + bool found; + + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid(scandesc, ForwardScanDirection); + + /* If we're out of index entries, we're done */ + if (tid == NULL) + { + /* + * No more tuples from the index. But we still need to drain + * any remaining tuples from the queue before we're done. + */ + node->iss_ReachedEnd = true; + break; + } + + Assert(ItemPointerEquals(tid, &scandesc->xs_heaptid)); + + if (table_instr) + InstrPushStack(table_instr); + + for (;;) + { + found = index_fetch_heap(scandesc, slot); + if (found || !scandesc->xs_heap_continue) + break; + } + + if (table_instr) + InstrPopStack(table_instr); + /* - * No more tuples from the index. But we still need to drain any - * remaining tuples from the queue before we're done. 
+ * If the index was lossy, we have to recheck the index quals and + * ORDER BY expressions using the fetched tuple. */ - node->iss_ReachedEnd = true; - continue; - } - - /* - * If the index was lossy, we have to recheck the index quals and - * ORDER BY expressions using the fetched tuple. - */ - if (scandesc->xs_recheck) - { - econtext->ecxt_scantuple = slot; - if (!ExecQualAndReset(node->indexqualorig, econtext)) + if (found && scandesc->xs_recheck) { - /* Fails recheck, so drop it and loop back for another */ - InstrCountFiltered2(node, 1); - /* allow this loop to be cancellable */ - CHECK_FOR_INTERRUPTS(); - goto next_indextuple; + econtext->ecxt_scantuple = slot; + if (!ExecQualAndReset(node->indexqualorig, econtext)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + /* allow this loop to be cancellable */ + CHECK_FOR_INTERRUPTS(); + continue; + } } + + if (found) + break; } + /* No more index entries, re-run to clear the reorder queue */ + if (node->iss_ReachedEnd) + continue; + if (scandesc->xs_recheckorderby) { econtext->ecxt_scantuple = slot; @@ -818,6 +875,7 @@ ExecEndIndexScan(IndexScanState *node) * which will have a new IndexOnlyScanState and zeroed stats. */ winstrument->nsearches += node->iss_Instrument->nsearches; + InstrAccumStack(&winstrument->table_instr, &node->iss_Instrument->table_instr); } /* @@ -980,7 +1038,21 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) /* Set up instrumentation of index scans if requested */ if (estate->es_instrument) - indexstate->iss_Instrument = palloc0_object(IndexScanInstrumentation); + { + indexstate->iss_Instrument = MemoryContextAllocZero(estate->es_instrument->instr_cxt, sizeof(IndexScanInstrumentation)); + + /* + * Track table and index access separately. 
We intentionally don't + * collect timing (even if enabled), since we don't need it, and + * IndexNext / IndexNextWithReorder call InstrPushStack / + * InstrPopStack (instead of the full InstrNode*) to reduce overhead. + */ + if ((estate->es_instrument->instrument_options & INSTRUMENT_BUFFERS) != 0) + { + InstrInitOptions(&indexstate->iss_Instrument->table_instr, INSTRUMENT_BUFFERS); + InstrQueryRememberChild(estate->es_instrument, &indexstate->iss_Instrument->table_instr); + } + } /* Open the index relation. */ lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode; @@ -1834,4 +1906,11 @@ ExecIndexScanRetrieveInstrumentation(IndexScanState *node) SharedInfo->num_workers * sizeof(IndexScanInstrumentation); node->iss_SharedInfo = palloc(size); memcpy(node->iss_SharedInfo, SharedInfo, size); + + /* Aggregate workers' table buffer/WAL usage into leader's entry */ + for (int i = 0; i < node->iss_SharedInfo->num_workers; i++) + { + InstrAccumStack(&node->iss_Instrument->table_instr, + &node->iss_SharedInfo->winstrument[i].table_instr); + } } diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 2a0ff377a7312..e2315cef384c6 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -18,6 +18,8 @@ #ifndef INSTRUMENT_NODE_H #define INSTRUMENT_NODE_H +#include "executor/instrument.h" + /* --------------------- * Instrumentation information for aggregate function execution @@ -48,6 +50,9 @@ typedef struct IndexScanInstrumentation { /* Index search count (incremented with pgstat_count_index_scan call) */ uint64 nsearches; + + /* Instrumentation utilized for tracking buffer usage during table access */ + Instrumentation table_instr; } IndexScanInstrumentation; /* From ff71ea65359af4cc1c6b8df4d9017da5fe87e4b7 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sat, 7 Mar 2026 11:46:41 -0800 Subject: [PATCH 23/23] Add test_session_buffer_usage test module This is intended for testing 
instrumentation related logic as it pertains to the top level stack that is maintained as a running total. There is currently no in-core user that utilizes the top-level values in this manner, and especially during abort situations this helps ensure we don't lose activity due to incorrect handling of unfinalized node stacks. --- src/test/modules/Makefile | 1 + src/test/modules/meson.build | 1 + .../test_session_buffer_usage/Makefile | 23 ++ .../expected/test_session_buffer_usage.out | 342 ++++++++++++++++++ .../test_session_buffer_usage/meson.build | 33 ++ .../sql/test_session_buffer_usage.sql | 245 +++++++++++++ .../test_session_buffer_usage--1.0.sql | 31 ++ .../test_session_buffer_usage.c | 95 +++++ .../test_session_buffer_usage.control | 5 + 9 files changed, 776 insertions(+) create mode 100644 src/test/modules/test_session_buffer_usage/Makefile create mode 100644 src/test/modules/test_session_buffer_usage/expected/test_session_buffer_usage.out create mode 100644 src/test/modules/test_session_buffer_usage/meson.build create mode 100644 src/test/modules/test_session_buffer_usage/sql/test_session_buffer_usage.sql create mode 100644 src/test/modules/test_session_buffer_usage/test_session_buffer_usage--1.0.sql create mode 100644 src/test/modules/test_session_buffer_usage/test_session_buffer_usage.c create mode 100644 src/test/modules/test_session_buffer_usage/test_session_buffer_usage.control diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 864b407abcff7..c5ace162fe23c 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -48,6 +48,7 @@ SUBDIRS = \ test_resowner \ test_rls_hooks \ test_saslprep \ + test_session_buffer_usage \ test_shm_mq \ test_slru \ test_tidstore \ diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index e5acacd508368..802cc93d71a48 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -49,6 +49,7 @@ subdir('test_regex') subdir('test_resowner') 
subdir('test_rls_hooks') subdir('test_saslprep') +subdir('test_session_buffer_usage') subdir('test_shm_mq') subdir('test_slru') subdir('test_tidstore') diff --git a/src/test/modules/test_session_buffer_usage/Makefile b/src/test/modules/test_session_buffer_usage/Makefile new file mode 100644 index 0000000000000..1252b222cb9f8 --- /dev/null +++ b/src/test/modules/test_session_buffer_usage/Makefile @@ -0,0 +1,23 @@ +# src/test/modules/test_session_buffer_usage/Makefile + +MODULE_big = test_session_buffer_usage +OBJS = \ + $(WIN32RES) \ + test_session_buffer_usage.o + +EXTENSION = test_session_buffer_usage +DATA = test_session_buffer_usage--1.0.sql +PGFILEDESC = "test_session_buffer_usage - show buffer usage statistics for the current session" + +REGRESS = test_session_buffer_usage + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_session_buffer_usage +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/test_session_buffer_usage/expected/test_session_buffer_usage.out b/src/test/modules/test_session_buffer_usage/expected/test_session_buffer_usage.out new file mode 100644 index 0000000000000..5f7d349871af8 --- /dev/null +++ b/src/test/modules/test_session_buffer_usage/expected/test_session_buffer_usage.out @@ -0,0 +1,342 @@ +LOAD 'test_session_buffer_usage'; +CREATE EXTENSION test_session_buffer_usage; +-- Verify all columns are non-negative +SELECT count(*) = 1 AS ok FROM test_session_buffer_usage() +WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0 + AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0 + AND local_blks_hit >= 0 AND local_blks_read >= 0 + AND local_blks_dirtied >= 0 AND local_blks_written >= 0 + AND temp_blks_read >= 0 AND temp_blks_written >= 0 + AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0 + AND local_blk_read_time >= 0 AND 
local_blk_write_time >= 0 + AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0; + ok +---- + t +(1 row) + +-- Verify counters increase after buffer activity +SELECT test_session_buffer_usage_reset(); + test_session_buffer_usage_reset +--------------------------------- + +(1 row) + +CREATE TEMP TABLE test_buf_activity (id int, data text); +INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i; +SELECT count(*) FROM test_buf_activity; + count +------- + 1000 +(1 row) + +SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased +FROM test_session_buffer_usage(); + blocks_increased +------------------ + t +(1 row) + +DROP TABLE test_buf_activity; +-- Parallel query test +CREATE TABLE par_dc_tab (a int, b char(200)); +INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; +SELECT count(*) FROM par_dc_tab; + count +------- + 5000 +(1 row) + +-- Measure serial scan delta (leader does all the work) +SET max_parallel_workers_per_gather = 0; +SELECT test_session_buffer_usage_reset(); + test_session_buffer_usage_reset +--------------------------------- + +(1 row) + +SELECT count(*) FROM par_dc_tab; + count +------- + 5000 +(1 row) + +CREATE TEMP TABLE dc_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM test_session_buffer_usage(); +-- Measure parallel scan delta with leader NOT participating in scanning. +-- Workers do all table scanning; leader only runs the Gather node. 
+SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; +SELECT test_session_buffer_usage_reset(); + test_session_buffer_usage_reset +--------------------------------- + +(1 row) + +SELECT count(*) FROM par_dc_tab; + count +------- + 5000 +(1 row) + +-- Confirm we got a similar hit counter through parallel worker accumulation +SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2 + AS leader_buffers_match +FROM test_session_buffer_usage(), dc_serial_result s; + leader_buffers_match +---------------------- + t +(1 row) + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; +DROP TABLE par_dc_tab, dc_serial_result; +-- +-- Abort/exception tests: verify buffer usage survives various error paths. +-- +-- Rolled-back divide-by-zero under EXPLAIN ANALYZE +CREATE TEMP TABLE exc_tab (a int, b char(20)); +SELECT test_session_buffer_usage_reset(); + test_session_buffer_usage_reset +--------------------------------- + +(1 row) + +EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 FROM ins; +ERROR: division by zero +SELECT local_blks_dirtied > 0 AS exception_buffers_visible +FROM test_session_buffer_usage(); + exception_buffers_visible +--------------------------- + t +(1 row) + +DROP TABLE exc_tab; +-- Unique constraint violation in regular query +CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20)); +INSERT INTO unique_tab VALUES (1, 'first'); +SELECT test_session_buffer_usage_reset(); + test_session_buffer_usage_reset +--------------------------------- + +(1 row) + +INSERT INTO unique_tab VALUES (1, 'duplicate'); +ERROR: duplicate key value violates unique constraint "unique_tab_a_key" +DETAIL: Key (a)=(1) already exists. 
+SELECT local_blks_hit > 0 AS unique_violation_buffers_visible +FROM test_session_buffer_usage(); + unique_violation_buffers_visible +---------------------------------- + t +(1 row) + +DROP TABLE unique_tab; +-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION) +CREATE TEMP TABLE subxact_tab (a int, b char(20)); +CREATE FUNCTION subxact_exc_func() RETURNS text AS $$ +BEGIN + BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a) + SELECT a / 0 FROM ins'; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; +SELECT test_session_buffer_usage_reset(); + test_session_buffer_usage_reset +--------------------------------- + +(1 row) + +SELECT subxact_exc_func(); + subxact_exc_func +------------------ + caught +(1 row) + +SELECT local_blks_dirtied > 0 AS subxact_buffers_visible +FROM test_session_buffer_usage(); + subxact_buffers_visible +------------------------- + t +(1 row) + +DROP FUNCTION subxact_exc_func; +DROP TABLE subxact_tab; +-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking +CREATE TEMP TABLE cursor_tab (a int, b char(200)); +INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i; +CREATE FUNCTION cursor_exc_func() RETURNS text AS $$ +DECLARE + rec record; + cnt int := 0; +BEGIN + BEGIN + FOR rec IN SELECT * FROM cursor_tab LOOP + cnt := cnt + 1; + IF cnt = 250 THEN + PERFORM 1 / 0; + END IF; + END LOOP; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught after ' || cnt || ' rows'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; +SELECT test_session_buffer_usage_reset(); + test_session_buffer_usage_reset +--------------------------------- + +(1 row) + +SELECT cursor_exc_func(); + cursor_exc_func +----------------------- + caught after 250 rows +(1 row) + +SELECT local_blks_hit + local_blks_read > 0 + AS cursor_subxact_buffers_visible +FROM 
test_session_buffer_usage(); + cursor_subxact_buffers_visible +-------------------------------- + t +(1 row) + +DROP FUNCTION cursor_exc_func; +DROP TABLE cursor_tab; +-- Trigger abort under EXPLAIN ANALYZE: verify that buffer activity from a +-- trigger that throws an error is still properly propagated. +CREATE TEMP TABLE trig_err_tab (a int); +CREATE TEMP TABLE trig_work_tab (a int, b char(200)); +INSERT INTO trig_work_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i; +-- Warm local buffers so trig_work_tab reads become hits +SELECT count(*) FROM trig_work_tab; + count +------- + 500 +(1 row) + +CREATE FUNCTION trig_err_func() RETURNS trigger AS $$ +BEGIN + PERFORM count(*) FROM trig_work_tab; + RAISE EXCEPTION 'trigger error'; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; +CREATE TRIGGER trig_err BEFORE INSERT ON trig_err_tab + FOR EACH ROW EXECUTE FUNCTION trig_err_func(); +-- Measure how many local buffer hits a scan of trig_work_tab produces +SELECT test_session_buffer_usage_reset(); + test_session_buffer_usage_reset +--------------------------------- + +(1 row) + +SELECT count(*) FROM trig_work_tab; + count +------- + 500 +(1 row) + +CREATE TEMP TABLE trig_serial_result AS +SELECT local_blks_hit AS serial_hits FROM test_session_buffer_usage(); +-- Now trigger the same scan via a trigger that errors +SELECT test_session_buffer_usage_reset(); + test_session_buffer_usage_reset +--------------------------------- + +(1 row) + +EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + INSERT INTO trig_err_tab VALUES (1); +ERROR: trigger error +CONTEXT: PL/pgSQL function trig_err_func() line 4 at RAISE +-- The trigger scanned trig_work_tab but errored before InstrStopTrigger ran. +-- InstrStopFinalize in the PG_CATCH ensures buffer data is still propagated. 
+SELECT local_blks_hit >= s.serial_hits / 2 + AS trigger_abort_buffers_propagated +FROM test_session_buffer_usage(), trig_serial_result s; + trigger_abort_buffers_propagated +---------------------------------- + t +(1 row) + +DROP TABLE trig_err_tab, trig_work_tab, trig_serial_result; +DROP FUNCTION trig_err_func; +-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort. +-- +-- When a parallel worker aborts, InstrEndParallelQuery and +-- ExecParallelReportInstrumentation never run, so the worker's buffer +-- activity is never written to shared memory, despite the information having been +-- captured by the res owner release instrumentation handling. +CREATE TABLE par_abort_tab (a int, b char(200)); +INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; +-- Warm shared buffers so all reads become hits +SELECT count(*) FROM par_abort_tab; + count +------- + 5000 +(1 row) + +-- Measure serial scan delta as a reference (leader reads all blocks) +SET max_parallel_workers_per_gather = 0; +SELECT test_session_buffer_usage_reset(); + test_session_buffer_usage_reset +--------------------------------- + +(1 row) + +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; +ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +CREATE TABLE par_abort_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM test_session_buffer_usage(); +-- Now force parallel with leader NOT participating in scanning +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; +SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently +SELECT test_session_buffer_usage_reset(); + 
test_session_buffer_usage_reset +--------------------------------- + +(1 row) + +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; +ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +CONTEXT: parallel worker +RESET debug_parallel_query; +-- Workers scanned the table but aborted before reporting stats back. +-- The leader's delta should be much less than a serial scan, documenting +-- that worker buffer activity is lost on abort. +SELECT shared_blks_hit < s.serial_delta / 2 + AS worker_abort_buffers_not_propagated +FROM test_session_buffer_usage(), par_abort_serial_result s; + worker_abort_buffers_not_propagated +------------------------------------- + t +(1 row) + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; +DROP TABLE par_abort_tab, par_abort_serial_result; +-- Cleanup +DROP EXTENSION test_session_buffer_usage; diff --git a/src/test/modules/test_session_buffer_usage/meson.build b/src/test/modules/test_session_buffer_usage/meson.build new file mode 100644 index 0000000000000..b96f67dc7fe37 --- /dev/null +++ b/src/test/modules/test_session_buffer_usage/meson.build @@ -0,0 +1,33 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +test_session_buffer_usage_sources = files( + 'test_session_buffer_usage.c', +) + +if host_system == 'windows' + test_session_buffer_usage_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'test_session_buffer_usage', + '--FILEDESC', 'test_session_buffer_usage - show buffer usage statistics for the current session',]) +endif + +test_session_buffer_usage = shared_module('test_session_buffer_usage', + test_session_buffer_usage_sources, + kwargs: pg_test_mod_args, +) +test_install_libs += 
test_session_buffer_usage + +test_install_data += files( + 'test_session_buffer_usage.control', + 'test_session_buffer_usage--1.0.sql', +) + +tests += { + 'name': 'test_session_buffer_usage', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'test_session_buffer_usage', + ], + }, +} diff --git a/src/test/modules/test_session_buffer_usage/sql/test_session_buffer_usage.sql b/src/test/modules/test_session_buffer_usage/sql/test_session_buffer_usage.sql new file mode 100644 index 0000000000000..daf2159c4a653 --- /dev/null +++ b/src/test/modules/test_session_buffer_usage/sql/test_session_buffer_usage.sql @@ -0,0 +1,245 @@ +LOAD 'test_session_buffer_usage'; +CREATE EXTENSION test_session_buffer_usage; + +-- Verify all columns are non-negative +SELECT count(*) = 1 AS ok FROM test_session_buffer_usage() +WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0 + AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0 + AND local_blks_hit >= 0 AND local_blks_read >= 0 + AND local_blks_dirtied >= 0 AND local_blks_written >= 0 + AND temp_blks_read >= 0 AND temp_blks_written >= 0 + AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0 + AND local_blk_read_time >= 0 AND local_blk_write_time >= 0 + AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0; + +-- Verify counters increase after buffer activity +SELECT test_session_buffer_usage_reset(); + +CREATE TEMP TABLE test_buf_activity (id int, data text); +INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i; +SELECT count(*) FROM test_buf_activity; + +SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased +FROM test_session_buffer_usage(); + +DROP TABLE test_buf_activity; + +-- Parallel query test +CREATE TABLE par_dc_tab (a int, b char(200)); +INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; + +SELECT count(*) FROM par_dc_tab; + +-- Measure serial scan delta (leader does all the work) 
+SET max_parallel_workers_per_gather = 0; + +SELECT test_session_buffer_usage_reset(); +SELECT count(*) FROM par_dc_tab; + +CREATE TEMP TABLE dc_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM test_session_buffer_usage(); + +-- Measure parallel scan delta with leader NOT participating in scanning. +-- Workers do all table scanning; leader only runs the Gather node. +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; + +SELECT test_session_buffer_usage_reset(); +SELECT count(*) FROM par_dc_tab; + +-- Confirm we got a similar hit counter through parallel worker accumulation +SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2 + AS leader_buffers_match +FROM test_session_buffer_usage(), dc_serial_result s; + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; + +DROP TABLE par_dc_tab, dc_serial_result; + +-- +-- Abort/exception tests: verify buffer usage survives various error paths. 
+-- + +-- Rolled-back divide-by-zero under EXPLAIN ANALYZE +CREATE TEMP TABLE exc_tab (a int, b char(20)); + +SELECT test_session_buffer_usage_reset(); + +EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 FROM ins; + +SELECT local_blks_dirtied > 0 AS exception_buffers_visible +FROM test_session_buffer_usage(); + +DROP TABLE exc_tab; + +-- Unique constraint violation in regular query +CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20)); +INSERT INTO unique_tab VALUES (1, 'first'); + +SELECT test_session_buffer_usage_reset(); +INSERT INTO unique_tab VALUES (1, 'duplicate'); + +SELECT local_blks_hit > 0 AS unique_violation_buffers_visible +FROM test_session_buffer_usage(); + +DROP TABLE unique_tab; + +-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION) +CREATE TEMP TABLE subxact_tab (a int, b char(20)); + +CREATE FUNCTION subxact_exc_func() RETURNS text AS $$ +BEGIN + BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a) + SELECT a / 0 FROM ins'; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; + +SELECT test_session_buffer_usage_reset(); +SELECT subxact_exc_func(); + +SELECT local_blks_dirtied > 0 AS subxact_buffers_visible +FROM test_session_buffer_usage(); + +DROP FUNCTION subxact_exc_func; +DROP TABLE subxact_tab; + +-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking +CREATE TEMP TABLE cursor_tab (a int, b char(200)); +INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i; + +CREATE FUNCTION cursor_exc_func() RETURNS text AS $$ +DECLARE + rec record; + cnt int := 0; +BEGIN + BEGIN + FOR rec IN SELECT * FROM cursor_tab LOOP + cnt := cnt + 1; + IF cnt = 250 THEN + PERFORM 1 / 0; + END IF; + END LOOP; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught after ' || cnt || ' rows'; + 
END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; + +SELECT test_session_buffer_usage_reset(); +SELECT cursor_exc_func(); + +SELECT local_blks_hit + local_blks_read > 0 + AS cursor_subxact_buffers_visible +FROM test_session_buffer_usage(); + +DROP FUNCTION cursor_exc_func; +DROP TABLE cursor_tab; + +-- Trigger abort under EXPLAIN ANALYZE: verify that buffer activity from a +-- trigger that throws an error is still properly propagated. +CREATE TEMP TABLE trig_err_tab (a int); +CREATE TEMP TABLE trig_work_tab (a int, b char(200)); +INSERT INTO trig_work_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i; + +-- Warm local buffers so trig_work_tab reads become hits +SELECT count(*) FROM trig_work_tab; + +CREATE FUNCTION trig_err_func() RETURNS trigger AS $$ +BEGIN + PERFORM count(*) FROM trig_work_tab; + RAISE EXCEPTION 'trigger error'; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +CREATE TRIGGER trig_err BEFORE INSERT ON trig_err_tab + FOR EACH ROW EXECUTE FUNCTION trig_err_func(); + +-- Measure how many local buffer hits a scan of trig_work_tab produces +SELECT test_session_buffer_usage_reset(); +SELECT count(*) FROM trig_work_tab; + +CREATE TEMP TABLE trig_serial_result AS +SELECT local_blks_hit AS serial_hits FROM test_session_buffer_usage(); + +-- Now trigger the same scan via a trigger that errors +SELECT test_session_buffer_usage_reset(); +EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + INSERT INTO trig_err_tab VALUES (1); + +-- The trigger scanned trig_work_tab but errored before InstrStopTrigger ran. +-- InstrStopFinalize in the PG_CATCH ensures buffer data is still propagated. +SELECT local_blks_hit >= s.serial_hits / 2 + AS trigger_abort_buffers_propagated +FROM test_session_buffer_usage(), trig_serial_result s; + +DROP TABLE trig_err_tab, trig_work_tab, trig_serial_result; +DROP FUNCTION trig_err_func; + +-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort. 
+-- +-- When a parallel worker aborts, InstrEndParallelQuery and +-- ExecParallelReportInstrumentation never run, so the worker's buffer +-- activity is never written to shared memory, despite the information having been +-- captured by the res owner release instrumentation handling. +CREATE TABLE par_abort_tab (a int, b char(200)); +INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; + +-- Warm shared buffers so all reads become hits +SELECT count(*) FROM par_abort_tab; + +-- Measure serial scan delta as a reference (leader reads all blocks) +SET max_parallel_workers_per_gather = 0; + +SELECT test_session_buffer_usage_reset(); +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; + +CREATE TABLE par_abort_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM test_session_buffer_usage(); + +-- Now force parallel with leader NOT participating in scanning +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; +SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently + +SELECT test_session_buffer_usage_reset(); +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; + +RESET debug_parallel_query; + +-- Workers scanned the table but aborted before reporting stats back. +-- The leader's delta should be much less than a serial scan, documenting +-- that worker buffer activity is lost on abort. 
+SELECT shared_blks_hit < s.serial_delta / 2 + AS worker_abort_buffers_not_propagated +FROM test_session_buffer_usage(), par_abort_serial_result s; + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; + +DROP TABLE par_abort_tab, par_abort_serial_result; + +-- Cleanup +DROP EXTENSION test_session_buffer_usage; diff --git a/src/test/modules/test_session_buffer_usage/test_session_buffer_usage--1.0.sql b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage--1.0.sql new file mode 100644 index 0000000000000..e9833be470ae5 --- /dev/null +++ b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage--1.0.sql @@ -0,0 +1,31 @@ +/* src/test/modules/test_session_buffer_usage/test_session_buffer_usage--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_session_buffer_usage" to load this file. 
\quit + +CREATE FUNCTION test_session_buffer_usage( + OUT shared_blks_hit bigint, + OUT shared_blks_read bigint, + OUT shared_blks_dirtied bigint, + OUT shared_blks_written bigint, + OUT local_blks_hit bigint, + OUT local_blks_read bigint, + OUT local_blks_dirtied bigint, + OUT local_blks_written bigint, + OUT temp_blks_read bigint, + OUT temp_blks_written bigint, + OUT shared_blk_read_time double precision, + OUT shared_blk_write_time double precision, + OUT local_blk_read_time double precision, + OUT local_blk_write_time double precision, + OUT temp_blk_read_time double precision, + OUT temp_blk_write_time double precision +) +RETURNS record +AS 'MODULE_PATHNAME', 'test_session_buffer_usage' +LANGUAGE C PARALLEL RESTRICTED; + +CREATE FUNCTION test_session_buffer_usage_reset() +RETURNS void +AS 'MODULE_PATHNAME', 'test_session_buffer_usage_reset' +LANGUAGE C PARALLEL RESTRICTED; diff --git a/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.c b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.c new file mode 100644 index 0000000000000..50eb1a2ffe621 --- /dev/null +++ b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.c @@ -0,0 +1,95 @@ +/*------------------------------------------------------------------------- + * + * test_session_buffer_usage.c + * show buffer usage statistics for the current session + * + * Copyright (c) 2026, PostgreSQL Global Development Group + * + * src/test/modules/test_session_buffer_usage/test_session_buffer_usage.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "executor/instrument.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "utils/memutils.h" + +PG_MODULE_MAGIC_EXT( + .name = "test_session_buffer_usage", + .version = PG_VERSION +); + +#define NUM_BUFFER_USAGE_COLUMNS 16 + +PG_FUNCTION_INFO_V1(test_session_buffer_usage); 
+PG_FUNCTION_INFO_V1(test_session_buffer_usage_reset); + +#define HAVE_INSTR_STACK 1 /* Change to 0 when testing before stack + * change */ + +/* + * SQL function: test_session_buffer_usage() + * + * Returns a single row with all BufferUsage counters accumulated since the + * start of the session. Excludes any usage not yet added to the top of the + * stack (e.g. if this gets called inside a statement that also had buffer + * activity). + */ +Datum +test_session_buffer_usage(PG_FUNCTION_ARGS) +{ + TupleDesc tupdesc; + Datum values[NUM_BUFFER_USAGE_COLUMNS]; + bool nulls[NUM_BUFFER_USAGE_COLUMNS]; + BufferUsage *usage; + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + memset(nulls, 0, sizeof(nulls)); + +#if HAVE_INSTR_STACK + usage = &instr_top.bufusage; +#else + usage = &pgBufferUsage; +#endif + + values[0] = Int64GetDatum(usage->shared_blks_hit); + values[1] = Int64GetDatum(usage->shared_blks_read); + values[2] = Int64GetDatum(usage->shared_blks_dirtied); + values[3] = Int64GetDatum(usage->shared_blks_written); + values[4] = Int64GetDatum(usage->local_blks_hit); + values[5] = Int64GetDatum(usage->local_blks_read); + values[6] = Int64GetDatum(usage->local_blks_dirtied); + values[7] = Int64GetDatum(usage->local_blks_written); + values[8] = Int64GetDatum(usage->temp_blks_read); + values[9] = Int64GetDatum(usage->temp_blks_written); + values[10] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time)); + values[11] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time)); + values[12] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time)); + values[13] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time)); + values[14] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time)); + values[15] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time)); + + 
PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + +/* + * SQL function: test_session_buffer_usage_reset() + * + * Resets all BufferUsage counters on the top instrumentation stack to zero. + * Useful in tests to avoid the baseline/delta pattern. + */ +Datum +test_session_buffer_usage_reset(PG_FUNCTION_ARGS) +{ +#if HAVE_INSTR_STACK + memset(&instr_top.bufusage, 0, sizeof(BufferUsage)); +#else + memset(&pgBufferUsage, 0, sizeof(BufferUsage)); +#endif + + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.control b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.control new file mode 100644 index 0000000000000..41cfb15a7650a --- /dev/null +++ b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.control @@ -0,0 +1,5 @@ +# test_session_buffer_usage extension +comment = 'show buffer usage statistics for the current session' +default_version = '1.0' +module_pathname = '$libdir/test_session_buffer_usage' +relocatable = true