diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c
index e856cd35a6f0f..4be81489ff4fb 100644
--- a/contrib/auto_explain/auto_explain.c
+++ b/contrib/auto_explain/auto_explain.c
@@ -305,19 +305,9 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
if (auto_explain_enabled())
{
- /*
- * Set up to track total elapsed time in ExecutorRun. Make sure the
- * space is allocated in the per-query context so it will go away at
- * ExecutorEnd.
- */
+ /* Set up to track total elapsed time in ExecutorRun. */
if (queryDesc->totaltime == NULL)
- {
- MemoryContext oldcxt;
-
- oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
- queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
- MemoryContextSwitchTo(oldcxt);
- }
+ queryDesc->totaltime = InstrQueryAlloc(INSTRUMENT_ALL);
}
}
@@ -381,14 +371,8 @@ explain_ExecutorEnd(QueryDesc *queryDesc)
*/
oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
- /*
- * Make sure stats accumulation is done. (Note: it's okay if several
- * levels of hook all do this.)
- */
- InstrEndLoop(queryDesc->totaltime);
-
/* Log plan if duration is exceeded. */
- msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total);
+ msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->instr.total);
if (msec >= auto_explain_log_min_duration)
{
ExplainState *es = NewExplainState();
diff --git a/contrib/pg_stat_statements/expected/utility.out b/contrib/pg_stat_statements/expected/utility.out
index e4d6564ea5b5a..cba487f6be582 100644
--- a/contrib/pg_stat_statements/expected/utility.out
+++ b/contrib/pg_stat_statements/expected/utility.out
@@ -289,6 +289,76 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
1 | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t
(3 rows)
+-- Buffer stats should flow through EXPLAIN ANALYZE
+CREATE TEMP TABLE flow_through_test (a int, b char(200));
+INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$
+DECLARE
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test';
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
+SELECT run_explain_buffers_test();
+ run_explain_buffers_test
+--------------------------
+
+(1 row)
+
+-- EXPLAIN entries should have non-zero buffer stats
+SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats
+FROM pg_stat_statements
+WHERE query LIKE 'SELECT run_explain_buffers_test%'
+ORDER BY query COLLATE "C";
+ query | has_buffer_stats
+-----------------------------------+------------------
+ SELECT run_explain_buffers_test() | t
+(1 row)
+
+DROP FUNCTION run_explain_buffers_test;
+DROP TABLE flow_through_test;
+-- Validate buffer/WAL counting during abort
+SET pg_stat_statements.track = 'all';
+CREATE TEMP TABLE pgss_call_tab (a int, b char(20));
+CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20));
+INSERT INTO pgss_call_tab VALUES (0, 'zzz');
+CREATE PROCEDURE pgss_call_rollback_proc() AS $$
+DECLARE
+ v int;
+BEGIN
+ EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
+CALL pgss_call_rollback_proc();
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_call_rollback_proc%'
+ORDER BY query COLLATE "C";
+ query | calls | local_hitread | wal_bytes_generated | wal_records_generated
+--------------------------------+-------+---------------+---------------------+-----------------------
+ CALL pgss_call_rollback_proc() | 1 | t | t | t
+(1 row)
+
+DROP TABLE pgss_call_tab2;
+DROP TABLE pgss_call_tab;
+DROP PROCEDURE pgss_call_rollback_proc;
+SET pg_stat_statements.track = 'top';
-- CALL
CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$
DECLARE
diff --git a/contrib/pg_stat_statements/expected/wal.out b/contrib/pg_stat_statements/expected/wal.out
index 977e382d84894..611213daef6c2 100644
--- a/contrib/pg_stat_statements/expected/wal.out
+++ b/contrib/pg_stat_statements/expected/wal.out
@@ -28,3 +28,51 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t;
t
(1 row)
+--
+-- Validate buffer/WAL counting with caught exception in PL/pgSQL
+--
+CREATE TEMP TABLE pgss_error_tab (a int, b char(20));
+INSERT INTO pgss_error_tab VALUES (0, 'zzz');
+CREATE FUNCTION pgss_error_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
+SELECT pgss_error_func();
+ pgss_error_func
+-----------------
+
+(1 row)
+
+-- Buffer/WAL usage from the wCTE INSERT should survive the exception
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_error_func%'
+ORDER BY query COLLATE "C";
+ query | calls | local_hitread | wal_bytes_generated | wal_records_generated
+--------------------------+-------+---------------+---------------------+-----------------------
+ SELECT pgss_error_func() | 1 | t | t | t
+(1 row)
+
+DROP TABLE pgss_error_tab;
+DROP FUNCTION pgss_error_func;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 5494d41dca161..78f1518c94084 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -911,22 +911,11 @@ pgss_planner(Query *parse,
&& pgss_track_planning && query_string
&& parse->queryId != INT64CONST(0))
{
- instr_time start;
- instr_time duration;
- BufferUsage bufusage_start,
- bufusage;
- WalUsage walusage_start,
- walusage;
+ Instrumentation instr = {0};
- /* We need to track buffer usage as the planner can access them. */
- bufusage_start = pgBufferUsage;
-
- /*
- * Similarly the planner could write some WAL records in some cases
- * (e.g. setting a hint bit with those being WAL-logged)
- */
- walusage_start = pgWalUsage;
- INSTR_TIME_SET_CURRENT(start);
+ /* Track time and buffer/WAL usage as the planner can access them. */
+ InstrInitOptions(&instr, INSTRUMENT_ALL);
+ InstrStart(&instr);
nesting_level++;
PG_TRY();
@@ -940,30 +929,20 @@ pgss_planner(Query *parse,
}
PG_FINALLY();
{
+ InstrStopFinalize(&instr);
nesting_level--;
}
PG_END_TRY();
- INSTR_TIME_SET_CURRENT(duration);
- INSTR_TIME_SUBTRACT(duration, start);
-
- /* calc differences of buffer counters. */
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
-
- /* calc differences of WAL counters. */
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
-
pgss_store(query_string,
parse->queryId,
parse->stmt_location,
parse->stmt_len,
PGSS_PLAN,
- INSTR_TIME_GET_MILLISEC(duration),
+ INSTR_TIME_GET_MILLISEC(instr.total),
0,
- &bufusage,
- &walusage,
+ &instr.bufusage,
+ &instr.walusage,
NULL,
NULL,
0,
@@ -1015,19 +994,9 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
*/
if (pgss_enabled(nesting_level) && queryDesc->plannedstmt->queryId != INT64CONST(0))
{
- /*
- * Set up to track total elapsed time in ExecutorRun. Make sure the
- * space is allocated in the per-query context so it will go away at
- * ExecutorEnd.
- */
+ /* Set up to track total elapsed time in ExecutorRun. */
if (queryDesc->totaltime == NULL)
- {
- MemoryContext oldcxt;
-
- oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
- queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
- MemoryContextSwitchTo(oldcxt);
- }
+ queryDesc->totaltime = InstrQueryAlloc(INSTRUMENT_ALL);
}
}
@@ -1084,21 +1053,15 @@ pgss_ExecutorEnd(QueryDesc *queryDesc)
if (queryId != INT64CONST(0) && queryDesc->totaltime &&
pgss_enabled(nesting_level))
{
- /*
- * Make sure stats accumulation is done. (Note: it's okay if several
- * levels of hook all do this.)
- */
- InstrEndLoop(queryDesc->totaltime);
-
pgss_store(queryDesc->sourceText,
queryId,
queryDesc->plannedstmt->stmt_location,
queryDesc->plannedstmt->stmt_len,
PGSS_EXEC,
- INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total),
+ INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->instr.total),
queryDesc->estate->es_total_processed,
- &queryDesc->totaltime->bufusage,
- &queryDesc->totaltime->walusage,
+ &queryDesc->totaltime->instr.bufusage,
+ &queryDesc->totaltime->instr.walusage,
queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL,
NULL,
queryDesc->estate->es_parallel_workers_to_launch,
@@ -1162,17 +1125,11 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
!IsA(parsetree, ExecuteStmt) &&
!IsA(parsetree, PrepareStmt))
{
- instr_time start;
- instr_time duration;
uint64 rows;
- BufferUsage bufusage_start,
- bufusage;
- WalUsage walusage_start,
- walusage;
+ Instrumentation instr = {0};
- bufusage_start = pgBufferUsage;
- walusage_start = pgWalUsage;
- INSTR_TIME_SET_CURRENT(start);
+ InstrInitOptions(&instr, INSTRUMENT_ALL);
+ InstrStart(&instr);
nesting_level++;
PG_TRY();
@@ -1188,6 +1145,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
}
PG_FINALLY();
{
+ InstrStopFinalize(&instr);
nesting_level--;
}
PG_END_TRY();
@@ -1202,9 +1160,6 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
* former value, which'd otherwise be a good idea.
*/
- INSTR_TIME_SET_CURRENT(duration);
- INSTR_TIME_SUBTRACT(duration, start);
-
/*
* Track the total number of rows retrieved or affected by the utility
* statements of COPY, FETCH, CREATE TABLE AS, CREATE MATERIALIZED
@@ -1216,23 +1171,15 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ?
qc->nprocessed : 0;
- /* calc differences of buffer counters. */
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
-
- /* calc differences of WAL counters. */
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
-
pgss_store(queryString,
saved_queryId,
saved_stmt_location,
saved_stmt_len,
PGSS_EXEC,
- INSTR_TIME_GET_MILLISEC(duration),
+ INSTR_TIME_GET_MILLISEC(instr.total),
rows,
- &bufusage,
- &walusage,
+ &instr.bufusage,
+ &instr.walusage,
NULL,
NULL,
0,
diff --git a/contrib/pg_stat_statements/sql/utility.sql b/contrib/pg_stat_statements/sql/utility.sql
index dd97203c21025..7540e49c73caf 100644
--- a/contrib/pg_stat_statements/sql/utility.sql
+++ b/contrib/pg_stat_statements/sql/utility.sql
@@ -152,6 +152,62 @@ EXPLAIN (costs off) SELECT a FROM generate_series(1,10) AS tab(a) WHERE a = 7;
SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
+-- Buffer stats should flow through EXPLAIN ANALYZE
+CREATE TEMP TABLE flow_through_test (a int, b char(200));
+INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+
+CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$
+DECLARE
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test';
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+
+SELECT run_explain_buffers_test();
+
+-- EXPLAIN entries should have non-zero buffer stats
+SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats
+FROM pg_stat_statements
+WHERE query LIKE 'SELECT run_explain_buffers_test%'
+ORDER BY query COLLATE "C";
+
+DROP FUNCTION run_explain_buffers_test;
+DROP TABLE flow_through_test;
+
+-- Validate buffer/WAL counting during abort
+SET pg_stat_statements.track = 'all';
+CREATE TEMP TABLE pgss_call_tab (a int, b char(20));
+CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20));
+INSERT INTO pgss_call_tab VALUES (0, 'zzz');
+
+CREATE PROCEDURE pgss_call_rollback_proc() AS $$
+DECLARE
+ v int;
+BEGIN
+ EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+CALL pgss_call_rollback_proc();
+
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_call_rollback_proc%'
+ORDER BY query COLLATE "C";
+
+DROP TABLE pgss_call_tab2;
+DROP TABLE pgss_call_tab;
+DROP PROCEDURE pgss_call_rollback_proc;
+SET pg_stat_statements.track = 'top';
+
-- CALL
CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$
DECLARE
diff --git a/contrib/pg_stat_statements/sql/wal.sql b/contrib/pg_stat_statements/sql/wal.sql
index 1dc1552a81ebc..467e321b2062e 100644
--- a/contrib/pg_stat_statements/sql/wal.sql
+++ b/contrib/pg_stat_statements/sql/wal.sql
@@ -18,3 +18,36 @@ wal_records > 0 as wal_records_generated,
wal_records >= rows as wal_records_ge_rows
FROM pg_stat_statements ORDER BY query COLLATE "C";
SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+
+--
+-- Validate buffer/WAL counting with caught exception in PL/pgSQL
+--
+CREATE TEMP TABLE pgss_error_tab (a int, b char(20));
+INSERT INTO pgss_error_tab VALUES (0, 'zzz');
+
+CREATE FUNCTION pgss_error_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+SELECT pgss_error_func();
+
+-- Buffer/WAL usage from the wCTE INSERT should survive the exception
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_error_func%'
+ORDER BY query COLLATE "C";
+
+DROP TABLE pgss_error_tab;
+DROP FUNCTION pgss_error_func;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c
index 192f8011160a9..06673017bcfaf 100644
--- a/contrib/postgres_fdw/connection.c
+++ b/contrib/postgres_fdw/connection.c
@@ -60,6 +60,7 @@ typedef struct ConnCacheEntry
/* Remaining fields are invalid when conn is NULL: */
int xact_depth; /* 0 = no xact open, 1 = main xact open, 2 =
* one level of subxact open, etc */
+ bool xact_read_only; /* xact r/o state */
bool have_prep_stmt; /* have we prepared any stmts in this xact? */
bool have_error; /* have any subxacts aborted in this xact? */
bool changing_xact_state; /* xact state change in process */
@@ -86,6 +87,12 @@ static unsigned int prep_stmt_number = 0;
/* tracks whether any work is needed in callback functions */
static bool xact_got_connection = false;
+/*
+ * tracks the topmost read-only local transaction's nesting level determined
+ * by GetTopReadOnlyTransactionNestLevel()
+ */
+static int read_only_level = 0;
+
/* custom wait event values, retrieved from shared memory */
static uint32 pgfdw_we_cleanup_result = 0;
static uint32 pgfdw_we_connect = 0;
@@ -378,6 +385,7 @@ make_new_connection(ConnCacheEntry *entry, UserMapping *user)
/* Reset all transient state fields, to be sure all are clean */
entry->xact_depth = 0;
+ entry->xact_read_only = false;
entry->have_prep_stmt = false;
entry->have_error = false;
entry->changing_xact_state = false;
@@ -871,29 +879,106 @@ do_sql_command_end(PGconn *conn, const char *sql, bool consume_input)
* those scans. A disadvantage is that we can't provide sane emulation of
* READ COMMITTED behavior --- it would be nice if we had some other way to
* control which remote queries share a snapshot.
+ *
+ * Note also that we always start the remote transaction with the same
+ * read/write and deferrable properties as the local transaction, and start
+ * the remote subtransaction with the same read/write property as the local
+ * subtransaction.
*/
static void
begin_remote_xact(ConnCacheEntry *entry)
{
int curlevel = GetCurrentTransactionNestLevel();
- /* Start main transaction if we haven't yet */
+ /*
+ * If the current local (sub)transaction is read-only, set the topmost
+ * read-only local transaction's nesting level if we haven't yet.
+ *
+ * Note: once it's set, it's retained until the topmost read-only local
+ * transaction is committed/aborted (see pgfdw_xact_callback and
+ * pgfdw_subxact_callback).
+ */
+ if (XactReadOnly)
+ {
+ if (read_only_level == 0)
+ read_only_level = GetTopReadOnlyTransactionNestLevel();
+ Assert(read_only_level > 0);
+ }
+ else
+ Assert(read_only_level == 0);
+
+ /*
+ * Start main transaction if we haven't yet; otherwise, change the current
+ * remote (sub)transaction's read/write mode if needed.
+ */
if (entry->xact_depth <= 0)
{
- const char *sql;
+ /*
+ * This is the case when we haven't yet started a main transaction.
+ */
+ StringInfoData sql;
+ bool ro = (read_only_level == 1);
elog(DEBUG3, "starting remote transaction on connection %p",
entry->conn);
+ initStringInfo(&sql);
+ appendStringInfoString(&sql, "START TRANSACTION ISOLATION LEVEL ");
if (IsolationIsSerializable())
- sql = "START TRANSACTION ISOLATION LEVEL SERIALIZABLE";
+ appendStringInfoString(&sql, "SERIALIZABLE");
else
- sql = "START TRANSACTION ISOLATION LEVEL REPEATABLE READ";
+ appendStringInfoString(&sql, "REPEATABLE READ");
+ if (ro)
+ appendStringInfoString(&sql, " READ ONLY");
+ if (XactDeferrable)
+ appendStringInfoString(&sql, " DEFERRABLE");
entry->changing_xact_state = true;
- do_sql_command(entry->conn, sql);
+ do_sql_command(entry->conn, sql.data);
entry->xact_depth = 1;
+ if (ro)
+ {
+ Assert(!entry->xact_read_only);
+ entry->xact_read_only = true;
+ }
entry->changing_xact_state = false;
}
+ else if (!entry->xact_read_only)
+ {
+ /*
+ * The remote (sub)transaction has been opened in read-write mode.
+ */
+ Assert(read_only_level == 0 ||
+ entry->xact_depth <= read_only_level);
+
+ /*
+ * If its nesting depth matches read_only_level, it means that the
+ * local read-write (sub)transaction that started it has changed to
+ * read-only after that; in which case change it to read-only as well.
+ * Otherwise, the local (sub)transaction is still read-write, so there
+ * is no need to do anything.
+ */
+ if (entry->xact_depth == read_only_level)
+ {
+ entry->changing_xact_state = true;
+ do_sql_command(entry->conn, "SET transaction_read_only = on");
+ entry->xact_read_only = true;
+ entry->changing_xact_state = false;
+ }
+ }
+ else
+ {
+ /*
+ * The remote (sub)transaction has been opened in read-only mode.
+ */
+ Assert(read_only_level > 0 &&
+ entry->xact_depth >= read_only_level);
+
+ /*
+ * The local read-only (sub)transaction that started it is guaranteed
+ * to be still read-only (see check_transaction_read_only), so there
+ * is no need to do anything.
+ */
+ }
/*
* If we're in a subtransaction, stack up savepoints to match our level.
@@ -902,12 +987,21 @@ begin_remote_xact(ConnCacheEntry *entry)
*/
while (entry->xact_depth < curlevel)
{
- char sql[64];
+ StringInfoData sql;
+ bool ro = (entry->xact_depth + 1 == read_only_level);
- snprintf(sql, sizeof(sql), "SAVEPOINT s%d", entry->xact_depth + 1);
+ initStringInfo(&sql);
+ appendStringInfo(&sql, "SAVEPOINT s%d", entry->xact_depth + 1);
+ if (ro)
+ appendStringInfoString(&sql, "; SET transaction_read_only = on");
entry->changing_xact_state = true;
- do_sql_command(entry->conn, sql);
+ do_sql_command(entry->conn, sql.data);
entry->xact_depth++;
+ if (ro)
+ {
+ Assert(!entry->xact_read_only);
+ entry->xact_read_only = true;
+ }
entry->changing_xact_state = false;
}
}
@@ -1212,6 +1306,9 @@ pgfdw_xact_callback(XactEvent event, void *arg)
/* Also reset cursor numbering for next transaction */
cursor_number = 0;
+
+ /* Likewise for read_only_level */
+ read_only_level = 0;
}
/*
@@ -1310,6 +1407,10 @@ pgfdw_subxact_callback(SubXactEvent event, SubTransactionId mySubid,
false);
}
}
+
+ /* If in read_only_level, reset it */
+ if (curlevel == read_only_level)
+ read_only_level = 0;
}
/*
@@ -1412,6 +1513,9 @@ pgfdw_reset_xact_state(ConnCacheEntry *entry, bool toplevel)
/* Reset state to show we're out of a transaction */
entry->xact_depth = 0;
+ /* Reset xact r/o state */
+ entry->xact_read_only = false;
+
/*
* If the connection isn't in a good idle state, it is marked as
* invalid or keep_connections option of its server is disabled, then
@@ -1432,6 +1536,10 @@ pgfdw_reset_xact_state(ConnCacheEntry *entry, bool toplevel)
{
/* Reset state to show we're out of a subtransaction */
entry->xact_depth--;
+
+ /* If in read_only_level, reset xact r/o state */
+ if (entry->xact_depth + 1 == read_only_level)
+ entry->xact_read_only = false;
}
}
diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index ac34a1acacb62..cd22553236f05 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -12575,6 +12575,142 @@ SELECT count(*) FROM remote_application_name
DROP FOREIGN TABLE remote_application_name;
DROP VIEW my_application_name;
-- ===================================================================
+-- test read-only and/or deferrable transactions
+-- ===================================================================
+CREATE TABLE loct (f1 int, f2 text);
+CREATE FUNCTION locf() RETURNS SETOF loct LANGUAGE SQL AS
+ 'UPDATE public.loct SET f2 = f2 || f2 RETURNING *';
+CREATE VIEW locv AS SELECT t.* FROM locf() t;
+CREATE FOREIGN TABLE remt (f1 int, f2 text)
+ SERVER loopback OPTIONS (table_name 'locv');
+CREATE FOREIGN TABLE remt2 (f1 int, f2 text)
+ SERVER loopback2 OPTIONS (table_name 'locv');
+INSERT INTO loct VALUES (1, 'foo'), (2, 'bar');
+START TRANSACTION READ ONLY;
+SAVEPOINT s;
+SELECT * FROM remt; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SELECT * FROM remt; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK;
+START TRANSACTION;
+SAVEPOINT s;
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK;
+START TRANSACTION;
+SAVEPOINT s;
+SELECT * FROM remt; -- should work
+ f1 | f2
+----+--------
+ 1 | foofoo
+ 2 | barbar
+(2 rows)
+
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SELECT * FROM remt; -- should work
+ f1 | f2
+----+--------
+ 1 | foofoo
+ 2 | barbar
+(2 rows)
+
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK;
+-- Exercise abort code paths in pgfdw_xact_callback/pgfdw_subxact_callback
+-- in situations where multiple connections are involved
+START TRANSACTION;
+SAVEPOINT s;
+SELECT * FROM remt; -- should work
+ f1 | f2
+----+--------
+ 1 | foofoo
+ 2 | barbar
+(2 rows)
+
+SET transaction_read_only = on;
+SELECT * FROM remt2; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SELECT * FROM remt; -- should work
+ f1 | f2
+----+--------
+ 1 | foofoo
+ 2 | barbar
+(2 rows)
+
+SET transaction_read_only = on;
+SELECT * FROM remt2; -- should fail
+ERROR: cannot execute UPDATE in a read-only transaction
+CONTEXT: SQL function "locf" statement 1
+remote SQL command: SELECT f1, f2 FROM public.locv
+ROLLBACK;
+DROP FOREIGN TABLE remt;
+CREATE FOREIGN TABLE remt (f1 int, f2 text)
+ SERVER loopback OPTIONS (table_name 'loct');
+START TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY;
+SELECT * FROM remt;
+ f1 | f2
+----+-----
+ 1 | foo
+ 2 | bar
+(2 rows)
+
+COMMIT;
+START TRANSACTION ISOLATION LEVEL SERIALIZABLE DEFERRABLE;
+SELECT * FROM remt;
+ f1 | f2
+----+-----
+ 1 | foo
+ 2 | bar
+(2 rows)
+
+COMMIT;
+START TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+SELECT * FROM remt;
+ f1 | f2
+----+-----
+ 1 | foo
+ 2 | bar
+(2 rows)
+
+COMMIT;
+-- Clean up
+DROP FOREIGN TABLE remt;
+DROP FOREIGN TABLE remt2;
+DROP VIEW locv;
+DROP FUNCTION locf();
+DROP TABLE loct;
+-- ===================================================================
-- test parallel commit and parallel abort
-- ===================================================================
ALTER SERVER loopback OPTIONS (ADD parallel_commit 'true');
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 41e47cc795ba8..cc8ec24c30eb0 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -2779,7 +2779,7 @@ postgresIterateDirectModify(ForeignScanState *node)
if (!resultRelInfo->ri_projectReturning)
{
TupleTableSlot *slot = node->ss.ss_ScanTupleSlot;
- Instrumentation *instr = node->ss.ps.instrument;
+ NodeInstrumentation *instr = node->ss.ps.instrument;
Assert(!dmstate->has_returning);
diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql
index 0e218b29a29ed..59963e298b846 100644
--- a/contrib/postgres_fdw/sql/postgres_fdw.sql
+++ b/contrib/postgres_fdw/sql/postgres_fdw.sql
@@ -4328,6 +4328,86 @@ SELECT count(*) FROM remote_application_name
DROP FOREIGN TABLE remote_application_name;
DROP VIEW my_application_name;
+-- ===================================================================
+-- test read-only and/or deferrable transactions
+-- ===================================================================
+CREATE TABLE loct (f1 int, f2 text);
+CREATE FUNCTION locf() RETURNS SETOF loct LANGUAGE SQL AS
+ 'UPDATE public.loct SET f2 = f2 || f2 RETURNING *';
+CREATE VIEW locv AS SELECT t.* FROM locf() t;
+CREATE FOREIGN TABLE remt (f1 int, f2 text)
+ SERVER loopback OPTIONS (table_name 'locv');
+CREATE FOREIGN TABLE remt2 (f1 int, f2 text)
+ SERVER loopback2 OPTIONS (table_name 'locv');
+INSERT INTO loct VALUES (1, 'foo'), (2, 'bar');
+
+START TRANSACTION READ ONLY;
+SAVEPOINT s;
+SELECT * FROM remt; -- should fail
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SELECT * FROM remt; -- should fail
+ROLLBACK;
+
+START TRANSACTION;
+SAVEPOINT s;
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ROLLBACK;
+
+START TRANSACTION;
+SAVEPOINT s;
+SELECT * FROM remt; -- should work
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SELECT * FROM remt; -- should work
+SET transaction_read_only = on;
+SELECT * FROM remt; -- should fail
+ROLLBACK;
+
+-- Exercise abort code paths in pgfdw_xact_callback/pgfdw_subxact_callback
+-- in situations where multiple connections are involved
+START TRANSACTION;
+SAVEPOINT s;
+SELECT * FROM remt; -- should work
+SET transaction_read_only = on;
+SELECT * FROM remt2; -- should fail
+ROLLBACK TO s;
+RELEASE SAVEPOINT s;
+SELECT * FROM remt; -- should work
+SET transaction_read_only = on;
+SELECT * FROM remt2; -- should fail
+ROLLBACK;
+
+DROP FOREIGN TABLE remt;
+CREATE FOREIGN TABLE remt (f1 int, f2 text)
+ SERVER loopback OPTIONS (table_name 'loct');
+
+START TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY;
+SELECT * FROM remt;
+COMMIT;
+
+START TRANSACTION ISOLATION LEVEL SERIALIZABLE DEFERRABLE;
+SELECT * FROM remt;
+COMMIT;
+
+START TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+SELECT * FROM remt;
+COMMIT;
+
+-- Clean up
+DROP FOREIGN TABLE remt;
+DROP FOREIGN TABLE remt2;
+DROP VIEW locv;
+DROP FUNCTION locf();
+DROP TABLE loct;
+
-- ===================================================================
-- test parallel commit and parallel abort
-- ===================================================================
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml
index 604e8578a8dcd..d28f4f22535b9 100644
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -734,6 +734,7 @@ WHERE t1.unique1 < 10 AND t1.unique2 = t2.unique2;
-> Index Scan using tenk2_unique2 on tenk2 t2 (cost=0.29..7.90 rows=1 width=244) (actual time=0.003..0.003 rows=1.00 loops=10)
Index Cond: (unique2 = t1.unique2)
Index Searches: 10
+ Table Buffers: shared hit=10
Buffers: shared hit=24 read=6
Planning:
Buffers: shared hit=15 dirtied=9
@@ -1005,7 +1006,8 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)';
Index Cond: (f1 @> '((0.5,2))'::polygon)
Rows Removed by Index Recheck: 1
Index Searches: 1
- Buffers: shared hit=1
+ Table Buffers: shared hit=1
+ Buffers: shared hit=2
Planning Time: 0.039 ms
Execution Time: 0.098 ms
@@ -1014,7 +1016,9 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)';
then rejected by a recheck of the index condition. This happens because a
GiST index is lossy
for polygon containment tests: it actually
returns the rows with polygons that overlap the target, and then we have
- to do the exact containment test on those rows.
+ to do the exact containment test on those rows. The Table Buffers
+ counts indicate how many buffer accesses were performed on the table
+ rather than on the index. This number is included in the Buffers counts.
@@ -1203,13 +1207,14 @@ EXPLAIN ANALYZE SELECT * FROM tenk1 WHERE unique1 < 100 AND unique2 > 9000
QUERY PLAN
-------------------------------------------------------------------&zwsp;------------------------------------------------------------
Limit (cost=0.29..14.33 rows=2 width=244) (actual time=0.051..0.071 rows=2.00 loops=1)
- Buffers: shared hit=16
+ Buffers: shared hit=14
-> Index Scan using tenk1_unique2 on tenk1 (cost=0.29..70.50 rows=10 width=244) (actual time=0.051..0.070 rows=2.00 loops=1)
Index Cond: (unique2 > 9000)
Filter: (unique1 < 100)
Rows Removed by Filter: 287
Index Searches: 1
- Buffers: shared hit=16
+ Table Buffers: shared hit=11
+ Buffers: shared hit=14
Planning Time: 0.077 ms
Execution Time: 0.086 ms
diff --git a/doc/src/sgml/postgres-fdw.sgml b/doc/src/sgml/postgres-fdw.sgml
index de69ddcdebcc7..9185c76f93290 100644
--- a/doc/src/sgml/postgres-fdw.sgml
+++ b/doc/src/sgml/postgres-fdw.sgml
@@ -1103,6 +1103,23 @@ CREATE SUBSCRIPTION my_subscription SERVER subscription_server PUBLICATION testp
PostgreSQL release might modify these rules.
+
+ The remote transaction is opened in the same read/write mode as the local
+ transaction: if the local transaction is READ ONLY,
+ the remote transaction is opened in READ ONLY mode,
+ otherwise it is opened in READ WRITE mode.
+ (This rule is also applied to remote and local subtransactions.)
+ Note that this does not prevent login triggers executed on the remote
+ server from writing.
+
+
+
+ The remote transaction is also opened in the same deferrable mode as the
+ local transaction: if the local transaction is DEFERRABLE,
+ the remote transaction is opened in DEFERRABLE mode,
+ otherwise it is opened in NOT DEFERRABLE mode.
+
+
Note that it is currently not supported by
postgres_fdw to prepare the remote transaction for
diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
index 5b8b521802ee5..71070736acb98 100644
--- a/doc/src/sgml/ref/explain.sgml
+++ b/doc/src/sgml/ref/explain.sgml
@@ -509,6 +509,7 @@ EXPLAIN ANALYZE EXECUTE query(100, 200);
-> Index Scan using test_pkey on test (cost=0.29..10.27 rows=99 width=8) (actual time=0.009..0.025 rows=99.00 loops=1)
Index Cond: ((id > 100) AND (id < 200))
Index Searches: 1
+ Table Buffers: shared hit=1
Buffers: shared hit=4
Planning Time: 0.244 ms
Execution Time: 0.073 ms
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index bdb30752e098c..9e545b4ef0e66 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -51,8 +51,7 @@
#define PARALLEL_KEY_BRIN_SHARED UINT64CONST(0xB000000000000001)
#define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
-#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
-#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
+#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xB000000000000004)
/*
* Status for index builds performed in parallel. This is allocated in a
@@ -148,8 +147,7 @@ typedef struct BrinLeader
BrinShared *brinshared;
Sharedsort *sharedsort;
Snapshot snapshot;
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ Instrumentation *instr;
} BrinLeader;
/*
@@ -2387,8 +2385,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
BrinShared *brinshared;
Sharedsort *sharedsort;
BrinLeader *brinleader = palloc0_object(BrinLeader);
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ Instrumentation *instr;
bool leaderparticipates = true;
int querylen;
@@ -2430,18 +2427,14 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
shm_toc_estimate_keys(&pcxt->estimator, 2);
/*
- * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
- * and PARALLEL_KEY_BUFFER_USAGE.
+ * Estimate space for Instrumentation -- PARALLEL_KEY_INSTRUMENTATION.
*
* If there are no extensions loaded that care, we could skip this. We
- * have no way of knowing whether anyone's looking at pgWalUsage or
- * pgBufferUsage, so do it unconditionally.
+ * have no way of knowing whether anyone's looking at instrumentation, so
+ * do it unconditionally.
*/
shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_estimate_keys(&pcxt->estimator, 1);
- shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
shm_toc_estimate_keys(&pcxt->estimator, 1);
/* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
@@ -2514,15 +2507,12 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
}
/*
- * Allocate space for each worker's WalUsage and BufferUsage; no need to
+ * Allocate space for each worker's Instrumentation; no need to
* initialize.
*/
- walusage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
- bufferusage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
+ instr = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr);
/* Launch workers, saving status for leader/caller */
LaunchParallelWorkers(pcxt);
@@ -2533,8 +2523,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
brinleader->brinshared = brinshared;
brinleader->sharedsort = sharedsort;
brinleader->snapshot = snapshot;
- brinleader->walusage = walusage;
- brinleader->bufferusage = bufferusage;
+ brinleader->instr = instr;
/* If no workers were successfully launched, back out (do serial build) */
if (pcxt->nworkers_launched == 0)
@@ -2573,7 +2562,7 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
* or we might get incomplete data.)
*/
for (i = 0; i < brinleader->pcxt->nworkers_launched; i++)
- InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]);
+ InstrAccumParallelQuery(&brinleader->instr[i]);
/* Free last reference to MVCC snapshot, if one was used */
if (IsMVCCSnapshot(brinleader->snapshot))
@@ -2887,8 +2876,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Relation indexRel;
LOCKMODE heapLockmode;
LOCKMODE indexLockmode;
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ QueryInstrumentation *instr;
+ Instrumentation *worker_instr;
int sortmem;
/*
@@ -2936,7 +2925,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
tuplesort_attach_shared(sharedsort, seg);
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/*
* Might as well use reliable figure when doling out maintenance_work_mem
@@ -2949,10 +2938,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
heapRel, indexRel, sortmem, false);
/* Report WAL/buffer usage during parallel execution */
- bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
- walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
- &walusage[ParallelWorkerNumber]);
+ worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false);
+ InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]);
index_close(indexRel, indexLockmode);
table_close(heapRel, heapLockmode);
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index 9d83a4957757b..f3de62ce7f339 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -45,8 +45,7 @@
#define PARALLEL_KEY_GIN_SHARED UINT64CONST(0xB000000000000001)
#define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
-#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
-#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
+#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xB000000000000004)
/*
* Status for index builds performed in parallel. This is allocated in a
@@ -138,8 +137,7 @@ typedef struct GinLeader
GinBuildShared *ginshared;
Sharedsort *sharedsort;
Snapshot snapshot;
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ Instrumentation *instr;
} GinLeader;
typedef struct
@@ -945,8 +943,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
GinBuildShared *ginshared;
Sharedsort *sharedsort;
GinLeader *ginleader = palloc0_object(GinLeader);
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ Instrumentation *instr;
bool leaderparticipates = true;
int querylen;
@@ -987,18 +984,14 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
shm_toc_estimate_keys(&pcxt->estimator, 2);
/*
- * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
- * and PARALLEL_KEY_BUFFER_USAGE.
+ * Estimate space for Instrumentation -- PARALLEL_KEY_INSTRUMENTATION.
*
* If there are no extensions loaded that care, we could skip this. We
- * have no way of knowing whether anyone's looking at pgWalUsage or
- * pgBufferUsage, so do it unconditionally.
+ * have no way of knowing whether anyone's looking at instrumentation, so
+ * do it unconditionally.
*/
shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_estimate_keys(&pcxt->estimator, 1);
- shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
shm_toc_estimate_keys(&pcxt->estimator, 1);
/* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
@@ -1066,15 +1059,12 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
}
/*
- * Allocate space for each worker's WalUsage and BufferUsage; no need to
+ * Allocate space for each worker's Instrumentation; no need to
* initialize.
*/
- walusage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
- bufferusage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
+ instr = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr);
/* Launch workers, saving status for leader/caller */
LaunchParallelWorkers(pcxt);
@@ -1085,8 +1075,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
ginleader->ginshared = ginshared;
ginleader->sharedsort = sharedsort;
ginleader->snapshot = snapshot;
- ginleader->walusage = walusage;
- ginleader->bufferusage = bufferusage;
+ ginleader->instr = instr;
/* If no workers were successfully launched, back out (do serial build) */
if (pcxt->nworkers_launched == 0)
@@ -1125,7 +1114,7 @@ _gin_end_parallel(GinLeader *ginleader, GinBuildState *state)
* or we might get incomplete data.)
*/
for (i = 0; i < ginleader->pcxt->nworkers_launched; i++)
- InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]);
+ InstrAccumParallelQuery(&ginleader->instr[i]);
/* Free last reference to MVCC snapshot, if one was used */
if (IsMVCCSnapshot(ginleader->snapshot))
@@ -2118,8 +2107,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Relation indexRel;
LOCKMODE heapLockmode;
LOCKMODE indexLockmode;
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ QueryInstrumentation *instr;
+ Instrumentation *worker_instr;
int sortmem;
/*
@@ -2186,7 +2175,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
tuplesort_attach_shared(sharedsort, seg);
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/*
* Might as well use reliable figure when doling out maintenance_work_mem
@@ -2199,10 +2188,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
heapRel, indexRel, sortmem, false);
/* Report WAL/buffer usage during parallel execution */
- bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
- walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
- &walusage[ParallelWorkerNumber]);
+ worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false);
+ InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]);
index_close(indexRel, indexLockmode);
table_close(heapRel, heapLockmode);
diff --git a/src/backend/access/heap/heapam_indexscan.c b/src/backend/access/heap/heapam_indexscan.c
index bbd8a165ddc23..33d14f1de7d52 100644
--- a/src/backend/access/heap/heapam_indexscan.c
+++ b/src/backend/access/heap/heapam_indexscan.c
@@ -41,20 +41,13 @@ heapam_index_fetch_begin(Relation rel, uint32 flags)
void
heapam_index_fetch_reset(IndexFetchTableData *scan)
{
- IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
-
- if (BufferIsValid(hscan->xs_cbuf))
- {
- ReleaseBuffer(hscan->xs_cbuf);
- hscan->xs_cbuf = InvalidBuffer;
- hscan->xs_blk = InvalidBlockNumber;
- }
-
- if (BufferIsValid(hscan->xs_vmbuffer))
- {
- ReleaseBuffer(hscan->xs_vmbuffer);
- hscan->xs_vmbuffer = InvalidBuffer;
- }
+ /*
+ * Resets are a no-op.
+ *
+ * Deliberately avoid dropping pins now held in xs_cbuf and xs_vmbuffer.
+ * This saves cycles during certain tight nested loop joins (it can avoid
+ * repeated pinning and unpinning of the same buffer across rescans).
+ */
}
void
@@ -62,7 +55,13 @@ heapam_index_fetch_end(IndexFetchTableData *scan)
{
IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
- heapam_index_fetch_reset(scan);
+ /* drop pin if there's a pinned heap page */
+ if (BufferIsValid(hscan->xs_cbuf))
+ ReleaseBuffer(hscan->xs_cbuf);
+
+ /* drop pin if there's a pinned visibility map page */
+ if (BufferIsValid(hscan->xs_vmbuffer))
+ ReleaseBuffer(hscan->xs_vmbuffer);
pfree(hscan);
}
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 88c71cd85b60b..291d9d67bc295 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -637,8 +637,7 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params,
TimestampTz starttime = 0;
PgStat_Counter startreadtime = 0,
startwritetime = 0;
- WalUsage startwalusage = pgWalUsage;
- BufferUsage startbufferusage = pgBufferUsage;
+ QueryInstrumentation *instr = NULL;
ErrorContextCallback errcallback;
char **indnames = NULL;
Size dead_items_max_bytes = 0;
@@ -654,6 +653,8 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params,
startreadtime = pgStatBlockReadTime;
startwritetime = pgStatBlockWriteTime;
}
+ instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL);
+ InstrQueryStart(instr);
}
/* Used for instrumentation and stats report */
@@ -984,14 +985,14 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params,
{
TimestampTz endtime = GetCurrentTimestamp();
+ InstrQueryStopFinalize(instr);
+
if (verbose || params->log_vacuum_min_duration == 0 ||
TimestampDifferenceExceeds(starttime, endtime,
params->log_vacuum_min_duration))
{
long secs_dur;
int usecs_dur;
- WalUsage walusage;
- BufferUsage bufferusage;
StringInfoData buf;
char *msgfmt;
int32 diff;
@@ -1000,12 +1001,10 @@ heap_vacuum_rel(Relation rel, const VacuumParams *params,
int64 total_blks_hit;
int64 total_blks_read;
int64 total_blks_dirtied;
+ BufferUsage bufferusage = instr->instr.bufusage;
+ WalUsage walusage = instr->instr.walusage;
TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur);
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage);
- memset(&bufferusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage);
total_blks_hit = bufferusage.shared_blks_hit +
bufferusage.local_blks_hit;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 44496ae0963e1..23288a4f99490 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -375,7 +375,7 @@ index_rescan(IndexScanDesc scan,
Assert(nkeys == scan->numberOfKeys);
Assert(norderbys == scan->numberOfOrderBys);
- /* Release resources (like buffer pins) from table accesses */
+ /* reset table AM state for rescan */
if (scan->xs_heapfetch)
table_index_fetch_reset(scan->xs_heapfetch);
@@ -452,7 +452,7 @@ index_restrpos(IndexScanDesc scan)
SCAN_CHECKS;
CHECK_SCAN_PROCEDURE(amrestrpos);
- /* release resources (like buffer pins) from table accesses */
+ /* reset table AM state for restoring the marked position */
if (scan->xs_heapfetch)
table_index_fetch_reset(scan->xs_heapfetch);
@@ -578,6 +578,7 @@ index_parallelrescan(IndexScanDesc scan)
{
SCAN_CHECKS;
+ /* reset table AM state for rescan */
if (scan->xs_heapfetch)
table_index_fetch_reset(scan->xs_heapfetch);
@@ -659,7 +660,7 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
/* If we're out of index entries, we're done */
if (!found)
{
- /* release resources (like buffer pins) from table accesses */
+ /* reset table AM state */
if (scan->xs_heapfetch)
table_index_fetch_reset(scan->xs_heapfetch);
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 756dfa3dcf47e..cb238f862a7c8 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -66,8 +66,7 @@
#define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
#define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
-#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005)
-#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006)
+#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xA000000000000005)
/*
* DISABLE_LEADER_PARTICIPATION disables the leader's participation in
@@ -195,8 +194,7 @@ typedef struct BTLeader
Sharedsort *sharedsort;
Sharedsort *sharedsort2;
Snapshot snapshot;
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ Instrumentation *instr;
} BTLeader;
/*
@@ -1408,8 +1406,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
Sharedsort *sharedsort2;
BTSpool *btspool = buildstate->spool;
BTLeader *btleader = palloc0_object(BTLeader);
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ Instrumentation *instr;
bool leaderparticipates = true;
int querylen;
@@ -1462,18 +1459,14 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
}
/*
- * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
- * and PARALLEL_KEY_BUFFER_USAGE.
+ * Estimate space for Instrumentation -- PARALLEL_KEY_INSTRUMENTATION.
*
* If there are no extensions loaded that care, we could skip this. We
- * have no way of knowing whether anyone's looking at pgWalUsage or
- * pgBufferUsage, so do it unconditionally.
+ * have no way of knowing whether anyone's looking at instrumentation, so
+ * do it unconditionally.
*/
shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_estimate_keys(&pcxt->estimator, 1);
- shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
shm_toc_estimate_keys(&pcxt->estimator, 1);
/* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
@@ -1560,15 +1553,12 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
}
/*
- * Allocate space for each worker's WalUsage and BufferUsage; no need to
+ * Allocate space for each worker's Instrumentation; no need to
* initialize.
*/
- walusage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
- bufferusage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
+ instr = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr);
/* Launch workers, saving status for leader/caller */
LaunchParallelWorkers(pcxt);
@@ -1580,8 +1570,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
btleader->sharedsort = sharedsort;
btleader->sharedsort2 = sharedsort2;
btleader->snapshot = snapshot;
- btleader->walusage = walusage;
- btleader->bufferusage = bufferusage;
+ btleader->instr = instr;
/* If no workers were successfully launched, back out (do serial build) */
if (pcxt->nworkers_launched == 0)
@@ -1620,7 +1609,7 @@ _bt_end_parallel(BTLeader *btleader)
* or we might get incomplete data.)
*/
for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
- InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
+ InstrAccumParallelQuery(&btleader->instr[i]);
/* Free last reference to MVCC snapshot, if one was used */
if (IsMVCCSnapshot(btleader->snapshot))
@@ -1753,8 +1742,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Relation indexRel;
LOCKMODE heapLockmode;
LOCKMODE indexLockmode;
- WalUsage *walusage;
- BufferUsage *bufferusage;
+ QueryInstrumentation *instr;
+ Instrumentation *worker_instr;
int sortmem;
#ifdef BTREE_BUILD_STATS
@@ -1828,7 +1817,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
}
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/* Perform sorting of spool, and possibly a spool2 */
sortmem = maintenance_work_mem / btshared->scantuplesortstates;
@@ -1836,10 +1825,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
sharedsort2, sortmem, false);
/* Report WAL/buffer usage during parallel execution */
- bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
- walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
- &walusage[ParallelWorkerNumber]);
+ worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false);
+ InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]);
#ifdef BTREE_BUILD_STATS
if (log_btree_build_stats)
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index aafc53e016467..48bc90c967353 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -1046,6 +1046,34 @@ TransactionStartedDuringRecovery(void)
return CurrentTransactionState->startedInRecovery;
}
+/*
+ * GetTopReadOnlyTransactionNestLevel
+ *
+ * Note: this returns zero when not inside any transaction, or when neither
+ * the top-level transaction nor any subtransaction is read-only; one when the
+ * top-level transaction is read-only; two when the first-level subtransaction
+ * is read-only, etc.
+ *
+ * Note: subtransactions of the topmost read-only transaction are also
+ * read-only, because they inherit read-only mode from the transaction, and
+ * thus can't change to read-write mode (see check_transaction_read_only).
+ */
+int
+GetTopReadOnlyTransactionNestLevel(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (!XactReadOnly)
+ return 0;
+ while (s->nestingLevel > 1)
+ {
+ if (!s->prevXactReadOnly)
+ return s->nestingLevel;
+ s = s->parent;
+ }
+ return s->nestingLevel;
+}
+
/*
* EnterParallelMode
*/
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 9e8999bbb616f..71c9a26566236 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -1103,10 +1103,10 @@ XLogInsertRecord(XLogRecData *rdata,
/* Report WAL traffic to the instrumentation. */
if (inserted)
{
- pgWalUsage.wal_bytes += rechdr->xl_tot_len;
- pgWalUsage.wal_records++;
- pgWalUsage.wal_fpi += num_fpi;
- pgWalUsage.wal_fpi_bytes += fpi_bytes;
+ INSTR_WALUSAGE_ADD(wal_bytes, rechdr->xl_tot_len);
+ INSTR_WALUSAGE_INCR(wal_records);
+ INSTR_WALUSAGE_ADD(wal_fpi, num_fpi);
+ INSTR_WALUSAGE_ADD(wal_fpi_bytes, fpi_bytes);
/* Required for the flush of pending stats WAL data */
pgstat_report_fixed = true;
@@ -2085,7 +2085,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
WriteRqst.Flush = InvalidXLogRecPtr;
XLogWrite(WriteRqst, tli, false);
LWLockRelease(WALWriteLock);
- pgWalUsage.wal_buffers_full++;
+ INSTR_WALUSAGE_INCR(wal_buffers_full);
TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
/*
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index c52c0a6023ddf..ebd41176b9446 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -1184,7 +1184,7 @@ build_indices(void)
heap = table_open(ILHead->il_heap, NoLock);
ind = index_open(ILHead->il_ind, NoLock);
- index_build(heap, ind, ILHead->il_info, false, false);
+ index_build(heap, ind, ILHead->il_info, false, false, false);
index_close(ind, NoLock);
table_close(heap, NoLock);
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 5748aa9a1a9af..ae6b7cda3ddfe 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -3570,7 +3570,8 @@ RelationTruncateIndexes(Relation heapRelation)
/* Initialize the index and rebuild */
/* Note: we do not need to re-establish pkey setting */
- index_build(heapRelation, currentIndex, indexInfo, true, false);
+ index_build(heapRelation, currentIndex, indexInfo, true, false,
+ true);
/* We're done with this index */
index_close(currentIndex, NoLock);
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 1ccfa687f052b..9407c357f2716 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -715,6 +715,9 @@ UpdateIndexRelation(Oid indexoid,
* already exists.
* INDEX_CREATE_PARTITIONED:
* create a partitioned index (table must be partitioned)
+ * INDEX_CREATE_SUPPRESS_PROGRESS:
+ * don't report progress during the index build.
+ *
* constr_flags: flags passed to index_constraint_create
* (only if INDEX_CREATE_ADD_CONSTRAINT is set)
* allow_system_table_mods: allow table to be a system catalog
@@ -760,6 +763,7 @@ index_create(Relation heapRelation,
bool invalid = (flags & INDEX_CREATE_INVALID) != 0;
bool concurrent = (flags & INDEX_CREATE_CONCURRENT) != 0;
bool partitioned = (flags & INDEX_CREATE_PARTITIONED) != 0;
+ bool progress = (flags & INDEX_CREATE_SUPPRESS_PROGRESS) == 0;
char relkind;
TransactionId relfrozenxid;
MultiXactId relminmxid;
@@ -1276,7 +1280,8 @@ index_create(Relation heapRelation,
}
else
{
- index_build(heapRelation, indexRelation, indexInfo, false, true);
+ index_build(heapRelation, indexRelation, indexInfo, false, true,
+ progress);
}
/*
@@ -1289,22 +1294,23 @@ index_create(Relation heapRelation,
}
/*
- * index_concurrently_create_copy
+ * index_create_copy
*
- * Create concurrently an index based on the definition of the one provided by
- * caller. The index is inserted into catalogs and needs to be built later
- * on. This is called during concurrent reindex processing.
+ * Create an index based on the definition of the one provided by caller. The
+ * index is inserted into catalogs. 'flags' are passed directly to
+ * index_create.
*
* "tablespaceOid" is the tablespace to use for this index.
*/
Oid
-index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId,
- Oid tablespaceOid, const char *newName)
+index_create_copy(Relation heapRelation, uint16 flags,
+ Oid oldIndexId, Oid tablespaceOid, const char *newName)
{
Relation indexRelation;
IndexInfo *oldInfo,
*newInfo;
Oid newIndexId = InvalidOid;
+ bool concurrently = (flags & INDEX_CREATE_CONCURRENT) != 0;
HeapTuple indexTuple,
classTuple;
Datum indclassDatum,
@@ -1328,7 +1334,7 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId,
* Concurrent build of an index with exclusion constraints is not
* supported.
*/
- if (oldInfo->ii_ExclusionOps != NULL)
+ if (oldInfo->ii_ExclusionOps != NULL && concurrently)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("concurrent index creation for exclusion constraints is not supported")));
@@ -1384,9 +1390,7 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId,
}
/*
- * Build the index information for the new index. Note that rebuild of
- * indexes with exclusion constraints is not supported, hence there is no
- * need to fill all the ii_Exclusion* fields.
+ * Build the index information for the new index.
*/
newInfo = makeIndexInfo(oldInfo->ii_NumIndexAttrs,
oldInfo->ii_NumIndexKeyAttrs,
@@ -1395,11 +1399,24 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId,
indexPreds,
oldInfo->ii_Unique,
oldInfo->ii_NullsNotDistinct,
- false, /* not ready for inserts */
- true,
+ !concurrently, /* isready */
+ concurrently, /* concurrent */
indexRelation->rd_indam->amsummarizing,
oldInfo->ii_WithoutOverlaps);
+ /* fetch exclusion constraint info if any */
+ if (indexRelation->rd_index->indisexclusion)
+ {
+ /*
+ * XXX Beware: we're making newInfo point to oldInfo-owned memory. It
+ * would be more orthodox to palloc+memcpy, but we don't need that
+ * here at present.
+ */
+ newInfo->ii_ExclusionOps = oldInfo->ii_ExclusionOps;
+ newInfo->ii_ExclusionProcs = oldInfo->ii_ExclusionProcs;
+ newInfo->ii_ExclusionStrats = oldInfo->ii_ExclusionStrats;
+ }
+
/*
* Extract the list of column names and the column numbers for the new
* index information. All this information will be used for the index
@@ -1459,7 +1476,7 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId,
indcoloptions->values,
stattargets,
reloptionsDatum,
- INDEX_CREATE_SKIP_BUILD | INDEX_CREATE_CONCURRENT,
+ flags,
0,
true, /* allow table to be a system catalog? */
false, /* is_internal? */
@@ -1523,7 +1540,7 @@ index_concurrently_build(Oid heapRelationId,
indexInfo->ii_BrokenHotChain = false;
/* Now build the index */
- index_build(heapRel, indexRelation, indexInfo, false, true);
+ index_build(heapRel, indexRelation, indexInfo, false, true, true);
/* Roll back any GUC changes executed by index functions */
AtEOXact_GUC(false, save_nestlevel);
@@ -2994,6 +3011,7 @@ index_update_stats(Relation rel,
*
* isreindex indicates we are recreating a previously-existing index.
* parallel indicates if parallelism may be useful.
+ * progress indicates if the backend should update its progress info.
*
* Note: before Postgres 8.2, the passed-in heap and index Relations
* were automatically closed by this routine. This is no longer the case.
@@ -3004,7 +3022,8 @@ index_build(Relation heapRelation,
Relation indexRelation,
IndexInfo *indexInfo,
bool isreindex,
- bool parallel)
+ bool parallel,
+ bool progress)
{
IndexBuildResult *stats;
Oid save_userid;
@@ -3055,6 +3074,7 @@ index_build(Relation heapRelation,
RestrictSearchPath();
/* Set up initial progress report status */
+ if (progress)
{
const int progress_index[] = {
PROGRESS_CREATEIDX_PHASE,
@@ -3812,7 +3832,7 @@ reindex_index(const ReindexStmt *stmt, Oid indexId,
/* Initialize the index and rebuild */
/* Note: we do not need to re-establish pkey setting */
- index_build(heapRelation, iRel, indexInfo, true, true);
+ index_build(heapRelation, iRel, indexInfo, true, true, progress);
/* Re-allow use of target index */
ResetReindexProcessing();
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 49a5cdf579c16..10f8a2dc81cd5 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -309,9 +309,7 @@ do_analyze_rel(Relation onerel, const VacuumParams *params,
Oid save_userid;
int save_sec_context;
int save_nestlevel;
- WalUsage startwalusage = pgWalUsage;
- BufferUsage startbufferusage = pgBufferUsage;
- BufferUsage bufferusage;
+ QueryInstrumentation *instr = NULL;
PgStat_Counter startreadtime = 0;
PgStat_Counter startwritetime = 0;
@@ -362,6 +360,9 @@ do_analyze_rel(Relation onerel, const VacuumParams *params,
}
pg_rusage_init(&ru0);
+
+ instr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL);
+ InstrQueryStart(instr);
}
/* Used for instrumentation and stats report */
@@ -742,12 +743,13 @@ do_analyze_rel(Relation onerel, const VacuumParams *params,
{
TimestampTz endtime = GetCurrentTimestamp();
+ InstrQueryStopFinalize(instr);
+
if (verbose || params->log_analyze_min_duration == 0 ||
TimestampDifferenceExceeds(starttime, endtime,
params->log_analyze_min_duration))
{
long delay_in_ms;
- WalUsage walusage;
double read_rate = 0;
double write_rate = 0;
char *msgfmt;
@@ -755,18 +757,15 @@ do_analyze_rel(Relation onerel, const VacuumParams *params,
int64 total_blks_hit;
int64 total_blks_read;
int64 total_blks_dirtied;
-
- memset(&bufferusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage);
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage);
-
- total_blks_hit = bufferusage.shared_blks_hit +
- bufferusage.local_blks_hit;
- total_blks_read = bufferusage.shared_blks_read +
- bufferusage.local_blks_read;
- total_blks_dirtied = bufferusage.shared_blks_dirtied +
- bufferusage.local_blks_dirtied;
+ BufferUsage bufusage = instr->instr.bufusage;
+ WalUsage walusage = instr->instr.walusage;
+
+ total_blks_hit = bufusage.shared_blks_hit +
+ bufusage.local_blks_hit;
+ total_blks_read = bufusage.shared_blks_read +
+ bufusage.local_blks_read;
+ total_blks_dirtied = bufusage.shared_blks_dirtied +
+ bufusage.local_blks_dirtied;
/*
* We do not expect an analyze to take > 25 days and it simplifies
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index e4b70166b0e50..42fc00cbd34db 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -144,7 +144,7 @@ static void show_instrumentation_count(const char *qlabel, int which,
static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es);
static const char *explain_get_index_name(Oid indexId);
static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage);
-static void show_buffer_usage(ExplainState *es, const BufferUsage *usage);
+static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title);
static void show_wal_usage(ExplainState *es, const WalUsage *usage);
static void show_memory_counters(ExplainState *es,
const MemoryContextCounters *mem_counters);
@@ -324,13 +324,16 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
QueryEnvironment *queryEnv)
{
PlannedStmt *plan;
- instr_time planstart,
- planduration;
- BufferUsage bufusage_start,
- bufusage;
+ QueryInstrumentation *plan_instr = NULL;
MemoryContextCounters mem_counters;
MemoryContext planner_ctx = NULL;
MemoryContext saved_ctx = NULL;
+ int instrument_options = INSTRUMENT_TIMER;
+
+ if (es->buffers)
+ instrument_options |= INSTRUMENT_BUFFERS;
+
+ plan_instr = InstrQueryAlloc(instrument_options);
if (es->memory)
{
@@ -348,15 +351,12 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
saved_ctx = MemoryContextSwitchTo(planner_ctx);
}
- if (es->buffers)
- bufusage_start = pgBufferUsage;
- INSTR_TIME_SET_CURRENT(planstart);
+ InstrQueryStart(plan_instr);
/* plan the query */
plan = pg_plan_query(query, queryString, cursorOptions, params, es);
- INSTR_TIME_SET_CURRENT(planduration);
- INSTR_TIME_SUBTRACT(planduration, planstart);
+ InstrQueryStopFinalize(plan_instr);
if (es->memory)
{
@@ -364,16 +364,9 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
MemoryContextMemConsumed(planner_ctx, &mem_counters);
}
- /* calc differences of buffer counters. */
- if (es->buffers)
- {
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
- }
-
/* run it (if needed) and produce output */
ExplainOnePlan(plan, into, es, queryString, params, queryEnv,
- &planduration, (es->buffers ? &bufusage : NULL),
+ &plan_instr->instr.total, (es->buffers ? &plan_instr->instr.bufusage : NULL),
es->memory ? &mem_counters : NULL);
}
@@ -590,7 +583,12 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
/* grab serialization metrics before we destroy the DestReceiver */
if (es->serialize != EXPLAIN_SERIALIZE_NONE)
- serializeMetrics = GetSerializationMetrics(dest);
+ {
+ SerializeMetrics *metrics = GetSerializationMetrics(dest);
+
+ if (metrics)
+ memcpy(&serializeMetrics, metrics, sizeof(SerializeMetrics));
+ }
/* call the DestReceiver's destroy method even during explain */
dest->rDestroy(dest);
@@ -613,7 +611,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
}
if (bufusage)
- show_buffer_usage(es, bufusage);
+ show_buffer_usage(es, bufusage, NULL);
if (mem_counters)
show_memory_counters(es, mem_counters);
@@ -1019,7 +1017,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics)
ExplainIndentText(es);
if (es->timing)
appendStringInfo(es->str, "Serialization: time=%.3f ms output=" UINT64_FORMAT "kB format=%s\n",
- 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->timeSpent),
+ 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->instr.total),
BYTES_TO_KILOBYTES(metrics->bytesSent),
format);
else
@@ -1027,10 +1025,10 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics)
BYTES_TO_KILOBYTES(metrics->bytesSent),
format);
- if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage))
+ if (es->buffers && peek_buffer_usage(es, &metrics->instr.bufusage))
{
es->indent++;
- show_buffer_usage(es, &metrics->bufferUsage);
+ show_buffer_usage(es, &metrics->instr.bufusage, NULL);
es->indent--;
}
}
@@ -1038,13 +1036,13 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics)
{
if (es->timing)
ExplainPropertyFloat("Time", "ms",
- 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->timeSpent),
+ 1000.0 * INSTR_TIME_GET_DOUBLE(metrics->instr.total),
3, es);
ExplainPropertyUInteger("Output Volume", "kB",
BYTES_TO_KILOBYTES(metrics->bytesSent), es);
ExplainPropertyText("Format", format, es);
if (es->buffers)
- show_buffer_usage(es, &metrics->bufferUsage);
+ show_buffer_usage(es, &metrics->instr.bufusage, NULL);
}
ExplainCloseGroup("Serialization", "Serialization", true, es);
@@ -1101,18 +1099,15 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es)
for (nt = 0; nt < rInfo->ri_TrigDesc->numtriggers; nt++)
{
Trigger *trig = rInfo->ri_TrigDesc->triggers + nt;
- Instrumentation *instr = rInfo->ri_TrigInstrument + nt;
+ TriggerInstrumentation *tginstr = rInfo->ri_TrigInstrument + nt;
char *relname;
char *conname = NULL;
- /* Must clean up instrumentation state */
- InstrEndLoop(instr);
-
/*
* We ignore triggers that were never invoked; they likely aren't
* relevant to the current query type.
*/
- if (instr->ntuples == 0)
+ if (tginstr->firings == 0)
continue;
ExplainOpenGroup("Trigger", NULL, true, es);
@@ -1137,11 +1132,11 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es)
if (show_relname)
appendStringInfo(es->str, " on %s", relname);
if (es->timing)
- appendStringInfo(es->str, ": time=%.3f calls=%.0f\n",
- INSTR_TIME_GET_MILLISEC(instr->total),
- instr->ntuples);
+ appendStringInfo(es->str, ": time=%.3f calls=%d\n",
+ INSTR_TIME_GET_MILLISEC(tginstr->instr.total),
+ tginstr->firings);
else
- appendStringInfo(es->str, ": calls=%.0f\n", instr->ntuples);
+ appendStringInfo(es->str, ": calls=%d\n", tginstr->firings);
}
else
{
@@ -1151,9 +1146,9 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es)
ExplainPropertyText("Relation", relname, es);
if (es->timing)
ExplainPropertyFloat("Time", "ms",
- INSTR_TIME_GET_MILLISEC(instr->total), 3,
+ INSTR_TIME_GET_MILLISEC(tginstr->instr.total), 3,
es);
- ExplainPropertyFloat("Calls", NULL, instr->ntuples, 0, es);
+ ExplainPropertyInteger("Calls", NULL, tginstr->firings, es);
}
if (conname)
@@ -1840,7 +1835,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
{
double nloops = planstate->instrument->nloops;
double startup_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->startup) / nloops;
- double total_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->total) / nloops;
+ double total_ms = INSTR_TIME_GET_MILLISEC(planstate->instrument->instr.total) / nloops;
double rows = planstate->instrument->ntuples / nloops;
if (es->format == EXPLAIN_FORMAT_TEXT)
@@ -1893,11 +1888,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
/* prepare per-worker general execution details */
if (es->workers_state && es->verbose)
{
- WorkerInstrumentation *w = planstate->worker_instrument;
+ WorkerNodeInstrumentation *w = planstate->worker_instrument;
for (int n = 0; n < w->num_workers; n++)
{
- Instrumentation *instrument = &w->instrument[n];
+ NodeInstrumentation *instrument = &w->instrument[n];
double nloops = instrument->nloops;
double startup_ms;
double total_ms;
@@ -1906,7 +1901,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (nloops <= 0)
continue;
startup_ms = INSTR_TIME_GET_MILLISEC(instrument->startup) / nloops;
- total_ms = INSTR_TIME_GET_MILLISEC(instrument->total) / nloops;
+ total_ms = INSTR_TIME_GET_MILLISEC(instrument->instr.total) / nloops;
rows = instrument->ntuples / nloops;
ExplainOpenWorker(n, es);
@@ -1975,6 +1970,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
show_indexsearches_info(planstate, es);
+
+ if (es->buffers && planstate->instrument)
+ show_buffer_usage(es, &((IndexScanState *) planstate)->iss_Instrument->table_instr.bufusage, "Table");
break;
case T_IndexOnlyScan:
show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
@@ -1992,6 +1990,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
ExplainPropertyFloat("Heap Fetches", NULL,
planstate->instrument->ntuples2, 0, es);
show_indexsearches_info(planstate, es);
+
+ if (es->buffers && planstate->instrument)
+ show_buffer_usage(es, &((IndexOnlyScanState *) planstate)->ioss_Instrument->table_instr.bufusage, "Table");
break;
case T_BitmapIndexScan:
show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig,
@@ -2293,18 +2294,18 @@ ExplainNode(PlanState *planstate, List *ancestors,
/* Show buffer/WAL usage */
if (es->buffers && planstate->instrument)
- show_buffer_usage(es, &planstate->instrument->bufusage);
+ show_buffer_usage(es, &planstate->instrument->instr.bufusage, NULL);
if (es->wal && planstate->instrument)
- show_wal_usage(es, &planstate->instrument->walusage);
+ show_wal_usage(es, &planstate->instrument->instr.walusage);
/* Prepare per-worker buffer/WAL usage */
if (es->workers_state && (es->buffers || es->wal) && es->verbose)
{
- WorkerInstrumentation *w = planstate->worker_instrument;
+ WorkerNodeInstrumentation *w = planstate->worker_instrument;
for (int n = 0; n < w->num_workers; n++)
{
- Instrumentation *instrument = &w->instrument[n];
+ NodeInstrumentation *instrument = &w->instrument[n];
double nloops = instrument->nloops;
if (nloops <= 0)
@@ -2312,9 +2313,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
ExplainOpenWorker(n, es);
if (es->buffers)
- show_buffer_usage(es, &instrument->bufusage);
+ show_buffer_usage(es, &instrument->instr.bufusage, NULL);
if (es->wal)
- show_wal_usage(es, &instrument->walusage);
+ show_wal_usage(es, &instrument->instr.walusage);
ExplainCloseWorker(n, es);
}
}
@@ -3924,26 +3925,45 @@ show_indexsearches_info(PlanState *planstate, ExplainState *es)
static void
show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es)
{
+ uint64 exact_pages;
+ uint64 lossy_pages;
+
if (!es->analyze)
return;
+ /* Start with leader's stats */
+ exact_pages = planstate->stats.exact_pages;
+ lossy_pages = planstate->stats.lossy_pages;
+
+ /* Accumulate worker stats into node-level totals */
+ if (planstate->sinstrument != NULL)
+ {
+ for (int n = 0; n < planstate->sinstrument->num_workers; n++)
+ {
+ BitmapHeapScanInstrumentation *si = &planstate->sinstrument->sinstrument[n];
+
+ exact_pages += si->exact_pages;
+ lossy_pages += si->lossy_pages;
+ }
+ }
+
if (es->format != EXPLAIN_FORMAT_TEXT)
{
ExplainPropertyUInteger("Exact Heap Blocks", NULL,
- planstate->stats.exact_pages, es);
+ exact_pages, es);
ExplainPropertyUInteger("Lossy Heap Blocks", NULL,
- planstate->stats.lossy_pages, es);
+ lossy_pages, es);
}
else
{
- if (planstate->stats.exact_pages > 0 || planstate->stats.lossy_pages > 0)
+ if (exact_pages > 0 || lossy_pages > 0)
{
ExplainIndentText(es);
appendStringInfoString(es->str, "Heap Blocks:");
- if (planstate->stats.exact_pages > 0)
- appendStringInfo(es->str, " exact=" UINT64_FORMAT, planstate->stats.exact_pages);
- if (planstate->stats.lossy_pages > 0)
- appendStringInfo(es->str, " lossy=" UINT64_FORMAT, planstate->stats.lossy_pages);
+ if (exact_pages > 0)
+ appendStringInfo(es->str, " exact=" UINT64_FORMAT, exact_pages);
+ if (lossy_pages > 0)
+ appendStringInfo(es->str, " lossy=" UINT64_FORMAT, lossy_pages);
appendStringInfoChar(es->str, '\n');
}
}
@@ -4112,7 +4132,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage)
* Show buffer usage details. This better be sync with peek_buffer_usage.
*/
static void
-show_buffer_usage(ExplainState *es, const BufferUsage *usage)
+show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title)
{
if (es->format == EXPLAIN_FORMAT_TEXT)
{
@@ -4137,6 +4157,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
if (has_shared || has_local || has_temp)
{
ExplainIndentText(es);
+ if (title)
+ appendStringInfo(es->str, "%s ", title);
appendStringInfoString(es->str, "Buffers:");
if (has_shared)
@@ -4192,6 +4214,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
if (has_shared_timing || has_local_timing || has_temp_timing)
{
ExplainIndentText(es);
+ if (title)
+ appendStringInfo(es->str, "%s ", title);
appendStringInfoString(es->str, "I/O Timings:");
if (has_shared_timing)
@@ -4233,6 +4257,14 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
}
else
{
+ char *buffers_title = NULL;
+
+ if (title)
+ {
+ buffers_title = psprintf("%s Buffers", title);
+ ExplainOpenGroup(buffers_title, buffers_title, true, es);
+ }
+
ExplainPropertyInteger("Shared Hit Blocks", NULL,
usage->shared_blks_hit, es);
ExplainPropertyInteger("Shared Read Blocks", NULL,
@@ -4253,8 +4285,20 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
usage->temp_blks_read, es);
ExplainPropertyInteger("Temp Written Blocks", NULL,
usage->temp_blks_written, es);
+
+ if (buffers_title)
+ ExplainCloseGroup(buffers_title, buffers_title, true, es);
+
if (track_io_timing)
{
+ char *timings_title = NULL;
+
+ if (title)
+ {
+ timings_title = psprintf("%s I/O Timings", title);
+ ExplainOpenGroup(timings_title, timings_title, true, es);
+ }
+
ExplainPropertyFloat("Shared I/O Read Time", "ms",
INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time),
3, es);
@@ -4273,6 +4317,9 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
ExplainPropertyFloat("Temp I/O Write Time", "ms",
INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time),
3, es);
+
+ if (timings_title)
+ ExplainCloseGroup(timings_title, timings_title, true, es);
}
}
}
diff --git a/src/backend/commands/explain_dr.c b/src/backend/commands/explain_dr.c
index 3c96061cf32ab..9c1b30fb75b73 100644
--- a/src/backend/commands/explain_dr.c
+++ b/src/backend/commands/explain_dr.c
@@ -110,15 +110,11 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self)
MemoryContext oldcontext;
StringInfo buf = &myState->buf;
int natts = typeinfo->natts;
- instr_time start,
- end;
- BufferUsage instr_start;
+ Instrumentation *instr = &myState->metrics.instr;
/* only measure time, buffers if requested */
- if (myState->es->timing)
- INSTR_TIME_SET_CURRENT(start);
- if (myState->es->buffers)
- instr_start = pgBufferUsage;
+ if (instr->need_timer || instr->need_stack)
+ InstrStart(instr);
/* Set or update my derived attribute info, if needed */
if (myState->attrinfo != typeinfo || myState->nattrs != natts)
@@ -186,18 +182,9 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self)
MemoryContextSwitchTo(oldcontext);
MemoryContextReset(myState->tmpcontext);
- /* Update timing data */
- if (myState->es->timing)
- {
- INSTR_TIME_SET_CURRENT(end);
- INSTR_TIME_ACCUM_DIFF(myState->metrics.timeSpent, end, start);
- }
-
- /* Update buffer metrics */
- if (myState->es->buffers)
- BufferUsageAccumDiff(&myState->metrics.bufferUsage,
- &pgBufferUsage,
- &instr_start);
+ /* Stop per-tuple measurement */
+ if (instr->need_timer || instr->need_stack)
+ InstrStop(instr);
return true;
}
@@ -233,9 +220,17 @@ serializeAnalyzeStartup(DestReceiver *self, int operation, TupleDesc typeinfo)
/* The output buffer is re-used across rows, as in printtup.c */
initStringInfo(&receiver->buf);
- /* Initialize results counters */
+ /* Initialize metrics and per-tuple instrumentation */
memset(&receiver->metrics, 0, sizeof(SerializeMetrics));
- INSTR_TIME_SET_ZERO(receiver->metrics.timeSpent);
+ {
+ int instrument_options = 0;
+
+ if (receiver->es->timing)
+ instrument_options |= INSTRUMENT_TIMER;
+ if (receiver->es->buffers)
+ instrument_options |= INSTRUMENT_BUFFERS;
+ InstrInitOptions(&receiver->metrics.instr, instrument_options);
+ }
}
/*
@@ -246,6 +241,8 @@ serializeAnalyzeShutdown(DestReceiver *self)
{
SerializeDestReceiver *receiver = (SerializeDestReceiver *) self;
+ InstrFinalizeChild(&receiver->metrics.instr, instr_stack.current);
+
if (receiver->finfos)
pfree(receiver->finfos);
receiver->finfos = NULL;
@@ -290,22 +287,17 @@ CreateExplainSerializeDestReceiver(ExplainState *es)
}
/*
- * GetSerializationMetrics - collect metrics
+ * GetSerializationMetrics - get serialization metrics
*
- * We have to be careful here since the receiver could be an IntoRel
- * receiver if the subject statement is CREATE TABLE AS. In that
- * case, return all-zeroes stats.
+ * Returns a pointer to the SerializeMetrics inside the dest receiver,
+ * or NULL if the receiver is not a SerializeDestReceiver (e.g. an IntoRel
+ * receiver for CREATE TABLE AS).
*/
-SerializeMetrics
+SerializeMetrics *
GetSerializationMetrics(DestReceiver *dest)
{
- SerializeMetrics empty;
-
if (dest->mydest == DestExplainSerialize)
- return ((SerializeDestReceiver *) dest)->metrics;
-
- memset(&empty, 0, sizeof(SerializeMetrics));
- INSTR_TIME_SET_ZERO(empty.timeSpent);
+ return &((SerializeDestReceiver *) dest)->metrics;
- return empty;
+ return NULL;
}
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index 373e823479466..9ab74c8df0a1b 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -3989,10 +3989,13 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein
tablespaceid = indexRel->rd_rel->reltablespace;
/* Create new index definition based on given index */
- newIndexId = index_concurrently_create_copy(heapRel,
- idx->indexId,
- tablespaceid,
- concurrentName);
+ newIndexId = index_create_copy(heapRel,
+ INDEX_CREATE_CONCURRENT |
+ INDEX_CREATE_SKIP_BUILD |
+ INDEX_CREATE_SUPPRESS_PROGRESS,
+ idx->indexId,
+ tablespaceid,
+ concurrentName);
/*
* Now open the relation of the new index, a session-level lock is
diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c
index 876aad2100aeb..ee8113575882e 100644
--- a/src/backend/commands/prepare.c
+++ b/src/backend/commands/prepare.c
@@ -22,6 +22,7 @@
#include "catalog/pg_type.h"
#include "commands/createas.h"
#include "commands/explain.h"
+#include "executor/instrument.h"
#include "commands/explain_format.h"
#include "commands/explain_state.h"
#include "commands/prepare.h"
@@ -580,14 +581,17 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
ListCell *p;
ParamListInfo paramLI = NULL;
EState *estate = NULL;
- instr_time planstart;
- instr_time planduration;
- BufferUsage bufusage_start,
- bufusage;
+ QueryInstrumentation *plan_instr = NULL;
+ int instrument_options = INSTRUMENT_TIMER;
MemoryContextCounters mem_counters;
MemoryContext planner_ctx = NULL;
MemoryContext saved_ctx = NULL;
+ if (es->buffers)
+ instrument_options |= INSTRUMENT_BUFFERS;
+
+ plan_instr = InstrQueryAlloc(instrument_options);
+
if (es->memory)
{
/* See ExplainOneQuery about this */
@@ -598,9 +602,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
saved_ctx = MemoryContextSwitchTo(planner_ctx);
}
- if (es->buffers)
- bufusage_start = pgBufferUsage;
- INSTR_TIME_SET_CURRENT(planstart);
+ InstrQueryStart(plan_instr);
/* Look it up in the hash table */
entry = FetchPreparedStatement(execstmt->name, true);
@@ -635,8 +637,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
cplan = GetCachedPlan(entry->plansource, paramLI,
CurrentResourceOwner, pstate->p_queryEnv);
- INSTR_TIME_SET_CURRENT(planduration);
- INSTR_TIME_SUBTRACT(planduration, planstart);
+ InstrQueryStopFinalize(plan_instr);
if (es->memory)
{
@@ -644,13 +645,6 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
MemoryContextMemConsumed(planner_ctx, &mem_counters);
}
- /* calc differences of buffer counters. */
- if (es->buffers)
- {
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
- }
-
plan_list = cplan->stmt_list;
/* Explain each query */
@@ -660,7 +654,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
if (pstmt->commandType != CMD_UTILITY)
ExplainOnePlan(pstmt, into, es, query_string, paramLI, pstate->p_queryEnv,
- &planduration, (es->buffers ? &bufusage : NULL),
+ &plan_instr->instr.total, (es->buffers ? &plan_instr->instr.bufusage : NULL),
es->memory ? &mem_counters : NULL);
else
ExplainOneUtility(pstmt->utilityStmt, into, es, pstate, paramLI);
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 0ce2e81f9c2f2..f72c1ac521a3e 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -2139,7 +2139,7 @@ ExecuteTruncateGuts(List *explicit_rels,
rel,
0, /* dummy rangetable index */
NULL,
- 0);
+ NULL);
estate->es_opened_result_relations =
lappend(estate->es_opened_result_relations, resultRelInfo);
resultRelInfo++;
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index 90e94fb8a5a4b..b8b8840345bd2 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -92,7 +92,8 @@ static bool TriggerEnabled(EState *estate, ResultRelInfo *relinfo,
static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata,
int tgindx,
FmgrInfo *finfo,
- Instrumentation *instr,
+ TriggerInstrumentation *instr,
+ QueryInstrumentation *qinstr,
MemoryContext per_tuple_context);
static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
ResultRelInfo *src_partinfo,
@@ -2311,7 +2312,8 @@ static HeapTuple
ExecCallTriggerFunc(TriggerData *trigdata,
int tgindx,
FmgrInfo *finfo,
- Instrumentation *instr,
+ TriggerInstrumentation *instr,
+ QueryInstrumentation *qinstr,
MemoryContext per_tuple_context)
{
LOCAL_FCINFO(fcinfo, 0);
@@ -2346,7 +2348,7 @@ ExecCallTriggerFunc(TriggerData *trigdata,
* If doing EXPLAIN ANALYZE, start charging time to this trigger.
*/
if (instr)
- InstrStartNode(instr + tgindx);
+ InstrStartTrigger(qinstr, instr + tgindx);
/*
* Do the function evaluation in the per-tuple memory context, so that
@@ -2391,10 +2393,10 @@ ExecCallTriggerFunc(TriggerData *trigdata,
/*
* If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count
- * one "tuple returned" (really the number of firings).
+ * the firing of the trigger.
*/
if (instr)
- InstrStopNode(instr + tgindx, 1);
+ InstrStopTrigger(instr + tgindx, 1);
return (HeapTuple) DatumGetPointer(result);
}
@@ -2441,6 +2443,7 @@ ExecBSInsertTriggers(EState *estate, ResultRelInfo *relinfo)
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple)
@@ -2502,6 +2505,7 @@ ExecBRInsertTriggers(EState *estate, ResultRelInfo *relinfo,
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple == NULL)
{
@@ -2606,6 +2610,7 @@ ExecIRInsertTriggers(EState *estate, ResultRelInfo *relinfo,
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple == NULL)
{
@@ -2670,6 +2675,7 @@ ExecBSDeleteTriggers(EState *estate, ResultRelInfo *relinfo)
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple)
@@ -2780,6 +2786,7 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate,
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple == NULL)
{
@@ -2884,6 +2891,7 @@ ExecIRDeleteTriggers(EState *estate, ResultRelInfo *relinfo,
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (rettuple == NULL)
return false; /* Delete was suppressed */
@@ -2942,6 +2950,7 @@ ExecBSUpdateTriggers(EState *estate, ResultRelInfo *relinfo)
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple)
@@ -3094,6 +3103,7 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate,
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple == NULL)
@@ -3258,6 +3268,7 @@ ExecIRUpdateTriggers(EState *estate, ResultRelInfo *relinfo,
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple == NULL)
{
@@ -3316,6 +3327,7 @@ ExecBSTruncateTriggers(EState *estate, ResultRelInfo *relinfo)
i,
relinfo->ri_TrigFunctions,
relinfo->ri_TrigInstrument,
+ estate->es_instrument,
GetPerTupleMemoryContext(estate));
if (newtuple)
@@ -3947,7 +3959,7 @@ static void AfterTriggerExecute(EState *estate,
ResultRelInfo *dst_relInfo,
TriggerDesc *trigdesc,
FmgrInfo *finfo,
- Instrumentation *instr,
+ TriggerInstrumentation *instr,
MemoryContext per_tuple_context,
TupleTableSlot *trig_tuple_slot1,
TupleTableSlot *trig_tuple_slot2);
@@ -4342,7 +4354,7 @@ AfterTriggerExecute(EState *estate,
ResultRelInfo *src_relInfo,
ResultRelInfo *dst_relInfo,
TriggerDesc *trigdesc,
- FmgrInfo *finfo, Instrumentation *instr,
+ FmgrInfo *finfo, TriggerInstrumentation *instr,
MemoryContext per_tuple_context,
TupleTableSlot *trig_tuple_slot1,
TupleTableSlot *trig_tuple_slot2)
@@ -4383,7 +4395,7 @@ AfterTriggerExecute(EState *estate,
* to include time spent re-fetching tuples in the trigger cost.
*/
if (instr)
- InstrStartNode(instr + tgindx);
+ InstrStartTrigger(estate->es_instrument, instr + tgindx);
/*
* Fetch the required tuple(s).
@@ -4571,6 +4583,7 @@ AfterTriggerExecute(EState *estate,
tgindx,
finfo,
NULL,
+ NULL,
per_tuple_context);
if (rettuple != NULL &&
rettuple != LocTriggerData.tg_trigtuple &&
@@ -4600,10 +4613,10 @@ AfterTriggerExecute(EState *estate,
/*
* If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count
- * one "tuple returned" (really the number of firings).
+ * the firing of the trigger.
*/
if (instr)
- InstrStopNode(instr + tgindx, 1);
+ InstrStopTrigger(instr + tgindx, 1);
}
@@ -4719,7 +4732,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events,
Relation rel = NULL;
TriggerDesc *trigdesc = NULL;
FmgrInfo *finfo = NULL;
- Instrumentation *instr = NULL;
+ TriggerInstrumentation *instr = NULL;
TupleTableSlot *slot1 = NULL,
*slot2 = NULL;
diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c
index 77834b96a21c1..b5fed54fb85c3 100644
--- a/src/backend/commands/vacuumparallel.c
+++ b/src/backend/commands/vacuumparallel.c
@@ -47,9 +47,8 @@
*/
#define PARALLEL_VACUUM_KEY_SHARED 1
#define PARALLEL_VACUUM_KEY_QUERY_TEXT 2
-#define PARALLEL_VACUUM_KEY_BUFFER_USAGE 3
-#define PARALLEL_VACUUM_KEY_WAL_USAGE 4
-#define PARALLEL_VACUUM_KEY_INDEX_STATS 5
+#define PARALLEL_VACUUM_KEY_INSTRUMENTATION 3
+#define PARALLEL_VACUUM_KEY_INDEX_STATS 4
/*
* Shared information among parallel workers. So this is allocated in the DSM
@@ -188,11 +187,8 @@ struct ParallelVacuumState
/* Shared dead items space among parallel vacuum workers */
TidStore *dead_items;
- /* Points to buffer usage area in DSM */
- BufferUsage *buffer_usage;
-
- /* Points to WAL usage area in DSM */
- WalUsage *wal_usage;
+ /* Points to instrumentation area in DSM */
+ Instrumentation *instr;
/*
* False if the index is totally unsuitable target for all parallel
@@ -250,8 +246,7 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes,
PVShared *shared;
TidStore *dead_items;
PVIndStats *indstats;
- BufferUsage *buffer_usage;
- WalUsage *wal_usage;
+ Instrumentation *instr;
bool *will_parallel_vacuum;
Size est_indstats_len;
Size est_shared_len;
@@ -304,18 +299,15 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes,
shm_toc_estimate_keys(&pcxt->estimator, 1);
/*
- * Estimate space for BufferUsage and WalUsage --
- * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE.
+ * Estimate space for Instrumentation --
+ * PARALLEL_VACUUM_KEY_INSTRUMENTATION.
*
* If there are no extensions loaded that care, we could skip this. We
- * have no way of knowing whether anyone's looking at pgBufferUsage or
- * pgWalUsage, so do it unconditionally.
+ * have no way of knowing whether anyone's looking at instrumentation, so
+ * do it unconditionally.
*/
shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
- shm_toc_estimate_keys(&pcxt->estimator, 1);
- shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
shm_toc_estimate_keys(&pcxt->estimator, 1);
/* Finally, estimate PARALLEL_VACUUM_KEY_QUERY_TEXT space */
@@ -396,17 +388,13 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes,
pvs->shared = shared;
/*
- * Allocate space for each worker's BufferUsage and WalUsage; no need to
- * initialize
+ * Allocate space for each worker's Instrumentation; no need to
+ * initialize.
*/
- buffer_usage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, buffer_usage);
- pvs->buffer_usage = buffer_usage;
- wal_usage = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_WAL_USAGE, wal_usage);
- pvs->wal_usage = wal_usage;
+ instr = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_INSTRUMENTATION, instr);
+ pvs->instr = instr;
/* Store query string for workers */
if (debug_query_string)
@@ -749,7 +737,7 @@ parallel_vacuum_process_all_indexes(ParallelVacuumState *pvs, int num_index_scan
WaitForParallelWorkersToFinish(pvs->pcxt);
for (int i = 0; i < pvs->pcxt->nworkers_launched; i++)
- InstrAccumParallelQuery(&pvs->buffer_usage[i], &pvs->wal_usage[i]);
+ InstrAccumParallelQuery(&pvs->instr[i]);
}
/*
@@ -1006,8 +994,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
PVIndStats *indstats;
PVShared *shared;
TidStore *dead_items;
- BufferUsage *buffer_usage;
- WalUsage *wal_usage;
+ QueryInstrumentation *instr;
+ Instrumentation *worker_instr;
int nindexes;
char *sharedquery;
ErrorContextCallback errcallback;
@@ -1095,16 +1083,14 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
error_context_stack = &errcallback;
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/* Process indexes to perform vacuum/cleanup */
parallel_vacuum_process_safe_indexes(&pvs);
/* Report buffer/WAL usage during parallel execution */
- buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false);
- wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
- &wal_usage[ParallelWorkerNumber]);
+ worker_instr = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_INSTRUMENTATION, false);
+ InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]);
/* Report any remaining cost-based vacuum delay time */
if (track_cost_delay_timing)
diff --git a/src/backend/executor/README.instrument b/src/backend/executor/README.instrument
new file mode 100644
index 0000000000000..7df837dbc77e8
--- /dev/null
+++ b/src/backend/executor/README.instrument
@@ -0,0 +1,237 @@
+src/backend/executor/README.instrument
+
+Instrumentation
+===============
+
+The instrumentation subsystem measures time, buffer usage and WAL activity
+during query execution and other similar activities. It is used by
+EXPLAIN ANALYZE, pg_stat_statements, and other consumers that need
+activity and/or timing metrics over a section of code.
+
+The design has two central goals:
+
+* Make it cheap to measure activity in a section of code, even when
+ that section is called many times and the aggregate is what is used
+ (as is the case with per-node instrumentation in the executor)
+
+* Ensure nested instrumentation accurately measures activity/timing,
+ even when execution is aborted due to errors being thrown.
+
+The key data structures are defined in src/include/executor/instrument.h
+and the implementation lives in src/backend/executor/instrument.c.
+
+
+Instrumentation Options
+-----------------------
+
+Callers specify what to measure with a bitmask of InstrumentOption flags:
+
+ INSTRUMENT_ROWS -- row counts only (used with NodeInstrumentation)
+ INSTRUMENT_TIMER -- wall-clock timing and row counts
+ INSTRUMENT_BUFFERS -- buffer hit/read/dirtied/written counts and I/O time
+ INSTRUMENT_WAL -- WAL records, FPI, bytes
+
+INSTRUMENT_BUFFERS and INSTRUMENT_WAL utilize the instrumentation stack
+(described below) for efficient handling of counter values.
+
+
+Struct Hierarchy
+----------------
+
+There are the following instrumentation structs, each specialized for a
+different scope:
+
+Instrumentation Base struct. Holds timing and buffer/WAL counters.
+
+QueryInstrumentation Extends Instrumentation for query-level tracking. When
+ stack-based tracking is enabled, it owns a dedicated
+ MemoryContext and uses the ResourceOwner mechanism for
+ abort cleanup.
+
+NodeInstrumentation Extends Instrumentation for per-plan-node statistics
+ (startup time, tuple counts, loop counts, etc).
+
+TriggerInstrumentation Extends Instrumentation with a firing count.
+
+
+Stack-based instrumentation
+===========================
+
+For tracking WAL or buffer usage counters, the specialized stack-based
+instrumentation is used.
+
+A simple approach to measuring buffer/WAL activity in a code section could be
+to have a set of global counters, snapshot all the counters at the start, and
+diff them at the end. But, this is expensive in practice: BufferUsage alone
+has many fields, and the diff must be computed for every InstrStartNode /
+InstrStopNode cycle.
+
+An alternative is to write counter updates directly into the struct that
+should receive them, avoiding the diff. But that has two complexities: low-level
+code, such as the buffer manager, has no direct pointers to higher-level
+structs, such as plan nodes tracking buffer usage. And instrumentation is often
+nested: we might be interested in both the aggregate buffer usage of a query and
+the individual per-node details. Stack-based instrumentation solves both problems:
+
+At all times, there is a stack that tracks which Instrumentation is currently
+active. The stack is represented by instr_stack, a per-backend global
+that holds a dynamic array of Instrumentation pointers. The field
+instr_stack.current always points to the current stack entry that should
+be updated when activity occurs. When the stack array is empty, the
+current pointer points to instr_top.
+
+For example, if a backend has two portals open, the overall nesting of
+Instrumentation and their respective InstrStart/InstrStop calls creates a
+tree-like structure like this:
+
+ Session (instr_top)
+ |
+ +-- Query A (QueryInstrumentation)
+ | |
+ | +-- NestLoop (NodeInstrumentation)
+ | |
+ | +-- Seq Scan A (NodeInstrumentation)
+ | +-- Seq Scan B (NodeInstrumentation)
+ |
+ +-- Query B (QueryInstrumentation)
+ |
+ +-- Seq Scan C (NodeInstrumentation)
+
+While executing Seq Scan B, the stack looks like:
+
+ instr_top (implicit bottom, not in the entries array)
+ 0: Query A
+ 1: NestLoop
+ 2: Seq Scan B <-- instr_stack.current
+
+When no query is running, the stack is empty (stack_size == 0) and
+instr_stack.current points to instr_top.
+
+Any buffer or WAL counter update (via the INSTR_BUFUSAGE_* and
+INSTR_WALUSAGE_* macros in the buffer manager, WAL insertion code, etc.)
+writes directly into instr_stack.current. Each instrumentation node starts
+zeroed, so the values it accumulates while on top of the stack represent
+exactly the activity that occurred during that time.
+
+Every Instrumentation node (except for instr_top) has a target, or parent, it
+will be accumulated into, which is typically the Instrumentation that was the
+current stack entry when it was created.
+
+For example, when Seq Scan A gets finalized in regular execution via ExecutorFinish,
+its instrumentation data gets added to the immediate parent in
+the execution tree, the NestLoop, which will then get added to Query A's
+QueryInstrumentation, which then accumulates to the parent.
+
+While we can typically think of this as a tree, the NodeInstrumentation
+underneath a particular QueryInstrumentation could behave differently --
+for example, it could propagate directly to the QueryInstrumentation, in
+order to not show cumulative numbers in EXPLAIN ANALYZE.
+
+Note these relationships are partially implicit, especially when it comes
+to NodeInstrumentation. Each QueryInstrumentation maintains a list of its
+unfinalized child nodes. The parent of a QueryInstrumentation itself is
+determined by the stack (see below): when a query is finalized or cleaned
+up on abort, its counters are accumulated to whatever entry is then current
+on the stack, which is typically instr_top.
+
+
+Finalization and Abort Safety
+=============================
+
+Finalization is the process of rolling up a node's buffer/WAL counters to
+its parent. In normal execution, nodes are pushed onto the stack when they
+start and popped when they stop; at finalization time their accumulated
+counters are added to the parent.
+
+Due to the use of longjmp for error handling, functions can exit abruptly
+without executing their normal cleanup code. On abort, two things need
+to happen:
+
+1. The stack is reset to the level saved at the start of the aborting
+ (sub-)transaction level. This ensures that we don't later try to update
+ counters on a freed stack entry. We also need to ensure that the stack
+ entry that was current before a particular Instrumentation started, is
+ current again after it stops.
+
+2. Finalize all affected Instrumentation nodes, rolling up their counters
+ to the innermost surviving Instrumentation, so that data is not lost.
+
+For example, if Seq Scan B aborts while the stack is:
+
+ instr_top (implicit bottom)
+ 0: Query A
+ 1: NestLoop
+ 2: Seq Scan B
+
+The abort handler for Query A accumulates all unfinalized children (Seq
+Scan A, Seq Scan B, NestLoop) directly into Query A's counters, then
+unwinds the instrumentation stack and accumulates Query A's counters to
+instr_top.
+
+Note that on abort the children do not accumulate through each other (Seq
+Scan B -> NestLoop -> Query A); they all accumulate directly to their
+parent QueryInstrumentation. This means the order in which children are
+released does not matter -- this is important because ResourceOwner cleanup
+does not guarantee a particular release order. The per-node breakdown is lost,
+but the instrumentation active when the query was started (instr_top in the
+above example) survives the abort, and its counters include the activity.
+
+If multiple QueryInstrumentations are active on the stack (e.g. nested
+portals), the abort handler of each uses InstrStopFinalize() to accumulate
+the statistics to the parent entry of either the entry being released, or a
+previously released entry if it was higher up in the stack, so they compose
+correctly regardless of release order.
+
+There are two mechanisms for achieving abort safety:
+
+* Resource Owner (QueryInstrumentation): registers with the current
+ ResourceOwner at start. On transaction abort, the resource owner system
+ calls the release callback, which walks unfinalized child entries,
+ accumulates their data, unwinds the stack, and destroys the dedicated
+ memory context (freeing the QueryInstrumentation and all child
+ allocations as a unit). This is the recommended approach when the
+ instrumented code already has an appropriate resource owner (e.g. it
+ runs inside a portal). The query executor uses this path.
+
+* PG_FINALLY (base Instrumentation): when no suitable resource owner
+ exists, or when the caller wants to inspect the instrumentation data
+ even after an error, the base Instrumentation can be used with a
+ PG_TRY/PG_FINALLY block that calls InstrStopFinalize().
+
+Both mechanisms add overhead, so neither is suitable for high-frequency
+instrumentation like per-node measurements in the executor. Instead,
+plan node and trigger children rely on their parent QueryInstrumentation
+for abort safety: they are allocated in the parent's memory context and
+registered in its unfinalized-entries list, so the parent's abort handler
+recovers their data automatically. In normal execution, children are
+finalized explicitly by the caller.
+
+Parallel Query
+--------------
+
+Parallel workers get their own QueryInstrumentation so they can measure
+buffer and WAL activity independently, then copy the totals into dynamic
+shared memory at worker shutdown. The leader accumulates these into its
+own stack.
+
+When per-node instrumentation is active, parallel workers skip per-node
+finalization at shutdown to avoid double-counting; the per-node data is
+aggregated separately through InstrAggNode().
+
+
+Memory Handling
+===============
+
+Instrumentation objects that use the stack must survive until finalization
+runs, including the abort case. To ensure this, QueryInstrumentation
+creates a dedicated "Instrumentation" MemoryContext (instr_cxt) as a child
+of TopMemoryContext. All child instrumentation (nodes, triggers) should be
+allocated in this context.
+
+On successful completion, instr_cxt is reparented to CurrentMemoryContext
+so its lifetime is tied to the caller's context. On abort, the
+ResourceOwner cleanup frees it after accumulating the instrumentation data
+to the current stack entry after resetting the stack.
+
+When the stack is not needed (timer/rows only), Instrumentation allocations
+happen in CurrentMemoryContext instead of TopMemoryContext.
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 45e00c6af85de..d0cd34d286c28 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -78,6 +78,7 @@ ExecutorCheckPerms_hook_type ExecutorCheckPerms_hook = NULL;
/* decls for local routines only used within this module */
static void InitPlan(QueryDesc *queryDesc, int eflags);
static void CheckValidRowMarkRel(Relation rel, RowMarkType markType);
+static void ExecFinalizeTriggerInstrumentation(EState *estate);
static void ExecPostprocessPlan(EState *estate);
static void ExecEndPlan(PlanState *planstate, EState *estate);
static void ExecutePlan(QueryDesc *queryDesc,
@@ -247,9 +248,16 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags)
estate->es_snapshot = RegisterSnapshot(queryDesc->snapshot);
estate->es_crosscheck_snapshot = RegisterSnapshot(queryDesc->crosscheck_snapshot);
estate->es_top_eflags = eflags;
- estate->es_instrument = queryDesc->instrument_options;
estate->es_jit_flags = queryDesc->plannedstmt->jitFlags;
+ /*
+ * Set up per-node instrumentation if needed. We do this before InitPlan
+ * so that node and trigger instrumentation can be allocated within the
+ * query's dedicated instrumentation memory context.
+ */
+ if (!estate->es_instrument && queryDesc->instrument_options)
+ estate->es_instrument = InstrQueryAlloc(queryDesc->instrument_options);
+
/*
* Set up an AFTER-trigger statement context, unless told not to, or
* unless it's EXPLAIN-only mode (when ExecutorFinish won't be called).
@@ -331,9 +339,11 @@ standard_ExecutorRun(QueryDesc *queryDesc,
*/
oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
- /* Allow instrumentation of Executor overall runtime */
+ /* Start up instrumentation for this execution run */
if (queryDesc->totaltime)
- InstrStartNode(queryDesc->totaltime);
+ InstrQueryStart(queryDesc->totaltime);
+ if (estate->es_instrument)
+ InstrQueryStart(estate->es_instrument);
/*
* extract information from the query descriptor and the query feature.
@@ -384,8 +394,10 @@ standard_ExecutorRun(QueryDesc *queryDesc,
if (sendTuples)
dest->rShutdown(dest);
+ if (estate->es_instrument)
+ InstrQueryStop(estate->es_instrument);
if (queryDesc->totaltime)
- InstrStopNode(queryDesc->totaltime, estate->es_processed);
+ InstrQueryStop(queryDesc->totaltime);
MemoryContextSwitchTo(oldcontext);
}
@@ -435,7 +447,9 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
/* Allow instrumentation of Executor overall runtime */
if (queryDesc->totaltime)
- InstrStartNode(queryDesc->totaltime);
+ InstrQueryStart(queryDesc->totaltime);
+ if (estate->es_instrument)
+ InstrQueryStart(estate->es_instrument);
/* Run ModifyTable nodes to completion */
ExecPostprocessPlan(estate);
@@ -444,8 +458,32 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS))
AfterTriggerEndQuery(estate);
+ if (estate->es_instrument)
+ {
+ /*
+ * Accumulate per-node and trigger statistics to their respective
+ * parent instrumentation stacks.
+ *
+ * We skip this in parallel workers because their per-node stats are
+ * reported individually via ExecParallelReportInstrumentation, and
+ * the leader's own ExecFinalizeNodeInstrumentation handles
+ * propagation. If we accumulated here, the leader would
+ * double-count: worker parent nodes would already include their
+ * children's stats, and then the leader's accumulation would add the
+ * children again.
+ */
+ if (!IsParallelWorker())
+ {
+ ExecFinalizeNodeInstrumentation(queryDesc->planstate);
+
+ ExecFinalizeTriggerInstrumentation(estate);
+ }
+
+ InstrQueryStopFinalize(estate->es_instrument);
+ }
+
if (queryDesc->totaltime)
- InstrStopNode(queryDesc->totaltime, 0);
+ InstrQueryStopFinalize(queryDesc->totaltime);
MemoryContextSwitchTo(oldcontext);
@@ -1263,7 +1301,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
Relation resultRelationDesc,
Index resultRelationIndex,
ResultRelInfo *partition_root_rri,
- int instrument_options)
+ QueryInstrumentation *qinstr)
{
MemSet(resultRelInfo, 0, sizeof(ResultRelInfo));
resultRelInfo->type = T_ResultRelInfo;
@@ -1284,8 +1322,8 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
palloc0_array(FmgrInfo, n);
resultRelInfo->ri_TrigWhenExprs = (ExprState **)
palloc0_array(ExprState *, n);
- if (instrument_options)
- resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options, false);
+ if (qinstr)
+ resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(qinstr, n);
}
else
{
@@ -1358,6 +1396,10 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
* also provides a way for EXPLAIN ANALYZE to report the runtimes of such
* triggers.) So we make additional ResultRelInfo's as needed, and save them
* in es_trig_target_relations.
+ *
+ * Note: if new relation lists are searched here, they must also be added to
+ * ExecFinalizeTriggerInstrumentation so that trigger instrumentation data
+ * is properly accumulated.
*/
ResultRelInfo *
ExecGetTriggerResultRel(EState *estate, Oid relid,
@@ -1500,6 +1542,30 @@ ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo)
return resultRelInfo->ri_ancestorResultRels;
}
+static void
+ExecFinalizeTriggerInstrumentation(EState *estate)
+{
+ List *rels = NIL;
+
+ rels = list_concat(rels, estate->es_tuple_routing_result_relations);
+ rels = list_concat(rels, estate->es_opened_result_relations);
+ rels = list_concat(rels, estate->es_trig_target_relations);
+
+ foreach_node(ResultRelInfo, rInfo, rels)
+ {
+ TriggerInstrumentation *ti = rInfo->ri_TrigInstrument;
+
+ if (ti == NULL || rInfo->ri_TrigDesc == NULL)
+ continue;
+
+ for (int nt = 0; nt < rInfo->ri_TrigDesc->numtriggers; nt++)
+ {
+ if (ti[nt].instr.need_stack)
+ InstrAccumStack(&estate->es_instrument->instr, &ti[nt].instr);
+ }
+ }
+}
+
/* ----------------------------------------------------------------
* ExecPostprocessPlan
*
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index 755191b51ef66..2e57136edfd9f 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -60,13 +60,12 @@
#define PARALLEL_KEY_EXECUTOR_FIXED UINT64CONST(0xE000000000000001)
#define PARALLEL_KEY_PLANNEDSTMT UINT64CONST(0xE000000000000002)
#define PARALLEL_KEY_PARAMLISTINFO UINT64CONST(0xE000000000000003)
-#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xE000000000000004)
+#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xE000000000000004)
#define PARALLEL_KEY_TUPLE_QUEUE UINT64CONST(0xE000000000000005)
-#define PARALLEL_KEY_INSTRUMENTATION UINT64CONST(0xE000000000000006)
+#define PARALLEL_KEY_NODE_INSTRUMENTATION UINT64CONST(0xE000000000000006)
#define PARALLEL_KEY_DSA UINT64CONST(0xE000000000000007)
#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xE000000000000008)
#define PARALLEL_KEY_JIT_INSTRUMENTATION UINT64CONST(0xE000000000000009)
-#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xE00000000000000A)
#define PARALLEL_TUPLE_QUEUE_SIZE 65536
@@ -87,7 +86,7 @@ typedef struct FixedParallelExecutorState
* instrument_options: Same meaning here as in instrument.c.
*
* instrument_offset: Offset, relative to the start of this structure,
- * of the first Instrumentation object. This will depend on the length of
+ * of the first NodeInstrumentation object. This will depend on the length of
* the plan_node_id array.
*
* num_workers: Number of workers.
@@ -104,11 +103,15 @@ struct SharedExecutorInstrumentation
int num_workers;
int num_plan_nodes;
int plan_node_id[FLEXIBLE_ARRAY_MEMBER];
- /* array of num_plan_nodes * num_workers Instrumentation objects follows */
+
+ /*
+ * array of num_plan_nodes * num_workers NodeInstrumentation objects
+ * follows
+ */
};
#define GetInstrumentationArray(sei) \
(StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \
- (Instrumentation *) (((char *) sei) + sei->instrument_offset))
+ (NodeInstrumentation *) (((char *) sei) + sei->instrument_offset))
/* Context object for ExecParallelEstimate. */
typedef struct ExecParallelEstimateContext
@@ -627,8 +630,6 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
char *pstmt_data;
char *pstmt_space;
char *paramlistinfo_space;
- BufferUsage *bufusage_space;
- WalUsage *walusage_space;
SharedExecutorInstrumentation *instrumentation = NULL;
SharedJitInstrumentation *jit_instrumentation = NULL;
int pstmt_len;
@@ -692,21 +693,14 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
shm_toc_estimate_keys(&pcxt->estimator, 1);
/*
- * Estimate space for BufferUsage.
+ * Estimate space for Instrumentation.
*
* If EXPLAIN is not in use and there are no extensions loaded that care,
* we could skip this. But we have no way of knowing whether anyone's
- * looking at pgBufferUsage, so do it unconditionally.
- */
- shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
- shm_toc_estimate_keys(&pcxt->estimator, 1);
-
- /*
- * Same thing for WalUsage.
+ * looking at instrumentation, so do it unconditionally.
*/
shm_toc_estimate_chunk(&pcxt->estimator,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
shm_toc_estimate_keys(&pcxt->estimator, 1);
/* Estimate space for tuple queues. */
@@ -731,7 +725,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
instrumentation_len = MAXALIGN(instrumentation_len);
instrument_offset = instrumentation_len;
instrumentation_len +=
- mul_size(sizeof(Instrumentation),
+ mul_size(sizeof(NodeInstrumentation),
mul_size(e.nnodes, nworkers));
shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len);
shm_toc_estimate_keys(&pcxt->estimator, 1);
@@ -792,17 +786,18 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
shm_toc_insert(pcxt->toc, PARALLEL_KEY_PARAMLISTINFO, paramlistinfo_space);
SerializeParamList(estate->es_param_list_info, ¶mlistinfo_space);
- /* Allocate space for each worker's BufferUsage; no need to initialize. */
- bufusage_space = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(BufferUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufusage_space);
- pei->buffer_usage = bufusage_space;
+ /*
+ * Allocate space for each worker's Instrumentation; no need to
+ * initialize.
+ */
+ {
+ Instrumentation *instr;
- /* Same for WalUsage. */
- walusage_space = shm_toc_allocate(pcxt->toc,
- mul_size(sizeof(WalUsage), pcxt->nworkers));
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage_space);
- pei->wal_usage = walusage_space;
+ instr = shm_toc_allocate(pcxt->toc,
+ mul_size(sizeof(Instrumentation), pcxt->nworkers));
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instr);
+ pei->instrumentation = instr;
+ }
/* Set up the tuple queues that the workers will write into. */
pei->tqueue = ExecParallelSetupTupleQueues(pcxt, false);
@@ -817,20 +812,20 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
*/
if (estate->es_instrument)
{
- Instrumentation *instrument;
+ NodeInstrumentation *instrument;
int i;
instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len);
- instrumentation->instrument_options = estate->es_instrument;
+ instrumentation->instrument_options = estate->es_instrument->instrument_options;
instrumentation->instrument_offset = instrument_offset;
instrumentation->num_workers = nworkers;
instrumentation->num_plan_nodes = e.nnodes;
instrument = GetInstrumentationArray(instrumentation);
for (i = 0; i < nworkers * e.nnodes; ++i)
- InstrInit(&instrument[i], estate->es_instrument);
- shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION,
+ InstrInitNode(&instrument[i], estate->es_instrument->instrument_options);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_NODE_INSTRUMENTATION,
instrumentation);
- pei->instrumentation = instrumentation;
+ pei->node_instrumentation = instrumentation;
if (estate->es_jit_flags != PGJIT_NONE)
{
@@ -1059,7 +1054,7 @@ static bool
ExecParallelRetrieveInstrumentation(PlanState *planstate,
SharedExecutorInstrumentation *instrumentation)
{
- Instrumentation *instrument;
+ NodeInstrumentation *instrument;
int i;
int n;
int ibytes;
@@ -1077,19 +1072,33 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
instrument = GetInstrumentationArray(instrumentation);
instrument += i * instrumentation->num_workers;
for (n = 0; n < instrumentation->num_workers; ++n)
+ {
InstrAggNode(planstate->instrument, &instrument[n]);
+ /*
+ * Also add worker WAL usage to the global pgWalUsage counter.
+ *
+ * When per-node instrumentation is active, parallel workers skip
+ * ExecFinalizeNodeInstrumentation (to avoid double-counting in
+ * EXPLAIN), so per-node WAL activity is not rolled up into the
+ * query-level stats that InstrAccumParallelQuery receives. Without
+ * this, pgWalUsage would under-report WAL generated by parallel
+ * workers when instrumentation is active.
+ */
+ WalUsageAdd(&pgWalUsage, &instrument[n].instr.walusage);
+ }
+
/*
* Also store the per-worker detail.
*
- * Worker instrumentation should be allocated in the same context as the
- * regular instrumentation information, which is the per-query context.
- * Switch into per-query memory context.
+ * Ensure worker instrumentation is allocated in the per-query context. We
+ * don't need to place this in the instrumentation context since no more
+ * stack-based instrumentation work is being done.
*/
oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt);
- ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation));
+ ibytes = mul_size(instrumentation->num_workers, sizeof(NodeInstrumentation));
planstate->worker_instrument =
- palloc(ibytes + offsetof(WorkerInstrumentation, instrument));
+ palloc(ibytes + offsetof(WorkerNodeInstrumentation, instrument));
MemoryContextSwitchTo(oldcontext);
planstate->worker_instrument->num_workers = instrumentation->num_workers;
@@ -1218,7 +1227,7 @@ ExecParallelFinish(ParallelExecutorInfo *pei)
* finish, or we might get incomplete data.)
*/
for (i = 0; i < nworkers; i++)
- InstrAccumParallelQuery(&pei->buffer_usage[i], &pei->wal_usage[i]);
+ InstrAccumParallelQuery(&pei->instrumentation[i]);
pei->finished = true;
}
@@ -1232,10 +1241,14 @@ ExecParallelFinish(ParallelExecutorInfo *pei)
void
ExecParallelCleanup(ParallelExecutorInfo *pei)
{
- /* Accumulate instrumentation, if any. */
- if (pei->instrumentation)
+ /* Accumulate node instrumentation, if any. */
+ if (pei->node_instrumentation)
+ {
ExecParallelRetrieveInstrumentation(pei->planstate,
- pei->instrumentation);
+ pei->node_instrumentation);
+
+ ExecFinalizeWorkerInstrumentation(pei->planstate);
+ }
/* Accumulate JIT instrumentation, if any. */
if (pei->jit_instrumentation)
@@ -1319,7 +1332,7 @@ ExecParallelReportInstrumentation(PlanState *planstate,
{
int i;
int plan_node_id = planstate->plan->plan_node_id;
- Instrumentation *instrument;
+ NodeInstrumentation *instrument;
InstrEndLoop(planstate->instrument);
@@ -1458,8 +1471,7 @@ void
ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
{
FixedParallelExecutorState *fpes;
- BufferUsage *buffer_usage;
- WalUsage *wal_usage;
+ QueryInstrumentation *instr;
DestReceiver *receiver;
QueryDesc *queryDesc;
SharedExecutorInstrumentation *instrumentation;
@@ -1474,7 +1486,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
/* Set up DestReceiver, SharedExecutorInstrumentation, and QueryDesc. */
receiver = ExecParallelGetReceiver(seg, toc);
- instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, true);
+ instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_NODE_INSTRUMENTATION, true);
if (instrumentation != NULL)
instrument_options = instrumentation->instrument_options;
jit_instrumentation = shm_toc_lookup(toc, PARALLEL_KEY_JIT_INSTRUMENTATION,
@@ -1518,7 +1530,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
* leader, which also doesn't count buffer accesses and WAL activity that
* occur during executor startup.
*/
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/*
* Run the plan. If we specified a tuple bound, be careful not to demand
@@ -1532,10 +1544,12 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
ExecutorFinish(queryDesc);
/* Report buffer/WAL usage during parallel execution. */
- buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
- wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
- &wal_usage[ParallelWorkerNumber]);
+ {
+ Instrumentation *worker_instr;
+
+ worker_instr = shm_toc_lookup(toc, PARALLEL_KEY_INSTRUMENTATION, false);
+ InstrEndParallelQuery(instr, &worker_instr[ParallelWorkerNumber]);
+ }
/* Report instrumentation data if any instrumentation options are set. */
if (instrumentation != NULL)
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index d96d4f9947b79..6f2909a1bc35a 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -1381,7 +1381,7 @@ ExecInitPartitionDispatchInfo(EState *estate,
{
ResultRelInfo *rri = makeNode(ResultRelInfo);
- InitResultRelInfo(rri, rel, 0, rootResultRelInfo, 0);
+ InitResultRelInfo(rri, rel, 0, rootResultRelInfo, NULL);
proute->nonleaf_partitions[dispatchidx] = rri;
}
else
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index d35976925ae76..a59de0ef22b29 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -121,8 +121,9 @@
#include "nodes/nodeFuncs.h"
static TupleTableSlot *ExecProcNodeFirst(PlanState *node);
-static TupleTableSlot *ExecProcNodeInstr(PlanState *node);
static bool ExecShutdownNode_walker(PlanState *node, void *context);
+static bool ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context);
+static bool ExecFinalizeWorkerInstrumentation_walker(PlanState *node, void *context);
/* ------------------------------------------------------------------------
@@ -414,8 +415,8 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
/* Set up instrumentation for this node if requested */
if (estate->es_instrument)
- result->instrument = InstrAlloc(1, estate->es_instrument,
- result->async_capable);
+ result->instrument = InstrAllocNode(estate->es_instrument,
+ result->async_capable);
return result;
}
@@ -463,7 +464,7 @@ ExecProcNodeFirst(PlanState *node)
* have ExecProcNode() directly call the relevant function from now on.
*/
if (node->instrument)
- node->ExecProcNode = ExecProcNodeInstr;
+ node->ExecProcNode = InstrNodeSetupExecProcNode(node->instrument);
else
node->ExecProcNode = node->ExecProcNodeReal;
@@ -471,25 +472,6 @@ ExecProcNodeFirst(PlanState *node)
}
-/*
- * ExecProcNode wrapper that performs instrumentation calls. By keeping
- * this a separate function, we avoid overhead in the normal case where
- * no instrumentation is wanted.
- */
-static TupleTableSlot *
-ExecProcNodeInstr(PlanState *node)
-{
- TupleTableSlot *result;
-
- InstrStartNode(node->instrument);
-
- result = node->ExecProcNodeReal(node);
-
- InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0);
-
- return result;
-}
-
/* ----------------------------------------------------------------
* MultiExecProcNode
@@ -788,10 +770,10 @@ ExecShutdownNode_walker(PlanState *node, void *context)
* at least once already. We don't expect much CPU consumption during
* node shutdown, but in the case of Gather or Gather Merge, we may shut
* down workers at this stage. If so, their buffer usage will get
- * propagated into pgBufferUsage at this point, and we want to make sure
- * that it gets associated with the Gather node. We skip this if the node
- * has never been executed, so as to avoid incorrectly making it appear
- * that it has.
+ * propagated into the current instrumentation stack entry at this point,
+ * and we want to make sure that it gets associated with the Gather node.
+ * We skip this if the node has never been executed, so as to avoid
+ * incorrectly making it appear that it has.
*/
if (node->instrument && node->instrument->running)
InstrStartNode(node->instrument);
@@ -829,6 +811,145 @@ ExecShutdownNode_walker(PlanState *node, void *context)
return false;
}
+/*
+ * ExecFinalizeNodeInstrumentation
+ *
+ * Accumulate instrumentation stats from all execution nodes to their respective
+ * parents (or the original parent instrumentation).
+ *
+ * This must run after the cleanup done by ExecShutdownNode, and not rely on any
+ * resources cleaned up by it. We also expect shutdown actions to have occurred,
+ * e.g. parallel worker instrumentation to have been added to the leader.
+ */
+void
+ExecFinalizeNodeInstrumentation(PlanState *node)
+{
+ (void) ExecFinalizeNodeInstrumentation_walker(node, instr_stack.current);
+}
+
+static bool
+ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context)
+{
+ Instrumentation *parent = (Instrumentation *) context;
+
+ Assert(parent != NULL);
+
+ if (node == NULL)
+ return false;
+
+ Assert(node->instrument != NULL);
+
+ /*
+	 * Recurse into children first (bottom-up accumulation), and accumulate
+	 * to this node's instrumentation as the parent context.
+ */
+ planstate_tree_walker(node, ExecFinalizeNodeInstrumentation_walker,
+ &node->instrument->instr);
+
+ /* IndexScan/IndexOnlyScan have a separate entry to track table access */
+ if (IsA(node, IndexScanState))
+ {
+ IndexScanState *iss = castNode(IndexScanState, node);
+
+ InstrFinalizeChild(&iss->iss_Instrument->table_instr, &node->instrument->instr);
+ }
+ else if (IsA(node, IndexOnlyScanState))
+ {
+ IndexOnlyScanState *ioss = castNode(IndexOnlyScanState, node);
+
+ InstrFinalizeChild(&ioss->ioss_Instrument->table_instr, &node->instrument->instr);
+ }
+
+ InstrFinalizeChild(&node->instrument->instr, parent);
+
+ return false;
+}
+
+/*
+ * ExecFinalizeWorkerInstrumentation
+ *
+ * Accumulate per-worker instrumentation stats from child nodes into their
+ * parents, mirroring what ExecFinalizeNodeInstrumentation does for the
+ * leader's own stats. Without this, per-worker buffer/WAL stats shown by
+ * EXPLAIN (ANALYZE, VERBOSE) would only reflect each node's own direct
+ * activity, not including children.
+ *
+ * This must run after ExecParallelRetrieveInstrumentation has populated
+ * worker_instrument for all nodes in the parallel subtree.
+ */
+void
+ExecFinalizeWorkerInstrumentation(PlanState *node)
+{
+ (void) ExecFinalizeWorkerInstrumentation_walker(node, NULL);
+}
+
+static bool
+ExecFinalizeWorkerInstrumentation_walker(PlanState *node, void *context)
+{
+ PlanState *parent = (PlanState *) context;
+ int num_workers;
+
+ if (node == NULL)
+ return false;
+
+ /*
+ * Recurse into children first (bottom-up accumulation), passing this node
+ * as parent context if it has worker_instrument, otherwise pass through
+ * the previous parent.
+ */
+ planstate_tree_walker(node, ExecFinalizeWorkerInstrumentation_walker,
+ node->worker_instrument ? (void *) node : context);
+
+ if (!node->worker_instrument)
+ return false;
+
+ num_workers = node->worker_instrument->num_workers;
+
+ /*
+ * Fold per-worker IndexScan/IndexOnlyScan table buffer stats into the
+ * per-worker node stats, matching what ExecFinalizeNodeInstrumentation
+ * does for the leader.
+ */
+ if (IsA(node, IndexScanState))
+ {
+ IndexScanState *iss = castNode(IndexScanState, node);
+
+ if (iss->iss_SharedInfo)
+ {
+ int nworkers = Min(num_workers, iss->iss_SharedInfo->num_workers);
+
+ for (int n = 0; n < nworkers; n++)
+ InstrAccumStack(&node->worker_instrument->instrument[n].instr,
+ &iss->iss_SharedInfo->winstrument[n].table_instr);
+ }
+ }
+ else if (IsA(node, IndexOnlyScanState))
+ {
+ IndexOnlyScanState *ioss = castNode(IndexOnlyScanState, node);
+
+ if (ioss->ioss_SharedInfo)
+ {
+ int nworkers = Min(num_workers, ioss->ioss_SharedInfo->num_workers);
+
+ for (int n = 0; n < nworkers; n++)
+ InstrAccumStack(&node->worker_instrument->instrument[n].instr,
+ &ioss->ioss_SharedInfo->winstrument[n].table_instr);
+ }
+ }
+
+ /* Accumulate this node's per-worker stats to parent's per-worker stats */
+ if (parent && parent->worker_instrument)
+ {
+ int parent_workers = parent->worker_instrument->num_workers;
+
+ for (int n = 0; n < Min(num_workers, parent_workers); n++)
+ InstrAccumStack(&parent->worker_instrument->instrument[n].instr,
+ &node->worker_instrument->instrument[n].instr);
+ }
+
+ return false;
+}
+
/*
* ExecSetTupleBound
*
diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c
index 1eb6b9f1f4068..700764daf45d3 100644
--- a/src/backend/executor/execUtils.c
+++ b/src/backend/executor/execUtils.c
@@ -150,7 +150,7 @@ CreateExecutorState(void)
estate->es_total_processed = 0;
estate->es_top_eflags = 0;
- estate->es_instrument = 0;
+ estate->es_instrument = NULL;
estate->es_finished = false;
estate->es_exprcontexts = NIL;
@@ -227,6 +227,15 @@ FreeExecutorState(EState *estate)
estate->es_partition_directory = NULL;
}
+ /*
+ * Make sure the instrumentation context gets freed. This usually gets
+ * re-parented under the per-query context in InstrQueryStopFinalize, but
+ * that won't happen during EXPLAIN (BUFFERS) since ExecutorFinish never
+ * gets called, so we would otherwise leak it in TopMemoryContext.
+ */
+ if (estate->es_instrument && estate->es_instrument->instr.need_stack)
+ MemoryContextDelete(estate->es_instrument->instr_cxt);
+
/*
* Free the per-query memory context, thereby releasing all working
* memory, including the EState node itself.
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index a40610bc2522f..3183f00d6930a 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -16,124 +16,438 @@
#include
#include "executor/instrument.h"
+#include "utils/memutils.h"
+#include "utils/resowner.h"
-BufferUsage pgBufferUsage;
-static BufferUsage save_pgBufferUsage;
WalUsage pgWalUsage;
-static WalUsage save_pgWalUsage;
+Instrumentation instr_top;
+InstrStackState instr_stack = {
+ .stack_space = 0,
+ .stack_size = 0,
+ .entries = NULL,
+ .current = &instr_top,
+};
-static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
-static void WalUsageAdd(WalUsage *dst, WalUsage *add);
-
-
-/* Allocate new instrumentation structure(s) */
-Instrumentation *
-InstrAlloc(int n, int instrument_options, bool async_mode)
+void
+InstrStackGrow(void)
{
- Instrumentation *instr;
+ int space = instr_stack.stack_space;
- /* initialize all fields to zeroes, then modify as needed */
- instr = palloc0(n * sizeof(Instrumentation));
- if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL))
- {
- bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0;
- bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0;
- bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
- int i;
+ Assert(instr_stack.stack_size >= instr_stack.stack_space);
- for (i = 0; i < n; i++)
- {
- instr[i].need_bufusage = need_buffers;
- instr[i].need_walusage = need_wal;
- instr[i].need_timer = need_timer;
- instr[i].async_mode = async_mode;
- }
+ if (instr_stack.entries == NULL)
+ {
+ space = 10; /* Allocate sufficient initial space for
+ * typical activity */
+ instr_stack.entries = MemoryContextAlloc(TopMemoryContext,
+ sizeof(Instrumentation *) * space);
+ }
+ else
+ {
+ space *= 2;
+ instr_stack.entries = repalloc_array(instr_stack.entries, Instrumentation *, space);
}
- return instr;
+ /* Update stack space after allocation succeeded to protect against OOMs */
+ instr_stack.stack_space = space;
+}
+
+/* General purpose instrumentation handling */
+static inline bool
+InstrNeedStack(int instrument_options)
+{
+ return (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL)) != 0;
}
-/* Initialize a pre-allocated instrumentation structure. */
void
-InstrInit(Instrumentation *instr, int instrument_options)
+InstrInitOptions(Instrumentation *instr, int instrument_options)
{
- memset(instr, 0, sizeof(Instrumentation));
- instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0;
- instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0;
+ instr->need_stack = InstrNeedStack(instrument_options);
instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
}
-/* Entry to a plan node */
+static inline void
+InstrStartTimer(Instrumentation *instr)
+{
+ Assert(INSTR_TIME_IS_ZERO(instr->starttime));
+
+ INSTR_TIME_SET_CURRENT(instr->starttime);
+}
+
+static inline void
+InstrStopTimer(Instrumentation *instr)
+{
+ instr_time endtime;
+
+ Assert(!INSTR_TIME_IS_ZERO(instr->starttime));
+
+ INSTR_TIME_SET_CURRENT(endtime);
+ INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime);
+
+ INSTR_TIME_SET_ZERO(instr->starttime);
+}
+
void
-InstrStartNode(Instrumentation *instr)
+InstrStart(Instrumentation *instr)
{
if (instr->need_timer)
- {
- if (!INSTR_TIME_IS_ZERO(instr->starttime))
- elog(ERROR, "InstrStartNode called twice in a row");
- else
- INSTR_TIME_SET_CURRENT(instr->starttime);
- }
+ InstrStartTimer(instr);
+
+ if (instr->need_stack)
+ InstrPushStack(instr);
+}
- /* save buffer usage totals at node entry, if needed */
- if (instr->need_bufusage)
- instr->bufusage_start = pgBufferUsage;
+void
+InstrStop(Instrumentation *instr)
+{
+ if (instr->need_timer)
+ InstrStopTimer(instr);
- if (instr->need_walusage)
- instr->walusage_start = pgWalUsage;
+ if (instr->need_stack)
+ InstrPopStack(instr);
}
-/* Exit from a plan node */
+/*
+ * Stops instrumentation, finalizes the stack entry and accumulates to its parent.
+ *
+ * Note that this intentionally allows passing a stack that is not the current
+ * top, as can happen with PG_FINALLY, or resource owners, which don't have a
+ * guaranteed cleanup order.
+ *
+ * We are careful here to achieve two goals:
+ *
+ * 1) Reset the stack to the parent of whichever of the released stack entries
+ * has the lowest index
+ * 2) Accumulate all instrumentation to the currently active instrumentation,
+ * so that callers get a complete picture of activity, even after an abort
+ */
void
-InstrStopNode(Instrumentation *instr, double nTuples)
+InstrStopFinalize(Instrumentation *instr)
{
- double save_tuplecount = instr->tuplecount;
- instr_time endtime;
+ if (instr->on_stack)
+ {
+ int idx = -1;
- /* count the returned tuples */
- instr->tuplecount += nTuples;
+ for (int i = instr_stack.stack_size - 1; i >= 0; i--)
+ {
+ if (instr_stack.entries[i] == instr)
+ {
+ idx = i;
+ break;
+ }
+ }
+
+ if (idx < 0)
+ elog(ERROR, "instrumentation entry not found on stack");
+
+ /* Clear on_stack for any intermediate entries we're skipping over */
+ for (int i = instr_stack.stack_size - 1; i > idx; i--)
+ instr_stack.entries[i]->on_stack = false;
+
+ while (instr_stack.stack_size > idx + 1)
+ instr_stack.stack_size--;
+
+ InstrPopStack(instr);
+ }
- /* let's update the time only if the timer was requested */
if (instr->need_timer)
- {
- if (INSTR_TIME_IS_ZERO(instr->starttime))
- elog(ERROR, "InstrStopNode called without start");
+ InstrStopTimer(instr);
+
+ InstrAccumStack(instr_stack.current, instr);
+}
- INSTR_TIME_SET_CURRENT(endtime);
- INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime);
+/*
+ * Finalize child instrumentation by accumulating buffer/WAL usage to the
+ * provided instrumentation, which may be the current entry, or one the caller
+ * treats as a parent and will add to the totals later.
+ *
+ * Also deletes the unfinalized entry to avoid double counting in an abort
+ * situation, e.g. during executor finish.
+ */
+void
+InstrFinalizeChild(Instrumentation *instr, Instrumentation *parent)
+{
+ if (instr->need_stack)
+ {
+ if (!dlist_node_is_detached(&instr->unfinalized_entry))
+ dlist_delete_thoroughly(&instr->unfinalized_entry);
- INSTR_TIME_SET_ZERO(instr->starttime);
+ InstrAccumStack(parent, instr);
}
+}
- /* Add delta of buffer usage since entry to node's totals */
- if (instr->need_bufusage)
- BufferUsageAccumDiff(&instr->bufusage,
- &pgBufferUsage, &instr->bufusage_start);
- if (instr->need_walusage)
- WalUsageAccumDiff(&instr->walusage,
- &pgWalUsage, &instr->walusage_start);
+/* Query instrumentation handling */
- /* Is this the first tuple of this cycle? */
- if (!instr->running)
+/*
+ * Use ResourceOwner mechanism to correctly reset instr_stack on abort.
+ */
+static void ResOwnerReleaseInstrumentation(Datum res);
+static const ResourceOwnerDesc instrumentation_resowner_desc =
+{
+ .name = "instrumentation",
+ .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
+ .release_priority = RELEASE_PRIO_INSTRUMENTATION,
+ .ReleaseResource = ResOwnerReleaseInstrumentation,
+ .DebugPrint = NULL, /* default message is fine */
+};
+
+static inline void
+ResourceOwnerRememberInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr)
+{
+ ResourceOwnerRemember(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc);
+}
+
+static inline void
+ResourceOwnerForgetInstrumentation(ResourceOwner owner, QueryInstrumentation *qinstr)
+{
+ ResourceOwnerForget(owner, PointerGetDatum(qinstr), &instrumentation_resowner_desc);
+}
+
+static void
+ResOwnerReleaseInstrumentation(Datum res)
+{
+ QueryInstrumentation *qinstr = (QueryInstrumentation *) DatumGetPointer(res);
+ MemoryContext instr_cxt = qinstr->instr_cxt;
+ dlist_mutable_iter iter;
+
+ /* Accumulate data from all unfinalized child entries (nodes, triggers) */
+ dlist_foreach_modify(iter, &qinstr->unfinalized_entries)
{
- instr->running = true;
- instr->firsttuple = instr->counter;
+ Instrumentation *child = dlist_container(Instrumentation, unfinalized_entry, iter.cur);
+
+ InstrAccumStack(&qinstr->instr, child);
}
+
+ /* Ensure the stack is reset as expected, and we accumulate to the parent */
+ InstrStopFinalize(&qinstr->instr);
+
+ /*
+ * Destroy the dedicated instrumentation context, which frees the
+ * QueryInstrumentation and all child allocations.
+ */
+ MemoryContextDelete(instr_cxt);
+}
+
+QueryInstrumentation *
+InstrQueryAlloc(int instrument_options)
+{
+ QueryInstrumentation *instr;
+ MemoryContext instr_cxt;
+
+ /*
+ * When the instrumentation stack is used, create a dedicated memory
+ * context for this query's instrumentation allocations. This context is a
+ * child of TopMemoryContext so it survives transaction abort —
+ * ResourceOwner release needs to access it.
+ *
+ * For simpler cases (timer/rows only), use the current memory context.
+ *
+ * All child instrumentation allocations (nodes, triggers, etc) must be
+ * allocated within this context to ensure correct clean up on abort.
+ */
+ if (InstrNeedStack(instrument_options))
+ instr_cxt = AllocSetContextCreate(TopMemoryContext,
+ "Instrumentation",
+ ALLOCSET_SMALL_SIZES);
else
+ instr_cxt = CurrentMemoryContext;
+
+ instr = MemoryContextAllocZero(instr_cxt, sizeof(QueryInstrumentation));
+ instr->instrument_options = instrument_options;
+ instr->instr_cxt = instr_cxt;
+
+ InstrInitOptions(&instr->instr, instrument_options);
+ dlist_init(&instr->unfinalized_entries);
+
+ return instr;
+}
+
+void
+InstrQueryStart(QueryInstrumentation *qinstr)
+{
+ InstrStart(&qinstr->instr);
+
+ if (qinstr->instr.need_stack)
+ {
+ Assert(CurrentResourceOwner != NULL);
+ qinstr->owner = CurrentResourceOwner;
+
+ ResourceOwnerEnlarge(qinstr->owner);
+ ResourceOwnerRememberInstrumentation(qinstr->owner, qinstr);
+ }
+}
+
+void
+InstrQueryStop(QueryInstrumentation *qinstr)
+{
+ InstrStop(&qinstr->instr);
+
+ if (qinstr->instr.need_stack)
{
- /*
- * In async mode, if the plan node hadn't emitted any tuples before,
- * this might be the first tuple
- */
- if (instr->async_mode && save_tuplecount < 1.0)
- instr->firsttuple = instr->counter;
+ Assert(qinstr->owner != NULL);
+ ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr);
+ qinstr->owner = NULL;
}
}
+void
+InstrQueryStopFinalize(QueryInstrumentation *qinstr)
+{
+ InstrStopFinalize(&qinstr->instr);
+
+ if (!qinstr->instr.need_stack)
+ {
+ Assert(qinstr->owner == NULL);
+ return;
+ }
+
+ Assert(qinstr->owner != NULL);
+ ResourceOwnerForgetInstrumentation(qinstr->owner, qinstr);
+ qinstr->owner = NULL;
+
+ /*
+ * Reparent the dedicated instrumentation context under the current memory
+ * context, so that its lifetime is now tied to the caller's context
+ * rather than TopMemoryContext.
+ */
+ MemoryContextSetParent(qinstr->instr_cxt, CurrentMemoryContext);
+}
+
+/*
+ * Register a child Instrumentation entry for abort processing.
+ *
+ * On abort, ResOwnerReleaseInstrumentation will walk the parent's list to
+ * recover buffer/WAL data from entries that were never finalized, in order for
+ * aggregate totals to be accurate despite the query erroring out.
+ */
+void
+InstrQueryRememberChild(QueryInstrumentation *parent, Instrumentation *child)
+{
+ if (child->need_stack)
+ dlist_push_head(&parent->unfinalized_entries, &child->unfinalized_entry);
+}
+
+/* start instrumentation during parallel executor startup */
+QueryInstrumentation *
+InstrStartParallelQuery(void)
+{
+ QueryInstrumentation *qinstr = InstrQueryAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL);
+
+ InstrQueryStart(qinstr);
+ return qinstr;
+}
+
+/* report usage after parallel executor shutdown */
+void
+InstrEndParallelQuery(QueryInstrumentation *qinstr, Instrumentation *dst)
+{
+ InstrQueryStopFinalize(qinstr);
+ dst->need_stack = qinstr->instr.need_stack;
+ memcpy(&dst->bufusage, &qinstr->instr.bufusage, sizeof(BufferUsage));
+ memcpy(&dst->walusage, &qinstr->instr.walusage, sizeof(WalUsage));
+}
+
+/*
+ * Accumulate work done by parallel workers in the leader's stats.
+ *
+ * Note that what gets added here effectively depends on whether per-node
+ * instrumentation is active. If it's active the parallel worker intentionally
+ * skips ExecFinalizeNodeInstrumentation on executor shutdown, because it would
+ * cause double counting. Instead, this only accumulates any extra activity
+ * outside of nodes.
+ *
+ * Otherwise this is responsible for making sure that the complete query
+ * activity is accumulated.
+ */
+void
+InstrAccumParallelQuery(Instrumentation *instr)
+{
+ InstrAccumStack(instr_stack.current, instr);
+
+ WalUsageAdd(&pgWalUsage, &instr->walusage);
+}
+
+/* Node instrumentation handling */
+
+/* Allocate new node instrumentation structure */
+NodeInstrumentation *
+InstrAllocNode(QueryInstrumentation *qinstr, bool async_mode)
+{
+ NodeInstrumentation *instr = MemoryContextAlloc(qinstr->instr_cxt, sizeof(NodeInstrumentation));
+
+ InstrInitNode(instr, qinstr->instrument_options);
+ instr->async_mode = async_mode;
+
+ InstrQueryRememberChild(qinstr, &instr->instr);
+
+ return instr;
+}
+
+/* Initialize a pre-allocated instrumentation structure. */
+void
+InstrInitNode(NodeInstrumentation *instr, int instrument_options)
+{
+ memset(instr, 0, sizeof(NodeInstrumentation));
+ InstrInitOptions(&instr->instr, instrument_options);
+}
+
+/* Entry to a plan node. If you modify this, check InstrNodeSetupExecProcNode. */
+void
+InstrStartNode(NodeInstrumentation *instr)
+{
+ InstrStart(&instr->instr);
+}
+
+/*
+ * Updates the node instrumentation time counter.
+ *
+ * Note this is different from InstrStop because total is only updated in
+ * InstrEndLoop. We need the separate counter variable because we need to
+ * calculate start-up time for the first tuple in each cycle, and then
+ * accumulate it together.
+ */
+static inline void
+InstrStopNodeTimer(NodeInstrumentation *instr)
+{
+ instr_time endtime;
+
+ Assert(!INSTR_TIME_IS_ZERO(instr->instr.starttime));
+
+ INSTR_TIME_SET_CURRENT(endtime);
+ INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->instr.starttime);
+ INSTR_TIME_SET_ZERO(instr->instr.starttime);
+
+ /*
+ * Is this the first tuple of this cycle?
+ *
+ * In async mode, if the plan node hadn't emitted any tuples before, this
+ * might be the first tuple
+ */
+ if (!instr->running || (instr->async_mode && instr->tuplecount < 1.0))
+ instr->firsttuple = instr->counter;
+}
+
+/* Exit from a plan node. If you modify this, check InstrNodeSetupExecProcNode. */
+void
+InstrStopNode(NodeInstrumentation *instr, double nTuples)
+{
+ if (instr->instr.need_timer)
+ InstrStopNodeTimer(instr);
+
+ /* Only pop the stack, accumulation runs in InstrFinalizeNode */
+ if (instr->instr.need_stack)
+ InstrPopStack(&instr->instr);
+
+ instr->running = true;
+
+ /* count the returned tuples */
+ instr->tuplecount += nTuples;
+}
+
/* Update tuple count */
void
-InstrUpdateTupleCount(Instrumentation *instr, double nTuples)
+InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples)
{
/* count the returned tuples */
instr->tuplecount += nTuples;
@@ -141,47 +455,40 @@ InstrUpdateTupleCount(Instrumentation *instr, double nTuples)
/* Finish a run cycle for a plan node */
void
-InstrEndLoop(Instrumentation *instr)
+InstrEndLoop(NodeInstrumentation *instr)
{
/* Skip if nothing has happened, or already shut down */
if (!instr->running)
return;
- if (!INSTR_TIME_IS_ZERO(instr->starttime))
- elog(ERROR, "InstrEndLoop called on running node");
+	/* Ensure InstrStopNode was called */
+	Assert(INSTR_TIME_IS_ZERO(instr->instr.starttime));
/* Accumulate per-cycle statistics into totals */
INSTR_TIME_ADD(instr->startup, instr->firsttuple);
- INSTR_TIME_ADD(instr->total, instr->counter);
+ INSTR_TIME_ADD(instr->instr.total, instr->counter);
instr->ntuples += instr->tuplecount;
instr->nloops += 1;
/* Reset for next cycle (if any) */
instr->running = false;
- INSTR_TIME_SET_ZERO(instr->starttime);
+ INSTR_TIME_SET_ZERO(instr->instr.starttime);
INSTR_TIME_SET_ZERO(instr->counter);
INSTR_TIME_SET_ZERO(instr->firsttuple);
instr->tuplecount = 0;
}
-/* aggregate instrumentation information */
+/*
+ * Aggregate instrumentation from parallel workers. Must be called after
+ * InstrEndLoop.
+ */
void
-InstrAggNode(Instrumentation *dst, Instrumentation *add)
+InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add)
{
- if (!dst->running && add->running)
- {
- dst->running = true;
- dst->firsttuple = add->firsttuple;
- }
- else if (dst->running && add->running &&
- INSTR_TIME_GT(dst->firsttuple, add->firsttuple))
- dst->firsttuple = add->firsttuple;
+ Assert(!add->running);
- INSTR_TIME_ADD(dst->counter, add->counter);
-
- dst->tuplecount += add->tuplecount;
INSTR_TIME_ADD(dst->startup, add->startup);
- INSTR_TIME_ADD(dst->total, add->total);
+ INSTR_TIME_ADD(dst->instr.total, add->instr.total);
dst->ntuples += add->ntuples;
dst->ntuples2 += add->ntuples2;
dst->nloops += add->nloops;
@@ -189,41 +496,167 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add)
dst->nfiltered2 += add->nfiltered2;
/* Add delta of buffer usage since entry to node's totals */
- if (dst->need_bufusage)
- BufferUsageAdd(&dst->bufusage, &add->bufusage);
+ if (dst->instr.need_stack)
+ InstrAccumStack(&dst->instr, &add->instr);
+}
+
+/*
+ * Specialized handling of instrumented ExecProcNode
+ *
+ * These functions are equivalent to running ExecProcNodeReal wrapped in
+ * InstrStartNode and InstrStopNode, but avoid the conditionals in the hot path
+ * by checking the instrumentation options when the ExecProcNode pointer gets
+ * first set, and then using a special-purpose function for each. This results
+ * in a more optimized set of compiled instructions.
+ */
+
+#include "executor/tuptable.h"
+#include "nodes/execnodes.h"
+
+/* Simplified pop: restore saved state instead of re-deriving from array */
+static inline void
+InstrPopStackTo(Instrumentation *prev)
+{
+ Assert(instr_stack.stack_size > 0);
+ Assert(instr_stack.stack_size > 1 ? instr_stack.entries[instr_stack.stack_size - 2] == prev : &instr_top == prev);
+ instr_stack.entries[instr_stack.stack_size - 1]->on_stack = false;
+ instr_stack.stack_size--;
+ instr_stack.current = prev;
+}
+
+static pg_attribute_always_inline TupleTableSlot *
+ExecProcNodeInstr(PlanState *node, bool need_timer, bool need_stack)
+{
+ NodeInstrumentation *instr = node->instrument;
+ Instrumentation *prev = instr_stack.current;
+ TupleTableSlot *result;
+
+ if (need_stack)
+ InstrPushStack(&instr->instr);
+ if (need_timer)
+ InstrStartTimer(&instr->instr);
+
+ result = node->ExecProcNodeReal(node);
+
+ if (need_timer)
+ InstrStopNodeTimer(instr);
+ if (need_stack)
+ InstrPopStackTo(prev);
+
+ instr->running = true;
+ if (!TupIsNull(result))
+ instr->tuplecount += 1.0;
+
+ return result;
+}
+
+static TupleTableSlot *
+ExecProcNodeInstrFull(PlanState *node)
+{
+ return ExecProcNodeInstr(node, true, true);
+}
- if (dst->need_walusage)
- WalUsageAdd(&dst->walusage, &add->walusage);
+static TupleTableSlot *
+ExecProcNodeInstrRowsStackOnly(PlanState *node)
+{
+ return ExecProcNodeInstr(node, false, true);
+}
+
+static TupleTableSlot *
+ExecProcNodeInstrRowsTimerOnly(PlanState *node)
+{
+ return ExecProcNodeInstr(node, true, false);
+}
+
+static TupleTableSlot *
+ExecProcNodeInstrRowsOnly(PlanState *node)
+{
+ return ExecProcNodeInstr(node, false, false);
+}
+
+/*
+ * Returns an ExecProcNode wrapper that performs instrumentation calls,
+ * tailored to the instrumentation options enabled for the node.
+ */
+ExecProcNodeMtd
+InstrNodeSetupExecProcNode(NodeInstrumentation *instr)
+{
+ bool need_timer = instr->instr.need_timer;
+ bool need_stack = instr->instr.need_stack;
+
+ if (need_timer && need_stack)
+ return ExecProcNodeInstrFull;
+ else if (need_stack)
+ return ExecProcNodeInstrRowsStackOnly;
+ else if (need_timer)
+ return ExecProcNodeInstrRowsTimerOnly;
+ else
+ return ExecProcNodeInstrRowsOnly;
+}
+
+/* Trigger instrumentation handling */
+TriggerInstrumentation *
+InstrAllocTrigger(QueryInstrumentation *qinstr, int n)
+{
+ TriggerInstrumentation *tginstr;
+ int i;
+
+ /*
+ * Allocate in the query's dedicated instrumentation context so all
+ * instrumentation data is grouped together and cleaned up as a unit.
+ */
+ Assert(qinstr != NULL && qinstr->instr_cxt != NULL);
+ tginstr = MemoryContextAllocZero(qinstr->instr_cxt,
+ n * sizeof(TriggerInstrumentation));
+
+ for (i = 0; i < n; i++)
+ InstrInitOptions(&tginstr[i].instr, qinstr->instrument_options);
+
+ return tginstr;
}
-/* note current values during parallel executor startup */
void
-InstrStartParallelQuery(void)
+InstrStartTrigger(QueryInstrumentation *qinstr, TriggerInstrumentation *tginstr)
{
- save_pgBufferUsage = pgBufferUsage;
- save_pgWalUsage = pgWalUsage;
+ InstrStart(&tginstr->instr);
+
+ /*
+ * On first call, register with the parent QueryInstrumentation for abort
+ * recovery.
+ */
+ if (qinstr && tginstr->instr.need_stack &&
+ dlist_node_is_detached(&tginstr->instr.unfinalized_entry))
+ dlist_push_head(&qinstr->unfinalized_entries,
+ &tginstr->instr.unfinalized_entry);
}
-/* report usage after parallel executor shutdown */
void
-InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
+InstrStopTrigger(TriggerInstrumentation *tginstr, int firings)
{
- memset(bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage);
- memset(walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage);
+ /*
+ * This trigger may be called again, so we don't finalize instrumentation
+ * here. Accumulation to the parent happens at ExecutorFinish through
+ * ExecFinalizeTriggerInstrumentation.
+ */
+ InstrStop(&tginstr->instr);
+ tginstr->firings += firings;
}
-/* accumulate work done by workers in leader's stats */
void
-InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
+InstrAccumStack(Instrumentation *dst, Instrumentation *add)
{
- BufferUsageAdd(&pgBufferUsage, bufusage);
- WalUsageAdd(&pgWalUsage, walusage);
+ Assert(dst != NULL);
+ Assert(add != NULL);
+
+ if (!add->need_stack)
+ return;
+
+ BufferUsageAdd(&dst->bufusage, &add->bufusage);
+ WalUsageAdd(&dst->walusage, &add->walusage);
}
/* dst += add */
-static void
+void
BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
{
dst->shared_blks_hit += add->shared_blks_hit;
@@ -244,39 +677,9 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time);
}
-/* dst += add - sub */
+/* dst += add */
void
-BufferUsageAccumDiff(BufferUsage *dst,
- const BufferUsage *add,
- const BufferUsage *sub)
-{
- dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit;
- dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read;
- dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied;
- dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written;
- dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit;
- dst->local_blks_read += add->local_blks_read - sub->local_blks_read;
- dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied;
- dst->local_blks_written += add->local_blks_written - sub->local_blks_written;
- dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read;
- dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written;
- INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time,
- add->shared_blk_read_time, sub->shared_blk_read_time);
- INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time,
- add->shared_blk_write_time, sub->shared_blk_write_time);
- INSTR_TIME_ACCUM_DIFF(dst->local_blk_read_time,
- add->local_blk_read_time, sub->local_blk_read_time);
- INSTR_TIME_ACCUM_DIFF(dst->local_blk_write_time,
- add->local_blk_write_time, sub->local_blk_write_time);
- INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time,
- add->temp_blk_read_time, sub->temp_blk_read_time);
- INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time,
- add->temp_blk_write_time, sub->temp_blk_write_time);
-}
-
-/* helper functions for WAL usage accumulation */
-static void
-WalUsageAdd(WalUsage *dst, WalUsage *add)
+WalUsageAdd(WalUsage *dst, const WalUsage *add)
{
dst->wal_bytes += add->wal_bytes;
dst->wal_records += add->wal_records;
diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c
index 70c55ee6d614d..63e24a0bcd4cf 100644
--- a/src/backend/executor/nodeBitmapIndexscan.c
+++ b/src/backend/executor/nodeBitmapIndexscan.c
@@ -276,7 +276,7 @@ ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags)
/* Set up instrumentation of bitmap index scans if requested */
if (estate->es_instrument)
- indexstate->biss_Instrument = palloc0_object(IndexScanInstrumentation);
+ indexstate->biss_Instrument = MemoryContextAllocZero(estate->es_instrument->instr_cxt, sizeof(IndexScanInstrumentation));
/* Open the index relation. */
lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode;
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index de6154fd54139..9e64ce2bd2da7 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -67,6 +67,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
IndexScanDesc scandesc;
TupleTableSlot *slot;
ItemPointer tid;
+ Instrumentation *table_instr = NULL;
/*
* extract necessary information from index scan node
@@ -83,6 +84,9 @@ IndexOnlyNext(IndexOnlyScanState *node)
econtext = node->ss.ps.ps_ExprContext;
slot = node->ss.ss_ScanTupleSlot;
+ if (node->ioss_Instrument && node->ioss_Instrument->table_instr.need_stack)
+ table_instr = &node->ioss_Instrument->table_instr;
+
if (scandesc == NULL)
{
/*
@@ -165,11 +169,22 @@ IndexOnlyNext(IndexOnlyScanState *node)
ItemPointerGetBlockNumber(tid),
&node->ioss_VMBuffer))
{
+ bool found;
+
/*
* Rats, we have to visit the heap to check visibility.
*/
InstrCountTuples2(node, 1);
- if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
+
+ if (table_instr)
+ InstrPushStack(table_instr);
+
+ found = index_fetch_heap(scandesc, node->ioss_TableSlot);
+
+ if (table_instr)
+ InstrPopStack(table_instr);
+
+ if (!found)
continue; /* no visible tuple, try next index entry */
ExecClearTuple(node->ioss_TableSlot);
@@ -436,6 +451,7 @@ ExecEndIndexOnlyScan(IndexOnlyScanState *node)
* which will have a new IndexOnlyScanState and zeroed stats.
*/
winstrument->nsearches += node->ioss_Instrument->nsearches;
+ InstrAccumStack(&winstrument->table_instr, &node->ioss_Instrument->table_instr);
}
/*
@@ -610,7 +626,21 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
/* Set up instrumentation of index-only scans if requested */
if (estate->es_instrument)
- indexstate->ioss_Instrument = palloc0_object(IndexScanInstrumentation);
+ {
+ indexstate->ioss_Instrument = MemoryContextAllocZero(estate->es_instrument->instr_cxt, sizeof(IndexScanInstrumentation));
+
+ /*
+ * Track table and index access separately. We intentionally don't
+ * collect timing (even if enabled), since we don't need it, and
+ * IndexOnlyNext calls InstrPushStack / InstrPopStack (instead of the
+ * full InstrNode*) to reduce overhead.
+ */
+ if ((estate->es_instrument->instrument_options & INSTRUMENT_BUFFERS) != 0)
+ {
+ InstrInitOptions(&indexstate->ioss_Instrument->table_instr, INSTRUMENT_BUFFERS);
+ InstrQueryRememberChild(estate->es_instrument, &indexstate->ioss_Instrument->table_instr);
+ }
+ }
/* Open the index relation. */
lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode;
@@ -899,4 +929,11 @@ ExecIndexOnlyScanRetrieveInstrumentation(IndexOnlyScanState *node)
SharedInfo->num_workers * sizeof(IndexScanInstrumentation);
node->ioss_SharedInfo = palloc(size);
memcpy(node->ioss_SharedInfo, SharedInfo, size);
+
+ /* Aggregate workers' table buffer/WAL usage into leader's entry */
+ for (int i = 0; i < node->ioss_SharedInfo->num_workers; i++)
+ {
+ InstrAccumStack(&node->ioss_Instrument->table_instr,
+ &node->ioss_SharedInfo->winstrument[i].table_instr);
+ }
}
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 1620d14607173..02ef9d124a368 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -85,7 +85,10 @@ IndexNext(IndexScanState *node)
ExprContext *econtext;
ScanDirection direction;
IndexScanDesc scandesc;
+ ItemPointer tid;
TupleTableSlot *slot;
+ bool found;
+ Instrumentation *table_instr = NULL;
/*
* extract necessary information from index scan node
@@ -102,6 +105,9 @@ IndexNext(IndexScanState *node)
econtext = node->ss.ps.ps_ExprContext;
slot = node->ss.ss_ScanTupleSlot;
+ if (node->iss_Instrument && node->iss_Instrument->table_instr.need_stack)
+ table_instr = &node->iss_Instrument->table_instr;
+
if (scandesc == NULL)
{
/*
@@ -132,8 +138,24 @@ IndexNext(IndexScanState *node)
/*
* ok, now that we have what we need, fetch the next tuple.
*/
- while (index_getnext_slot(scandesc, direction, slot))
+ while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
{
+ if (table_instr)
+ InstrPushStack(table_instr);
+
+ for (;;)
+ {
+ found = index_fetch_heap(scandesc, slot);
+ if (found || !scandesc->xs_heap_continue)
+ break;
+ }
+
+ if (table_instr)
+ InstrPopStack(table_instr);
+
+ if (unlikely(!found))
+ continue;
+
CHECK_FOR_INTERRUPTS();
/*
@@ -181,6 +203,7 @@ IndexNextWithReorder(IndexScanState *node)
Datum *lastfetched_vals;
bool *lastfetched_nulls;
int cmp;
+ Instrumentation *table_instr = NULL;
estate = node->ss.ps.state;
@@ -200,6 +223,9 @@ IndexNextWithReorder(IndexScanState *node)
econtext = node->ss.ps.ps_ExprContext;
slot = node->ss.ss_ScanTupleSlot;
+ if (node->iss_Instrument && node->iss_Instrument->table_instr.need_stack)
+ table_instr = &node->iss_Instrument->table_instr;
+
if (scandesc == NULL)
{
/*
@@ -263,36 +289,67 @@ IndexNextWithReorder(IndexScanState *node)
}
/*
- * Fetch next tuple from the index.
+ * Fetch next valid tuple from the index.
*/
-next_indextuple:
- if (!index_getnext_slot(scandesc, ForwardScanDirection, slot))
+ for (;;)
{
+ ItemPointer tid;
+ bool found;
+
+ /* Time to fetch the next TID from the index */
+ tid = index_getnext_tid(scandesc, ForwardScanDirection);
+
+ /* If we're out of index entries, we're done */
+ if (tid == NULL)
+ {
+ /*
+ * No more tuples from the index. But we still need to drain
+ * any remaining tuples from the queue before we're done.
+ */
+ node->iss_ReachedEnd = true;
+ break;
+ }
+
+ Assert(ItemPointerEquals(tid, &scandesc->xs_heaptid));
+
+ if (table_instr)
+ InstrPushStack(table_instr);
+
+ for (;;)
+ {
+ found = index_fetch_heap(scandesc, slot);
+ if (found || !scandesc->xs_heap_continue)
+ break;
+ }
+
+ if (table_instr)
+ InstrPopStack(table_instr);
+
/*
- * No more tuples from the index. But we still need to drain any
- * remaining tuples from the queue before we're done.
+ * If the index was lossy, we have to recheck the index quals and
+ * ORDER BY expressions using the fetched tuple.
*/
- node->iss_ReachedEnd = true;
- continue;
- }
-
- /*
- * If the index was lossy, we have to recheck the index quals and
- * ORDER BY expressions using the fetched tuple.
- */
- if (scandesc->xs_recheck)
- {
- econtext->ecxt_scantuple = slot;
- if (!ExecQualAndReset(node->indexqualorig, econtext))
+ if (found && scandesc->xs_recheck)
{
- /* Fails recheck, so drop it and loop back for another */
- InstrCountFiltered2(node, 1);
- /* allow this loop to be cancellable */
- CHECK_FOR_INTERRUPTS();
- goto next_indextuple;
+ econtext->ecxt_scantuple = slot;
+ if (!ExecQualAndReset(node->indexqualorig, econtext))
+ {
+ /* Fails recheck, so drop it and loop back for another */
+ InstrCountFiltered2(node, 1);
+ /* allow this loop to be cancellable */
+ CHECK_FOR_INTERRUPTS();
+ continue;
+ }
}
+
+ if (found)
+ break;
}
+ /* No more index entries, re-run to clear the reorder queue */
+ if (node->iss_ReachedEnd)
+ continue;
+
if (scandesc->xs_recheckorderby)
{
econtext->ecxt_scantuple = slot;
@@ -818,6 +875,7 @@ ExecEndIndexScan(IndexScanState *node)
* which will have a new IndexOnlyScanState and zeroed stats.
*/
winstrument->nsearches += node->iss_Instrument->nsearches;
+ InstrAccumStack(&winstrument->table_instr, &node->iss_Instrument->table_instr);
}
/*
@@ -980,7 +1038,21 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
/* Set up instrumentation of index scans if requested */
if (estate->es_instrument)
- indexstate->iss_Instrument = palloc0_object(IndexScanInstrumentation);
+ {
+ indexstate->iss_Instrument = MemoryContextAllocZero(estate->es_instrument->instr_cxt, sizeof(IndexScanInstrumentation));
+
+ /*
+ * Track table and index access separately. We intentionally don't
+ * collect timing (even if enabled), since we don't need it, and
+ * IndexNext / IndexNextWithReorder call InstrPushStack /
+ * InstrPopStack (instead of the full InstrNode*) to reduce overhead.
+ */
+ if ((estate->es_instrument->instrument_options & INSTRUMENT_BUFFERS) != 0)
+ {
+ InstrInitOptions(&indexstate->iss_Instrument->table_instr, INSTRUMENT_BUFFERS);
+ InstrQueryRememberChild(estate->es_instrument, &indexstate->iss_Instrument->table_instr);
+ }
+ }
/* Open the index relation. */
lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode;
@@ -1834,4 +1906,11 @@ ExecIndexScanRetrieveInstrumentation(IndexScanState *node)
SharedInfo->num_workers * sizeof(IndexScanInstrumentation);
node->iss_SharedInfo = palloc(size);
memcpy(node->iss_SharedInfo, SharedInfo, size);
+
+ /* Aggregate workers' table buffer/WAL usage into leader's entry */
+ for (int i = 0; i < node->iss_SharedInfo->num_workers; i++)
+ {
+ InstrAccumStack(&node->iss_Instrument->table_instr,
+ &node->iss_SharedInfo->winstrument[i].table_instr);
+ }
}
diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c
index b38170f0fbe99..3ca0a7a635dba 100644
--- a/src/backend/replication/logical/worker.c
+++ b/src/backend/replication/logical/worker.c
@@ -904,7 +904,7 @@ create_edata_for_relation(LogicalRepRelMapEntry *rel)
* Use Relation opened by logicalrep_rel_open() instead of opening it
* again.
*/
- InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, 0);
+ InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, NULL);
/*
* We put the ResultRelInfo in the es_opened_result_relations list, even
diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c
index 39984df31b458..9f76d2683c0c9 100644
--- a/src/backend/storage/aio/method_io_uring.c
+++ b/src/backend/storage/aio/method_io_uring.c
@@ -409,7 +409,6 @@ static int
pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
{
struct io_uring *uring_instance = &pgaio_my_uring_context->io_uring_ring;
- int in_flight_before = dclist_count(&pgaio_my_backend->in_flight_ios);
Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
@@ -425,27 +424,6 @@ pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
pgaio_io_prepare_submit(ioh);
pgaio_uring_sq_from_io(ioh, sqe);
-
- /*
- * io_uring executes IO in process context if possible. That's
- * generally good, as it reduces context switching. When performing a
- * lot of buffered IO that means that copying between page cache and
- * userspace memory happens in the foreground, as it can't be
- * offloaded to DMA hardware as is possible when using direct IO. When
- * executing a lot of buffered IO this causes io_uring to be slower
- * than worker mode, as worker mode parallelizes the copying. io_uring
- * can be told to offload work to worker threads instead.
- *
- * If an IO is buffered IO and we already have IOs in flight or
- * multiple IOs are being submitted, we thus tell io_uring to execute
- * the IO in the background. We don't do so for the first few IOs
- * being submitted as executing in this process' context has lower
- * latency.
- */
- if (in_flight_before > 4 && (ioh->flags & PGAIO_HF_BUFFERED))
- io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);
-
- in_flight_before++;
}
while (true)
@@ -701,10 +679,65 @@ pgaio_uring_check_one(PgAioHandle *ioh, uint64 ref_generation)
LWLockRelease(&owner_context->completion_lock);
}
+/*
+ * io_uring executes IO in process context if possible. That's generally good,
+ * as it reduces context switching. When performing a lot of buffered IO that
+ * means that copying between page cache and userspace memory happens in the
+ * foreground, as it can't be offloaded to DMA hardware as is possible when
+ * using direct IO. When executing a lot of buffered IO this causes io_uring
+ * to be slower than worker mode, as worker mode parallelizes the
+ * copying. io_uring can be told to offload work to worker threads instead.
+ *
+ * If the IOs are small, we only benefit from forcing things into the
+ * background if there is a lot of IO, as otherwise the overhead from context
+ * switching is higher than the gain.
+ *
+ * If IOs are large, there is benefit from asynchronous processing at lower
+ * queue depths, as IO latency is less of a crucial factor and parallelizing
+ * memory copies is more important. In addition, it is important to trigger
+ * asynchronous processing even at low queue depth, as with foreground
+ * processing we might never actually reach deep enough IO depths to trigger
+ * asynchronous processing, which in turn would deprive readahead control
+ * logic of information about whether a deeper look-ahead distance would be
+ * advantageous.
+ *
+ * We have done some basic benchmarking to validate the thresholds used, but
+ * it's quite plausible that there are better values. See
+ * https://postgr.es/m/3gkuvs3lz3u3skuaxfkxnsysfqslf2srigl6546vhesekve6v2%40va3r5esummvg
+ * for some details of this benchmarking.
+ */
+static bool
+pgaio_uring_should_use_async(PgAioHandle *ioh, size_t io_size)
+{
+ /*
+ * With DIO there's no benefit from forcing asynchronous processing, as
+ * io_uring will never execute direct IO synchronously during submission.
+ */
+ if (!(ioh->flags & PGAIO_HF_BUFFERED))
+ return false;
+
+ /*
+ * Once the IO queue depth is not that shallow anymore, the overhead of
+ * dispatching to the background is a less significant factor.
+ */
+ if (dclist_count(&pgaio_my_backend->in_flight_ios) > 4)
+ return true;
+
+ /*
+ * If the IO is larger, the gains from parallelizing the memory copy are
+ * larger and typically the impact of the latency is smaller.
+ */
+ if (io_size >= (BLCKSZ * 4))
+ return true;
+
+ return false;
+}
+
static void
pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe)
{
struct iovec *iov;
+ size_t io_size = 0;
switch ((PgAioOp) ioh->op)
{
@@ -717,6 +750,8 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe)
iov->iov_base,
iov->iov_len,
ioh->op_data.read.offset);
+
+ io_size = iov->iov_len;
}
else
{
@@ -726,7 +761,13 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe)
ioh->op_data.read.iov_length,
ioh->op_data.read.offset);
+ for (int i = 0; i < ioh->op_data.read.iov_length; i++, iov++)
+ io_size += iov->iov_len;
}
+
+ if (pgaio_uring_should_use_async(ioh, io_size))
+ io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);
+
break;
case PGAIO_OP_WRITEV:
@@ -747,6 +788,12 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe)
ioh->op_data.write.iov_length,
ioh->op_data.write.offset);
}
+
+ /*
+ * For now don't trigger use of IOSQE_ASYNC for writes; it's not
+ * clear there is a performance benefit in doing so.
+ */
+
break;
case PGAIO_OP_INVALID:
diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c
index efe38e9f1134f..e24357a7a0a23 100644
--- a/src/backend/storage/aio/method_worker.c
+++ b/src/backend/storage/aio/method_worker.c
@@ -239,8 +239,8 @@ pgaio_worker_needs_synchronous_execution(PgAioHandle *ioh)
|| !pgaio_io_can_reopen(ioh);
}
-static void
-pgaio_worker_submit_internal(int num_staged_ios, PgAioHandle **staged_ios)
+static int
+pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
{
PgAioHandle **synchronous_ios = NULL;
int nsync = 0;
@@ -249,6 +249,9 @@ pgaio_worker_submit_internal(int num_staged_ios, PgAioHandle **staged_ios)
Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
+ for (int i = 0; i < num_staged_ios; i++)
+ pgaio_io_prepare_submit(staged_ios[i]);
+
if (LWLockConditionalAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE))
{
for (int i = 0; i < num_staged_ios; ++i)
@@ -299,19 +302,6 @@ pgaio_worker_submit_internal(int num_staged_ios, PgAioHandle **staged_ios)
pgaio_io_perform_synchronously(synchronous_ios[i]);
}
}
-}
-
-static int
-pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
-{
- for (int i = 0; i < num_staged_ios; i++)
- {
- PgAioHandle *ioh = staged_ios[i];
-
- pgaio_io_prepare_submit(ioh);
- }
-
- pgaio_worker_submit_internal(num_staged_ios, staged_ios);
return num_staged_ios;
}
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index 31f9e35dee310..0b6cdf7c8730d 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -18,11 +18,13 @@
* to StartReadBuffers() so that a new one can begin to form.
*
* The algorithm for controlling the look-ahead distance is based on recent
- * cache hit and miss history. When no I/O is necessary, there is no benefit
- * in looking ahead more than one block. This is the default initial
- * assumption, but when blocks needing I/O are streamed, the distance is
- * increased rapidly to try to benefit from I/O combining and concurrency. It
- * is reduced gradually when cached blocks are streamed.
+ * cache hit / miss history, as well as whether we need to wait for I/O completion
+ * after a miss. When no I/O is necessary, there is no benefit in looking
+ * ahead more than one block. This is the default initial assumption. When
+ * blocks needing I/O are streamed, the combine distance is increased to
+ * benefit from I/O combining and the read-ahead distance is increased
+ * whenever we need to wait for I/O to try to benefit from increased I/O
+ * concurrency. Both are reduced gradually when cached blocks are streamed.
*
* The main data structure is a circular queue of buffers of size
* max_pinned_buffers plus some extra space for technical reasons, ready to be
@@ -98,10 +100,23 @@ struct ReadStream
int16 max_pinned_buffers;
int16 forwarded_buffers;
int16 pinned_buffers;
- int16 distance;
+
+ /*
+ * Limit of how far, in blocks, to look-ahead for IO combining and for
+ * read-ahead.
+ *
+ * The limits for read-ahead and combining are handled separately to allow
+ * for IO combining even in cases where the I/O subsystem can keep up at a
+ * low read-ahead distance, as doing larger IOs is more efficient.
+ *
+ * Set to 0 when the end of the stream is reached.
+ */
+ int16 combine_distance;
+ int16 readahead_distance;
uint16 distance_decay_holdoff;
int16 initialized_buffers;
- int16 resume_distance;
+ int16 resume_readahead_distance;
+ int16 resume_combine_distance;
int read_buffers_flags;
bool sync_mode; /* using io_method=sync */
bool batch_mode; /* READ_STREAM_USE_BATCHING */
@@ -332,8 +347,8 @@ read_stream_start_pending_read(ReadStream *stream)
/* Shrink distance: no more look-ahead until buffers are released. */
new_distance = stream->pinned_buffers + buffer_limit;
- if (stream->distance > new_distance)
- stream->distance = new_distance;
+ if (stream->readahead_distance > new_distance)
+ stream->readahead_distance = new_distance;
/* Unless we have nothing to give the consumer, stop here. */
if (stream->pinned_buffers > 0)
@@ -374,12 +389,29 @@ read_stream_start_pending_read(ReadStream *stream)
* perform IO asynchronously when starting out with a small look-ahead
* distance.
*/
- if (stream->distance > 1 && stream->ios_in_progress == 0)
+ if (stream->ios_in_progress == 0)
{
- if (stream->distance_decay_holdoff == 0)
- stream->distance--;
- else
+ if (stream->distance_decay_holdoff > 0)
stream->distance_decay_holdoff--;
+ else
+ {
+ if (stream->readahead_distance > 1)
+ stream->readahead_distance--;
+
+ /*
+ * For now we reduce the IO combine distance after
+ * sufficiently many buffer hits. There is no clear
+ * performance argument for doing so, but at the moment we
+ * need to do so to make the entrance into fast_path work
+ * correctly: We require combine_distance == 1 to enter
+ * fast-path, as without that condition we would wrongly
+ * re-enter fast-path when readahead_distance == 1 and
+ * pinned_buffers == 1, as we would not yet have prepared
+ * another IO in that situation.
+ */
+ if (stream->combine_distance > 1)
+ stream->combine_distance--;
+ }
}
}
else
@@ -440,6 +472,114 @@ read_stream_start_pending_read(ReadStream *stream)
return true;
}
+/*
+ * Should we continue to perform look ahead? Looking ahead may allow us to
+ * make the pending IO larger via IO combining or to issue more read-ahead.
+ */
+static inline bool
+read_stream_should_look_ahead(ReadStream *stream)
+{
+ /* If the callback has signaled end-of-stream, we're done */
+ if (stream->readahead_distance == 0)
+ return false;
+
+ /* never start more IOs than our cap */
+ if (stream->ios_in_progress >= stream->max_ios)
+ return false;
+
+ /*
+ * Allow looking further ahead if we are in the process of building a
+ * larger IO, the IO is not yet big enough, and we don't yet have IO in
+ * flight.
+ *
+ * We do so to allow building larger reads when readahead_distance is
+ * small (e.g. because the I/O subsystem is keeping up or
+ * effective_io_concurrency is small). That's a useful goal because larger
+ * reads are more CPU efficient than smaller reads, even if the system is
+ * not IO bound.
+ *
+ * The reason we do *not* do so when we already have a read prepared (i.e.
+ * why we check for pinned_buffers == 0) is that once we are actually reading
+ * ahead, we don't need it:
+ *
+ * - We won't issue unnecessarily small reads as
+ * read_stream_should_issue_now() will return false until the IO is
+ * suitably sized. The issuance of the pending read will be delayed until
+ * enough buffers have been consumed.
+ *
+ * - If we are not reading ahead aggressively enough, future
+ * WaitReadBuffers() calls will return true, leading to readahead_distance
+ * being increased. After that more full-sized IOs can be issued.
+ *
+ * Furthermore, if we did not have the pinned_buffers == 0 condition, we
+ * might end up issuing I/O more aggressively than we need.
+ *
+ * Note that a return of true here can lead to exceeding the read-ahead
+ * limit, but we won't exceed the buffer pin limit (because pinned_buffers
+ * == 0 and combine_distance is capped by max_pinned_buffers).
+ */
+ if (stream->pending_read_nblocks > 0 &&
+ stream->pinned_buffers == 0 &&
+ stream->pending_read_nblocks < stream->combine_distance)
+ return true;
+
+ /*
+ * Don't start more read-ahead if that'd put us over the distance limit
+ * for doing read-ahead. As stream->readahead_distance is capped by
+ * max_pinned_buffers, this prevents us from looking ahead so far that it
+ * would put us over the pin limit.
+ */
+ if (stream->pinned_buffers + stream->pending_read_nblocks >= stream->readahead_distance)
+ return false;
+
+ return true;
+}
+
+/*
+ * We don't start the pending read just because we've hit the distance limit,
+ * preferring to give it another chance to grow to full io_combine_limit size
+ * once more buffers have been consumed. But this is not desirable in all
+ * situations - see below.
+ */
+static inline bool
+read_stream_should_issue_now(ReadStream *stream)
+{
+ int16 pending_read_nblocks = stream->pending_read_nblocks;
+
+ /* there is no pending IO that could be issued */
+ if (pending_read_nblocks == 0)
+ return false;
+
+ /* never start more IOs than our cap */
+ if (stream->ios_in_progress >= stream->max_ios)
+ return false;
+
+ /*
+ * If the callback has signaled end-of-stream, start the pending read
+ * immediately. There is no further potential for IO combining.
+ */
+ if (stream->readahead_distance == 0)
+ return true;
+
+ /*
+ * If we've already reached combine_distance, there's no chance of growing
+ * the read further.
+ */
+ if (pending_read_nblocks >= stream->combine_distance)
+ return true;
+
+ /*
+ * If we currently have no reads in flight or prepared, issue the IO once
+ * we are not looking ahead further. This ensures there's always at least
+ * one IO prepared.
+ */
+ if (stream->pinned_buffers == 0 &&
+ !read_stream_should_look_ahead(stream))
+ return true;
+
+ return false;
+}
+
static void
read_stream_look_ahead(ReadStream *stream)
{
@@ -452,14 +592,13 @@ read_stream_look_ahead(ReadStream *stream)
if (stream->batch_mode)
pgaio_enter_batchmode();
- while (stream->ios_in_progress < stream->max_ios &&
- stream->pinned_buffers + stream->pending_read_nblocks < stream->distance)
+ while (read_stream_should_look_ahead(stream))
{
BlockNumber blocknum;
int16 buffer_index;
void *per_buffer_data;
- if (stream->pending_read_nblocks == stream->io_combine_limit)
+ if (read_stream_should_issue_now(stream))
{
read_stream_start_pending_read(stream);
continue;
@@ -479,7 +618,8 @@ read_stream_look_ahead(ReadStream *stream)
if (blocknum == InvalidBlockNumber)
{
/* End of stream. */
- stream->distance = 0;
+ stream->readahead_distance = 0;
+ stream->combine_distance = 0;
break;
}
@@ -511,21 +651,13 @@ read_stream_look_ahead(ReadStream *stream)
}
/*
- * We don't start the pending read just because we've hit the distance
- * limit, preferring to give it another chance to grow to full
- * io_combine_limit size once more buffers have been consumed. However,
- * if we've already reached io_combine_limit, or we've reached the
- * distance limit and there isn't anything pinned yet, or the callback has
- * signaled end-of-stream, we start the read immediately. Note that the
- * pending read can exceed the distance goal, if the latter was reduced
- * after hitting the per-backend buffer limit.
+ * Check if the pending read should be issued now, or if we should give it
+ * another chance to grow to the full size.
+ *
+ * Note that the pending read can exceed the distance goal, if the latter
+ * was reduced after hitting the per-backend buffer limit.
*/
- if (stream->pending_read_nblocks > 0 &&
- (stream->pending_read_nblocks == stream->io_combine_limit ||
- (stream->pending_read_nblocks >= stream->distance &&
- stream->pinned_buffers == 0) ||
- stream->distance == 0) &&
- stream->ios_in_progress < stream->max_ios)
+ if (read_stream_should_issue_now(stream))
read_stream_start_pending_read(stream);
/*
@@ -534,7 +666,7 @@ read_stream_look_ahead(ReadStream *stream)
* stream. In the worst case we can always make progress one buffer at a
* time.
*/
- Assert(stream->pinned_buffers > 0 || stream->distance == 0);
+ Assert(stream->pinned_buffers > 0 || stream->readahead_distance == 0);
if (stream->batch_mode)
pgaio_exit_batchmode();
@@ -724,10 +856,17 @@ read_stream_begin_impl(int flags,
* doing full io_combine_limit sized reads.
*/
if (flags & READ_STREAM_FULL)
- stream->distance = Min(max_pinned_buffers, stream->io_combine_limit);
+ {
+ stream->readahead_distance = Min(max_pinned_buffers, stream->io_combine_limit);
+ stream->combine_distance = Min(max_pinned_buffers, stream->io_combine_limit);
+ }
else
- stream->distance = 1;
- stream->resume_distance = stream->distance;
+ {
+ stream->readahead_distance = 1;
+ stream->combine_distance = 1;
+ }
+ stream->resume_readahead_distance = stream->readahead_distance;
+ stream->resume_combine_distance = stream->combine_distance;
/*
* Since we always access the same relation, we can initialize parts of
@@ -826,7 +965,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Assert(stream->ios_in_progress == 0);
Assert(stream->forwarded_buffers == 0);
Assert(stream->pinned_buffers == 1);
- Assert(stream->distance == 1);
+ Assert(stream->readahead_distance == 1);
+ Assert(stream->combine_distance == 1);
Assert(stream->pending_read_nblocks == 0);
Assert(stream->per_buffer_data_size == 0);
Assert(stream->initialized_buffers > stream->oldest_buffer_index);
@@ -900,7 +1040,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
else
{
/* No more blocks, end of stream. */
- stream->distance = 0;
+ stream->readahead_distance = 0;
+ stream->combine_distance = 0;
stream->oldest_buffer_index = stream->next_buffer_index;
stream->pinned_buffers = 0;
stream->buffers[oldest_buffer_index] = InvalidBuffer;
@@ -916,7 +1057,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Assert(stream->oldest_buffer_index == stream->next_buffer_index);
/* End of stream reached? */
- if (stream->distance == 0)
+ if (stream->readahead_distance == 0)
return InvalidBuffer;
/*
@@ -930,7 +1071,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
/* End of stream reached? */
if (stream->pinned_buffers == 0)
{
- Assert(stream->distance == 0);
+ Assert(stream->readahead_distance == 0);
return InvalidBuffer;
}
}
@@ -951,27 +1092,59 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
stream->ios[stream->oldest_io_index].buffer_index == oldest_buffer_index)
{
int16 io_index = stream->oldest_io_index;
- int32 distance; /* wider temporary value, clamped below */
+ bool needed_wait;
/* Sanity check that we still agree on the buffers. */
Assert(stream->ios[io_index].op.buffers ==
&stream->buffers[oldest_buffer_index]);
- WaitReadBuffers(&stream->ios[io_index].op);
+ needed_wait = WaitReadBuffers(&stream->ios[io_index].op);
Assert(stream->ios_in_progress > 0);
stream->ios_in_progress--;
if (++stream->oldest_io_index == stream->max_ios)
stream->oldest_io_index = 0;
- /* Look-ahead distance ramps up rapidly after we do I/O. */
- distance = stream->distance * 2;
- distance = Min(distance, stream->max_pinned_buffers);
- stream->distance = distance;
+ /*
+ * If the IO was executed synchronously, we will never see
+ * WaitReadBuffers() block. Treat it as if it did block. This is
+ * particularly crucial when effective_io_concurrency=0 is used, as
+ * all IO will be synchronous. Without treating synchronous IO as
+ * having waited, we'd never allow the distance to get large enough to
+ * allow for IO combining, resulting in bad performance.
+ */
+ if (stream->ios[io_index].op.flags & READ_BUFFERS_SYNCHRONOUSLY)
+ needed_wait = true;
/*
- * As we needed IO, prevent distance from being reduced within our
- * maximum look-ahead window. This avoids having distance collapse too
+ * Have the read-ahead distance ramp up rapidly after we needed to
+ * wait for IO. We only increase the read-ahead-distance when we
+ * needed to wait, to avoid increasing the distance further than
+ * necessary, as looking ahead too far can be costly, both due to the
+ * cost of unnecessarily pinning many buffers and due to doing IOs
+ * that may never be consumed if the stream is ended/reset before
+ * completion.
+ *
+ * If we did not need to wait, the current distance was evidently
+ * sufficient.
+ *
+ * NB: Must not increase the distance if we already reached the end of
+ * the stream, as stream->readahead_distance == 0 is used to keep
+ * track of having reached the end.
+ */
+ if (stream->readahead_distance > 0 && needed_wait)
+ {
+ /* wider temporary value, due to overflow risk */
+ int32 readahead_distance;
+
+ readahead_distance = stream->readahead_distance * 2;
+ readahead_distance = Min(readahead_distance, stream->max_pinned_buffers);
+ stream->readahead_distance = readahead_distance;
+ }
+
+ /*
+ * As we needed IO, prevent distances from being reduced within our
+ * maximum look-ahead window. This avoids collapsing distances too
* quickly in workloads where most of the required blocks are cached,
* but where the remaining IOs are a sufficient enough factor to cause
* a substantial slowdown if executed synchronously.
@@ -983,6 +1156,30 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
*/
stream->distance_decay_holdoff = stream->max_pinned_buffers;
+ /*
+ * Whether we needed to wait or not, allow for more IO combining if we
+ * needed to do IO. The reason to do so independent of needing to wait
+ * is that when the data is resident in the kernel page cache, IO
+ * combining reduces the syscall / dispatch overhead, making it
+ * worthwhile regardless of needing to wait.
+ *
+ * It is also important with io_uring as it will never signal the need
+ * to wait for reads if all the data is in the page cache. There are
+ * heuristics to deal with that in method_io_uring.c, but they only
+ * work when the IO gets large enough.
+ */
+ if (stream->combine_distance > 0 &&
+ stream->combine_distance < stream->io_combine_limit)
+ {
+ /* wider temporary value, due to overflow risk */
+ int32 combine_distance;
+
+ combine_distance = stream->combine_distance * 2;
+ combine_distance = Min(combine_distance, stream->io_combine_limit);
+ combine_distance = Min(combine_distance, stream->max_pinned_buffers);
+ stream->combine_distance = combine_distance;
+ }
+
/*
* If we've reached the first block of a sequential region we're
* issuing advice for, cancel that until the next jump. The kernel
@@ -1048,7 +1245,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
if (stream->ios_in_progress == 0 &&
stream->forwarded_buffers == 0 &&
stream->pinned_buffers == 1 &&
- stream->distance == 1 &&
+ stream->readahead_distance == 1 &&
+ stream->combine_distance == 1 &&
stream->pending_read_nblocks == 0 &&
stream->per_buffer_data_size == 0)
{
@@ -1094,8 +1292,10 @@ read_stream_next_block(ReadStream *stream, BufferAccessStrategy *strategy)
BlockNumber
read_stream_pause(ReadStream *stream)
{
- stream->resume_distance = stream->distance;
- stream->distance = 0;
+ stream->resume_readahead_distance = stream->readahead_distance;
+ stream->resume_combine_distance = stream->combine_distance;
+ stream->readahead_distance = 0;
+ stream->combine_distance = 0;
return InvalidBlockNumber;
}
@@ -1107,7 +1307,8 @@ read_stream_pause(ReadStream *stream)
void
read_stream_resume(ReadStream *stream)
{
- stream->distance = stream->resume_distance;
+ stream->readahead_distance = stream->resume_readahead_distance;
+ stream->combine_distance = stream->resume_combine_distance;
}
/*
@@ -1123,7 +1324,8 @@ read_stream_reset(ReadStream *stream)
Buffer buffer;
/* Stop looking ahead. */
- stream->distance = 0;
+ stream->readahead_distance = 0;
+ stream->combine_distance = 0;
/* Forget buffered block number and fast path state. */
stream->buffered_blocknum = InvalidBlockNumber;
@@ -1155,8 +1357,10 @@ read_stream_reset(ReadStream *stream)
Assert(stream->ios_in_progress == 0);
/* Start off assuming data is cached. */
- stream->distance = 1;
- stream->resume_distance = stream->distance;
+ stream->readahead_distance = 1;
+ stream->combine_distance = 1;
+ stream->resume_readahead_distance = stream->readahead_distance;
+ stream->resume_combine_distance = stream->combine_distance;
stream->distance_decay_holdoff = 0;
}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 3cc0b0bdd929f..cf4f4246ca2b4 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -840,7 +840,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
{
PinLocalBuffer(bufHdr, true);
- pgBufferUsage.local_blks_hit++;
+ INSTR_BUFUSAGE_INCR(local_blks_hit);
return true;
}
@@ -861,7 +861,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
{
if (BufferTagsEqual(&tag, &bufHdr->tag))
{
- pgBufferUsage.shared_blks_hit++;
+ INSTR_BUFUSAGE_INCR(shared_blks_hit);
return true;
}
UnpinBuffer(bufHdr);
@@ -1266,9 +1266,9 @@ PinBufferForBlock(Relation rel,
if (rel)
{
/*
- * While pgBufferUsage's "read" counter isn't bumped unless we reach
- * WaitReadBuffers() (so, not for hits, and not for buffers that are
- * zeroed instead), the per-relation stats always count them.
+ * While the current buffer usage "read" counter isn't bumped unless
+ * we reach WaitReadBuffers() (so, not for hits, and not for buffers
+ * that are zeroed instead), the per-relation stats always count them.
*/
pgstat_count_buffer_read(rel);
}
@@ -1684,9 +1684,9 @@ TrackBufferHit(IOObject io_object, IOContext io_context,
true);
if (persistence == RELPERSISTENCE_TEMP)
- pgBufferUsage.local_blks_hit += 1;
+ INSTR_BUFUSAGE_INCR(local_blks_hit);
else
- pgBufferUsage.shared_blks_hit += 1;
+ INSTR_BUFUSAGE_INCR(shared_blks_hit);
pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
@@ -2148,9 +2148,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
io_start, 1, io_buffers_len * BLCKSZ);
if (persistence == RELPERSISTENCE_TEMP)
- pgBufferUsage.local_blks_read += io_buffers_len;
+ INSTR_BUFUSAGE_ADD(local_blks_read, io_buffers_len);
else
- pgBufferUsage.shared_blks_read += io_buffers_len;
+ INSTR_BUFUSAGE_ADD(shared_blks_read, io_buffers_len);
/*
* Track vacuum cost when issuing IO, not after waiting for it. Otherwise
@@ -3043,7 +3043,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
}
- pgBufferUsage.shared_blks_written += extend_by;
+ INSTR_BUFUSAGE_ADD(shared_blks_written, extend_by);
*extended_by = extend_by;
@@ -3189,7 +3189,7 @@ MarkBufferDirty(Buffer buffer)
*/
if (!(old_buf_state & BM_DIRTY))
{
- pgBufferUsage.shared_blks_dirtied++;
+ INSTR_BUFUSAGE_INCR(shared_blks_dirtied);
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
}
@@ -4601,7 +4601,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
IOOP_WRITE, io_start, 1, BLCKSZ);
- pgBufferUsage.shared_blks_written++;
+ INSTR_BUFUSAGE_INCR(shared_blks_written);
/*
* Mark the buffer as clean and end the BM_IO_IN_PROGRESS state.
@@ -5796,7 +5796,7 @@ MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate,
UnlockBufHdr(bufHdr);
}
- pgBufferUsage.shared_blks_dirtied++;
+ INSTR_BUFUSAGE_INCR(shared_blks_dirtied);
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
}
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 396da84b25c55..851b99056d571 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -218,7 +218,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
/* Mark not-dirty */
TerminateLocalBufferIO(bufHdr, true, 0, false);
- pgBufferUsage.local_blks_written++;
+ INSTR_BUFUSAGE_INCR(local_blks_written);
}
static Buffer
@@ -479,7 +479,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr,
*extended_by = extend_by;
- pgBufferUsage.local_blks_written += extend_by;
+ INSTR_BUFUSAGE_ADD(local_blks_written, extend_by);
return first_block;
}
@@ -510,7 +510,7 @@ MarkLocalBufferDirty(Buffer buffer)
buf_state = pg_atomic_read_u64(&bufHdr->state);
if (!(buf_state & BM_DIRTY))
- pgBufferUsage.local_blks_dirtied++;
+ INSTR_BUFUSAGE_INCR(local_blks_dirtied);
buf_state |= BM_DIRTY;
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index c4afe4d368a34..8b501dfcadd02 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -475,13 +475,13 @@ BufFileLoadBuffer(BufFile *file)
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
- INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_read_time, io_time, io_start);
+ INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_read_time, io_time, io_start);
}
/* we choose not to advance curOffset here */
if (file->nbytes > 0)
- pgBufferUsage.temp_blks_read++;
+ INSTR_BUFUSAGE_INCR(temp_blks_read);
}
/*
@@ -549,13 +549,13 @@ BufFileDumpBuffer(BufFile *file)
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
- INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_write_time, io_time, io_start);
+ INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_write_time, io_time, io_start);
}
file->curOffset += bytestowrite;
wpos += bytestowrite;
- pgBufferUsage.temp_blks_written++;
+ INSTR_BUFUSAGE_INCR(temp_blks_written);
}
file->dirty = false;
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index 2be26e9228361..e7fc7f071d84e 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -114,9 +114,9 @@ pgstat_prepare_io_time(bool track_io_guc)
* pg_stat_database only counts block read and write times, these are done for
* IOOP_READ, IOOP_WRITE and IOOP_EXTEND.
*
- * pgBufferUsage is used for EXPLAIN. pgBufferUsage has write and read stats
- * for shared, local and temporary blocks. pg_stat_io does not track the
- * activity of temporary blocks, so these are ignored here.
+ * Executor instrumentation is used for EXPLAIN. Buffer usage tracked there has
+ * write and read stats for shared, local and temporary blocks. pg_stat_io
+ * does not track the activity of temporary blocks, so these are ignored here.
*/
void
pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
@@ -135,17 +135,17 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
{
pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
if (io_object == IOOBJECT_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(shared_blk_write_time, io_time);
else if (io_object == IOOBJECT_TEMP_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(local_blk_write_time, io_time);
}
else if (io_op == IOOP_READ)
{
pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
if (io_object == IOOBJECT_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(shared_blk_read_time, io_time);
else if (io_object == IOOBJECT_TEMP_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(local_blk_read_time, io_time);
}
}
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index f0b4d795071af..a8cbdf247c866 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -459,6 +459,7 @@ extern TimestampTz GetCurrentTransactionStopTimestamp(void);
extern void SetCurrentStatementStartTimestamp(void);
extern int GetCurrentTransactionNestLevel(void);
extern bool TransactionIdIsCurrentTransactionId(TransactionId xid);
+extern int GetTopReadOnlyTransactionNestLevel(void);
extern void CommandCounterIncrement(void);
extern void ForceSyncCommit(void);
extern void StartTransactionCommand(void);
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index a38e95bc0eb59..9aee822634781 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -71,6 +71,7 @@ extern void index_check_primary_key(Relation heapRel,
#define INDEX_CREATE_IF_NOT_EXISTS (1 << 4)
#define INDEX_CREATE_PARTITIONED (1 << 5)
#define INDEX_CREATE_INVALID (1 << 6)
+#define INDEX_CREATE_SUPPRESS_PROGRESS (1 << 7)
extern Oid index_create(Relation heapRelation,
const char *indexRelationName,
@@ -101,10 +102,9 @@ extern Oid index_create(Relation heapRelation,
#define INDEX_CONSTR_CREATE_REMOVE_OLD_DEPS (1 << 4)
#define INDEX_CONSTR_CREATE_WITHOUT_OVERLAPS (1 << 5)
-extern Oid index_concurrently_create_copy(Relation heapRelation,
- Oid oldIndexId,
- Oid tablespaceOid,
- const char *newName);
+extern Oid index_create_copy(Relation heapRelation, uint16 flags,
+ Oid oldIndexId, Oid tablespaceOid,
+ const char *newName);
extern void index_concurrently_build(Oid heapRelationId,
Oid indexRelationId);
@@ -149,7 +149,8 @@ extern void index_build(Relation heapRelation,
Relation indexRelation,
IndexInfo *indexInfo,
bool isreindex,
- bool parallel);
+ bool parallel,
+ bool progress);
extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot);
diff --git a/src/include/commands/explain_dr.h b/src/include/commands/explain_dr.h
index f98eaae186457..ab5c53023e1e6 100644
--- a/src/include/commands/explain_dr.h
+++ b/src/include/commands/explain_dr.h
@@ -23,11 +23,10 @@ typedef struct ExplainState ExplainState;
typedef struct SerializeMetrics
{
uint64 bytesSent; /* # of bytes serialized */
- instr_time timeSpent; /* time spent serializing */
- BufferUsage bufferUsage; /* buffers accessed during serialization */
+ Instrumentation instr; /* time and buffer usage */
} SerializeMetrics;
extern DestReceiver *CreateExplainSerializeDestReceiver(ExplainState *es);
-extern SerializeMetrics GetSerializationMetrics(DestReceiver *dest);
+extern SerializeMetrics *GetSerializationMetrics(DestReceiver *dest);
#endif
diff --git a/src/include/executor/execParallel.h b/src/include/executor/execParallel.h
index 5a2034811d563..6c8b602d07f98 100644
--- a/src/include/executor/execParallel.h
+++ b/src/include/executor/execParallel.h
@@ -25,9 +25,8 @@ typedef struct ParallelExecutorInfo
{
PlanState *planstate; /* plan subtree we're running in parallel */
ParallelContext *pcxt; /* parallel context we're using */
- BufferUsage *buffer_usage; /* points to bufusage area in DSM */
- WalUsage *wal_usage; /* walusage area in DSM */
- SharedExecutorInstrumentation *instrumentation; /* optional */
+ Instrumentation *instrumentation; /* instrumentation area in DSM */
+ SharedExecutorInstrumentation *node_instrumentation; /* optional */
struct SharedJitInstrumentation *jit_instrumentation; /* optional */
dsa_area *area; /* points to DSA area in DSM */
dsa_pointer param_exec; /* serialized PARAM_EXEC parameters */
diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h
index d3a572428449d..340029a203422 100644
--- a/src/include/executor/execdesc.h
+++ b/src/include/executor/execdesc.h
@@ -51,8 +51,8 @@ typedef struct QueryDesc
/* This field is set by ExecutePlan */
bool already_executed; /* true if previously executed */
- /* This is always set NULL by the core system, but plugins can change it */
- struct Instrumentation *totaltime; /* total time spent in ExecutorRun */
+ /* This field is set by ExecutorRun, or plugins */
+ struct QueryInstrumentation *totaltime; /* total time spent in ExecutorRun */
} QueryDesc;
/* in pquery.c */
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 491c48865066a..03f0e864176eb 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -233,6 +233,7 @@ ExecGetJunkAttribute(TupleTableSlot *slot, AttrNumber attno, bool *isNull)
/*
* prototypes from functions in execMain.c
*/
+typedef struct QueryInstrumentation QueryInstrumentation;
extern void ExecutorStart(QueryDesc *queryDesc, int eflags);
extern void standard_ExecutorStart(QueryDesc *queryDesc, int eflags);
extern void ExecutorRun(QueryDesc *queryDesc,
@@ -254,7 +255,7 @@ extern void InitResultRelInfo(ResultRelInfo *resultRelInfo,
Relation resultRelationDesc,
Index resultRelationIndex,
ResultRelInfo *partition_root_rri,
- int instrument_options);
+ QueryInstrumentation *qinstr);
extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid,
ResultRelInfo *rootRelInfo);
extern List *ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo);
@@ -301,6 +302,8 @@ extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function);
extern Node *MultiExecProcNode(PlanState *node);
extern void ExecEndNode(PlanState *node);
extern void ExecShutdownNode(PlanState *node);
+extern void ExecFinalizeNodeInstrumentation(PlanState *node);
+extern void ExecFinalizeWorkerInstrumentation(PlanState *node);
extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node);
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 9759f3ea5d8d9..bae8a9b0e62ed 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -13,6 +13,7 @@
#ifndef INSTRUMENT_H
#define INSTRUMENT_H
+#include "lib/ilist.h"
#include "portability/instr_time.h"
@@ -67,56 +68,270 @@ typedef enum InstrumentOption
INSTRUMENT_ALL = PG_INT32_MAX
} InstrumentOption;
+/*
+ * Instrumentation base class for capturing time and WAL/buffer usage
+ *
+ * If used directly:
+ * - Allocate on the stack and zero initialize the struct
+ * - Call InstrInitOptions to set instrumentation options
+ * - Call InstrStart before the activity you want to measure
+ * - Call InstrStop / InstrStopFinalize after the activity to capture totals
+ *
+ * InstrStart/InstrStop may be called multiple times. The last stop call must
+ * be to InstrStopFinalize to ensure parent stack entries get the accumulated
+ * totals. If there is risk of transaction aborts you must call
+ * InstrStopFinalize in a PG_TRY/PG_FINALLY block to avoid corrupting the
+ * instrumentation stack.
+ *
+ * In a query context use QueryInstrumentation instead, which handles aborts
+ * using the resource owner logic.
+ */
typedef struct Instrumentation
{
- /* Parameters set at node creation: */
+ /* Parameters set at creation: */
bool need_timer; /* true if we need timer data */
- bool need_bufusage; /* true if we need buffer usage data */
- bool need_walusage; /* true if we need WAL usage data */
+ bool need_stack; /* true if we need WAL/buffer usage data */
+ /* Internal state keeping: */
+ bool on_stack; /* true if currently on instr_stack */
+ instr_time starttime; /* start time of last InstrStart */
+ /* Accumulated statistics: */
+ instr_time total; /* total runtime */
+ BufferUsage bufusage; /* total buffer usage */
+ WalUsage walusage; /* total WAL usage */
+ /* Abort handling: link in parent QueryInstrumentation's unfinalized list */
+ dlist_node unfinalized_entry;
+} Instrumentation;
+
+/*
+ * Query-related instrumentation tracking.
+ *
+ * Usage:
+ * - Allocate on the heap using InstrQueryAlloc (required for abort handling)
+ * - Call InstrQueryStart before the activity you want to measure
+ * - Call InstrQueryStop / InstrQueryStopFinalize afterwards to capture totals
+ *
+ * InstrQueryStart/InstrQueryStop may be called multiple times. The last stop
+ * call must be to InstrQueryStopFinalize to ensure parent stack entries get
+ * the accumulated totals.
+ *
+ * Uses the resource owner mechanism for handling aborts; as such, the caller
+ * must *not* exit the top-level transaction after having called
+ * InstrQueryStart without first calling InstrQueryStop or
+ * InstrQueryStopFinalize. On transaction abort, logic equivalent to
+ * InstrQueryStopFinalize is invoked automatically.
+ */
+struct ResourceOwnerData;
+typedef struct QueryInstrumentation
+{
+ Instrumentation instr;
+
+ /* Original instrument_options flags used to create this instrumentation */
+ int instrument_options;
+
+ /* Resource owner used for cleanup on aborts between InstrQueryStart/InstrQueryStop */
+ struct ResourceOwnerData *owner;
+
+ /*
+ * Dedicated memory context for all instrumentation allocations belonging
+ * to this query (node instrumentation, trigger instrumentation, etc.).
+ * Initially a child of TopMemoryContext so it survives transaction abort
+ * for ResourceOwner cleanup, which is then reassigned to the current
+ * memory context on InstrQueryStopFinalize.
+ */
+ MemoryContext instr_cxt;
+
+ /*
+ * Child entries that need to be cleaned up on abort, since they are not
+ * registered as a resource owner themselves. Contains both node and
+ * trigger instrumentation entries linked via instr.unfinalized_entry.
+ */
+ dlist_head unfinalized_entries;
+} QueryInstrumentation;
+
+/*
+ * Specialized instrumentation for per-node execution statistics
+ *
+ * Relies on an outer QueryInstrumentation having been set up to handle the
+ * stack used for WAL/buffer usage statistics, and relies on it for managing
+ * aborts. Solely intended for the executor and anyone reporting about its
+ * activities (e.g. EXPLAIN ANALYZE).
+ */
+typedef struct NodeInstrumentation
+{
+ Instrumentation instr;
+ /* Parameters set at node creation: */
bool async_mode; /* true if node is in async mode */
/* Info about current plan cycle: */
bool running; /* true if we've completed first tuple */
- instr_time starttime; /* start time of current iteration of node */
instr_time counter; /* accumulated runtime for this node */
instr_time firsttuple; /* time for first tuple of this cycle */
double tuplecount; /* # of tuples emitted so far this cycle */
- BufferUsage bufusage_start; /* buffer usage at start */
- WalUsage walusage_start; /* WAL usage at start */
/* Accumulated statistics across all completed cycles: */
instr_time startup; /* total startup time */
- instr_time total; /* total time */
double ntuples; /* total tuples produced */
double ntuples2; /* secondary node-specific tuple counter */
double nloops; /* # of run cycles for this node */
double nfiltered1; /* # of tuples removed by scanqual or joinqual */
double nfiltered2; /* # of tuples removed by "other" quals */
- BufferUsage bufusage; /* total buffer usage */
- WalUsage walusage; /* total WAL usage */
-} Instrumentation;
+} NodeInstrumentation;
-typedef struct WorkerInstrumentation
+/*
+ * Care must be taken with any pointers contained within this struct, as this
+ * gets copied across processes during parallel query execution.
+ */
+typedef struct WorkerNodeInstrumentation
{
int num_workers; /* # of structures that follow */
- Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER];
-} WorkerInstrumentation;
+ NodeInstrumentation instrument[FLEXIBLE_ARRAY_MEMBER];
+} WorkerNodeInstrumentation;
+
+typedef struct TriggerInstrumentation
+{
+ Instrumentation instr;
+ int firings; /* number of times the instrumented trigger
+ * was fired */
+} TriggerInstrumentation;
+
+/*
+ * Dynamic array-based stack for tracking current WAL/buffer usage context.
+ *
+ * When the stack is empty, 'current' points to instr_top which accumulates
+ * session-level totals.
+ */
+typedef struct InstrStackState
+{
+ int stack_space; /* allocated capacity of entries array */
+ int stack_size; /* current number of entries */
+
+ Instrumentation **entries; /* dynamic array of pointers */
+ Instrumentation *current; /* top of stack, or &instr_top when empty */
+} InstrStackState;
-extern PGDLLIMPORT BufferUsage pgBufferUsage;
extern PGDLLIMPORT WalUsage pgWalUsage;
-extern Instrumentation *InstrAlloc(int n, int instrument_options,
- bool async_mode);
-extern void InstrInit(Instrumentation *instr, int instrument_options);
-extern void InstrStartNode(Instrumentation *instr);
-extern void InstrStopNode(Instrumentation *instr, double nTuples);
-extern void InstrUpdateTupleCount(Instrumentation *instr, double nTuples);
-extern void InstrEndLoop(Instrumentation *instr);
-extern void InstrAggNode(Instrumentation *dst, Instrumentation *add);
-extern void InstrStartParallelQuery(void);
-extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage);
-extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage);
-extern void BufferUsageAccumDiff(BufferUsage *dst,
- const BufferUsage *add, const BufferUsage *sub);
+/*
+ * The top instrumentation represents a running total of the current backend
+ * WAL/buffer usage information. This will not be updated immediately, but
+ * rather when the current stack entry gets accumulated which typically happens
+ * at query end.
+ *
+ * Care must be taken when utilizing this in the parallel worker context:
+ * Parallel workers will report back their instrumentation to the caller,
+ * and this gets added to the caller's stack. If this were to be used in the
+ * shared memory stats infrastructure it would need to be skipped on parallel
+ * workers to avoid double counting.
+ */
+extern PGDLLIMPORT Instrumentation instr_top;
+
+/*
+ * The instrumentation stack state. The 'current' field points to the
+ * currently active stack entry that is getting updated as activity happens,
+ * and will be accumulated to parent stacks when it gets finalized by
+ * InstrStop (for non-executor use cases), ExecFinalizeNodeInstrumentation
+ * (executor finish) or ResOwnerReleaseInstrumentation on abort.
+ */
+extern PGDLLIMPORT InstrStackState instr_stack;
+
+extern void InstrStackGrow(void);
+
+/*
+ * Pushes the stack so that all WAL/buffer usage updates go to the passed in
+ * instrumentation entry.
+ *
+ * See note on InstrPopStack regarding safe use of these functions.
+ */
+static inline void
+InstrPushStack(Instrumentation *instr)
+{
+ if (unlikely(instr_stack.stack_size == instr_stack.stack_space))
+ InstrStackGrow();
+
+ instr_stack.entries[instr_stack.stack_size++] = instr;
+ instr_stack.current = instr;
+ instr->on_stack = true;
+}
+
+/*
+ * Pops the stack entry back to the previous one that was effective at
+ * InstrPushStack.
+ *
+ * Callers must ensure that no intermediate stack entries are skipped, to
+ * handle aborts correctly. If you're thinking of calling this in a PG_FINALLY
+ * block, consider instead using InstrStart + InstrStopFinalize which can skip
+ * intermediate stack entries.
+ */
+static inline void
+InstrPopStack(Instrumentation *instr)
+{
+ Assert(instr_stack.stack_size > 0);
+ Assert(instr_stack.entries[instr_stack.stack_size - 1] == instr);
+ instr_stack.stack_size--;
+ instr_stack.current = instr_stack.stack_size > 0
+ ? instr_stack.entries[instr_stack.stack_size - 1]
+ : &instr_top;
+ instr->on_stack = false;
+}
+
+extern void InstrInitOptions(Instrumentation *instr, int instrument_options);
+extern void InstrStart(Instrumentation *instr);
+extern void InstrStop(Instrumentation *instr);
+extern void InstrStopFinalize(Instrumentation *instr);
+extern void InstrFinalizeChild(Instrumentation *instr, Instrumentation *parent);
+extern void InstrAccumStack(Instrumentation *dst, Instrumentation *add);
+
+extern QueryInstrumentation *InstrQueryAlloc(int instrument_options);
+extern void InstrQueryStart(QueryInstrumentation *instr);
+extern void InstrQueryStop(QueryInstrumentation *instr);
+extern void InstrQueryStopFinalize(QueryInstrumentation *instr);
+extern void InstrQueryRememberChild(QueryInstrumentation *parent, Instrumentation *instr);
+
+pg_nodiscard extern QueryInstrumentation *InstrStartParallelQuery(void);
+extern void InstrEndParallelQuery(QueryInstrumentation *qinstr, Instrumentation *dst);
+extern void InstrAccumParallelQuery(Instrumentation *instr);
+
+extern NodeInstrumentation *InstrAllocNode(QueryInstrumentation *qinstr, bool async_mode);
+extern void InstrInitNode(NodeInstrumentation *instr, int instrument_options);
+extern void InstrStartNode(NodeInstrumentation *instr);
+extern void InstrStopNode(NodeInstrumentation *instr, double nTuples);
+extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples);
+extern void InstrEndLoop(NodeInstrumentation *instr);
+extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add);
+
+typedef struct TupleTableSlot TupleTableSlot;
+typedef struct PlanState PlanState;
+typedef TupleTableSlot *(*ExecProcNodeMtd) (PlanState *pstate);
+extern ExecProcNodeMtd InstrNodeSetupExecProcNode(NodeInstrumentation *instr);
+
+extern TriggerInstrumentation *InstrAllocTrigger(QueryInstrumentation *qinstr, int n);
+extern void InstrStartTrigger(QueryInstrumentation *qinstr,
+ TriggerInstrumentation *tginstr);
+extern void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings);
+
+extern void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
+extern void WalUsageAdd(WalUsage *dst, const WalUsage *add);
extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add,
const WalUsage *sub);
+#define INSTR_BUFUSAGE_INCR(fld) do { \
+ instr_stack.current->bufusage.fld++; \
+ } while(0)
+#define INSTR_BUFUSAGE_ADD(fld,val) do { \
+ instr_stack.current->bufusage.fld += (val); \
+ } while(0)
+#define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \
+ INSTR_TIME_ADD(instr_stack.current->bufusage.fld, val); \
+ } while (0)
+#define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \
+ INSTR_TIME_ACCUM_DIFF(instr_stack.current->bufusage.fld, endval, startval); \
+ } while (0)
+
+#define INSTR_WALUSAGE_INCR(fld) do { \
+ pgWalUsage.fld++; \
+ instr_stack.current->walusage.fld++; \
+ } while(0)
+#define INSTR_WALUSAGE_ADD(fld,val) do { \
+ pgWalUsage.fld += (val); \
+ instr_stack.current->walusage.fld += (val); \
+ } while(0)
+
#endif /* INSTRUMENT_H */
diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h
index 2a0ff377a7312..e2315cef384c6 100644
--- a/src/include/executor/instrument_node.h
+++ b/src/include/executor/instrument_node.h
@@ -18,6 +18,8 @@
#ifndef INSTRUMENT_NODE_H
#define INSTRUMENT_NODE_H
+#include "executor/instrument.h"
+
/* ---------------------
* Instrumentation information for aggregate function execution
@@ -48,6 +50,9 @@ typedef struct IndexScanInstrumentation
{
/* Index search count (incremented with pgstat_count_index_scan call) */
uint64 nsearches;
+
+ /* Instrumentation utilized for tracking buffer usage during table access */
+ Instrumentation table_instr;
} IndexScanInstrumentation;
/*
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 090cfccf65fa0..b28288aa1e8db 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -54,12 +54,15 @@ typedef struct Instrumentation Instrumentation;
typedef struct pairingheap pairingheap;
typedef struct PlanState PlanState;
typedef struct QueryEnvironment QueryEnvironment;
+typedef struct QueryInstrumentation QueryInstrumentation;
typedef struct RelationData *Relation;
typedef Relation *RelationPtr;
typedef struct ScanKeyData ScanKeyData;
typedef struct SnapshotData *Snapshot;
typedef struct SortSupportData *SortSupport;
typedef struct TIDBitmap TIDBitmap;
+typedef struct NodeInstrumentation NodeInstrumentation;
+typedef struct TriggerInstrumentation TriggerInstrumentation;
typedef struct TupleConversionMap TupleConversionMap;
typedef struct TupleDescData *TupleDesc;
typedef struct Tuplesortstate Tuplesortstate;
@@ -67,7 +70,7 @@ typedef struct Tuplestorestate Tuplestorestate;
typedef struct TupleTableSlot TupleTableSlot;
typedef struct TupleTableSlotOps TupleTableSlotOps;
typedef struct WalUsage WalUsage;
-typedef struct WorkerInstrumentation WorkerInstrumentation;
+typedef struct WorkerNodeInstrumentation WorkerNodeInstrumentation;
/* ----------------
@@ -552,7 +555,7 @@ typedef struct ResultRelInfo
ExprState **ri_TrigWhenExprs;
/* optional runtime measurements for triggers */
- Instrumentation *ri_TrigInstrument;
+ TriggerInstrumentation *ri_TrigInstrument;
/* On-demand created slots for triggers / returning processing */
TupleTableSlot *ri_ReturningSlot; /* for trigger output tuples */
@@ -751,7 +754,7 @@ typedef struct EState
* ExecutorRun() calls. */
int es_top_eflags; /* eflags passed to ExecutorStart */
- int es_instrument; /* OR of InstrumentOption flags */
+ QueryInstrumentation *es_instrument; /* query-level instrumentation */
bool es_finished; /* true when ExecutorFinish is done */
List *es_exprcontexts; /* List of ExprContexts within EState */
@@ -1206,8 +1209,10 @@ typedef struct PlanState
ExecProcNodeMtd ExecProcNodeReal; /* actual function, if above is a
* wrapper */
- Instrumentation *instrument; /* Optional runtime stats for this node */
- WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */
+ NodeInstrumentation *instrument; /* Optional runtime stats for this
+ * node */
+ WorkerNodeInstrumentation *worker_instrument; /* per-worker
+ * instrumentation */
/* Per-worker JIT instrumentation */
struct SharedJitInstrumentation *worker_jit_instrument;
diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h
index ccded021433b0..a22a83a2f237c 100644
--- a/src/include/replication/snapbuild.h
+++ b/src/include/replication/snapbuild.h
@@ -15,6 +15,14 @@
#include "access/xlogdefs.h"
#include "utils/snapmgr.h"
+/*
+ * forward declarations in this file
+ */
+typedef struct ReorderBuffer ReorderBuffer;
+typedef struct SnapBuild SnapBuild;
+typedef struct xl_heap_new_cid xl_heap_new_cid;
+typedef struct xl_running_xacts xl_running_xacts;
+
/*
* Please keep get_snapbuild_state_desc() (located in the pg_logicalinspect
* module) updated if a change needs to be made to SnapBuildState.
@@ -50,20 +58,11 @@ typedef enum
SNAPBUILD_CONSISTENT = 2,
} SnapBuildState;
-/* forward declare so we don't have to include snapbuild_internal.h */
-struct SnapBuild;
-typedef struct SnapBuild SnapBuild;
-
-/* forward declare so we don't have to include reorderbuffer.h */
-struct ReorderBuffer;
-/* forward declare so we don't have to include heapam_xlog.h */
-struct xl_heap_new_cid;
-struct xl_running_xacts;
extern void CheckPointSnapBuild(void);
-extern SnapBuild *AllocateSnapshotBuilder(struct ReorderBuffer *reorder,
+extern SnapBuild *AllocateSnapshotBuilder(ReorderBuffer *reorder,
TransactionId xmin_horizon, XLogRecPtr start_lsn,
bool need_full_snapshot,
bool in_slot_creation,
@@ -91,9 +90,9 @@ extern bool SnapBuildProcessChange(SnapBuild *builder, TransactionId xid,
XLogRecPtr lsn);
extern void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
XLogRecPtr lsn,
- struct xl_heap_new_cid *xlrec);
+ xl_heap_new_cid *xlrec);
extern void SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn,
- struct xl_running_xacts *running);
+ xl_running_xacts *running);
extern void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn);
extern bool SnapBuildSnapshotExists(XLogRecPtr lsn);
diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h
index eb6033b4fdb65..5463bc921f06e 100644
--- a/src/include/utils/resowner.h
+++ b/src/include/utils/resowner.h
@@ -75,6 +75,7 @@ typedef uint32 ResourceReleasePriority;
#define RELEASE_PRIO_SNAPSHOT_REFS 500
#define RELEASE_PRIO_FILES 600
#define RELEASE_PRIO_WAITEVENTSETS 700
+#define RELEASE_PRIO_INSTRUMENTATION 800
/* 0 is considered invalid */
#define RELEASE_PRIO_FIRST 1
diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c
index 72d70aea1e164..591e23df44b45 100644
--- a/src/port/pg_crc32c_armv8_choose.c
+++ b/src/port/pg_crc32c_armv8_choose.c
@@ -108,7 +108,8 @@ pg_crc32c_armv8_available(void)
#endif
}
-static inline bool
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+static bool
pg_pmull_available(void)
{
#if defined(__aarch64__) && defined(HWCAP_PMULL)
@@ -128,6 +129,7 @@ pg_pmull_available(void)
return false;
#endif
}
+#endif /* USE_PMULL_CRC32C_WITH_RUNTIME_CHECK */
/*
* This gets called on the first call. It replaces the function pointer
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 864b407abcff7..c5ace162fe23c 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -48,6 +48,7 @@ SUBDIRS = \
test_resowner \
test_rls_hooks \
test_saslprep \
+ test_session_buffer_usage \
test_shm_mq \
test_slru \
test_tidstore \
diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build
index e5acacd508368..802cc93d71a48 100644
--- a/src/test/modules/meson.build
+++ b/src/test/modules/meson.build
@@ -49,6 +49,7 @@ subdir('test_regex')
subdir('test_resowner')
subdir('test_rls_hooks')
subdir('test_saslprep')
+subdir('test_session_buffer_usage')
subdir('test_shm_mq')
subdir('test_slru')
subdir('test_tidstore')
diff --git a/src/test/modules/test_session_buffer_usage/Makefile b/src/test/modules/test_session_buffer_usage/Makefile
new file mode 100644
index 0000000000000..1252b222cb9f8
--- /dev/null
+++ b/src/test/modules/test_session_buffer_usage/Makefile
@@ -0,0 +1,23 @@
+# src/test/modules/test_session_buffer_usage/Makefile
+
+MODULE_big = test_session_buffer_usage
+OBJS = \
+ $(WIN32RES) \
+ test_session_buffer_usage.o
+
+EXTENSION = test_session_buffer_usage
+DATA = test_session_buffer_usage--1.0.sql
+PGFILEDESC = "test_session_buffer_usage - show buffer usage statistics for the current session"
+
+REGRESS = test_session_buffer_usage
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_session_buffer_usage
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_session_buffer_usage/expected/test_session_buffer_usage.out b/src/test/modules/test_session_buffer_usage/expected/test_session_buffer_usage.out
new file mode 100644
index 0000000000000..5f7d349871af8
--- /dev/null
+++ b/src/test/modules/test_session_buffer_usage/expected/test_session_buffer_usage.out
@@ -0,0 +1,342 @@
+LOAD 'test_session_buffer_usage';
+CREATE EXTENSION test_session_buffer_usage;
+-- Verify all columns are non-negative
+SELECT count(*) = 1 AS ok FROM test_session_buffer_usage()
+WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0
+ AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0
+ AND local_blks_hit >= 0 AND local_blks_read >= 0
+ AND local_blks_dirtied >= 0 AND local_blks_written >= 0
+ AND temp_blks_read >= 0 AND temp_blks_written >= 0
+ AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0
+ AND local_blk_read_time >= 0 AND local_blk_write_time >= 0
+ AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0;
+ ok
+----
+ t
+(1 row)
+
+-- Verify counters increase after buffer activity
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+CREATE TEMP TABLE test_buf_activity (id int, data text);
+INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i;
+SELECT count(*) FROM test_buf_activity;
+ count
+-------
+ 1000
+(1 row)
+
+SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased
+FROM test_session_buffer_usage();
+ blocks_increased
+------------------
+ t
+(1 row)
+
+DROP TABLE test_buf_activity;
+-- Parallel query test
+CREATE TABLE par_dc_tab (a int, b char(200));
+INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+SELECT count(*) FROM par_dc_tab;
+ count
+-------
+ 5000
+(1 row)
+
+-- Measure serial scan delta (leader does all the work)
+SET max_parallel_workers_per_gather = 0;
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+SELECT count(*) FROM par_dc_tab;
+ count
+-------
+ 5000
+(1 row)
+
+CREATE TEMP TABLE dc_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM test_session_buffer_usage();
+-- Measure parallel scan delta with leader NOT participating in scanning.
+-- Workers do all table scanning; leader only runs the Gather node.
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+SELECT count(*) FROM par_dc_tab;
+ count
+-------
+ 5000
+(1 row)
+
+-- Confirm we got a similar hit counter through parallel worker accumulation
+SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2
+ AS leader_buffers_match
+FROM test_session_buffer_usage(), dc_serial_result s;
+ leader_buffers_match
+----------------------
+ t
+(1 row)
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+DROP TABLE par_dc_tab, dc_serial_result;
+--
+-- Abort/exception tests: verify buffer usage survives various error paths.
+--
+-- Rolled-back divide-by-zero under EXPLAIN ANALYZE
+CREATE TEMP TABLE exc_tab (a int, b char(20));
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 FROM ins;
+ERROR: division by zero
+SELECT local_blks_dirtied > 0 AS exception_buffers_visible
+FROM test_session_buffer_usage();
+ exception_buffers_visible
+---------------------------
+ t
+(1 row)
+
+DROP TABLE exc_tab;
+-- Unique constraint violation in regular query
+CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20));
+INSERT INTO unique_tab VALUES (1, 'first');
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+INSERT INTO unique_tab VALUES (1, 'duplicate');
+ERROR: duplicate key value violates unique constraint "unique_tab_a_key"
+DETAIL: Key (a)=(1) already exists.
+SELECT local_blks_hit > 0 AS unique_violation_buffers_visible
+FROM test_session_buffer_usage();
+ unique_violation_buffers_visible
+----------------------------------
+ t
+(1 row)
+
+DROP TABLE unique_tab;
+-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION)
+CREATE TEMP TABLE subxact_tab (a int, b char(20));
+CREATE FUNCTION subxact_exc_func() RETURNS text AS $$
+BEGIN
+ BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a)
+ SELECT a / 0 FROM ins';
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+SELECT subxact_exc_func();
+ subxact_exc_func
+------------------
+ caught
+(1 row)
+
+SELECT local_blks_dirtied > 0 AS subxact_buffers_visible
+FROM test_session_buffer_usage();
+ subxact_buffers_visible
+-------------------------
+ t
+(1 row)
+
+DROP FUNCTION subxact_exc_func;
+DROP TABLE subxact_tab;
+-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking
+CREATE TEMP TABLE cursor_tab (a int, b char(200));
+INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i;
+CREATE FUNCTION cursor_exc_func() RETURNS text AS $$
+DECLARE
+ rec record;
+ cnt int := 0;
+BEGIN
+ BEGIN
+ FOR rec IN SELECT * FROM cursor_tab LOOP
+ cnt := cnt + 1;
+ IF cnt = 250 THEN
+ PERFORM 1 / 0;
+ END IF;
+ END LOOP;
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught after ' || cnt || ' rows';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+SELECT cursor_exc_func();
+ cursor_exc_func
+-----------------------
+ caught after 250 rows
+(1 row)
+
+SELECT local_blks_hit + local_blks_read > 0
+ AS cursor_subxact_buffers_visible
+FROM test_session_buffer_usage();
+ cursor_subxact_buffers_visible
+--------------------------------
+ t
+(1 row)
+
+DROP FUNCTION cursor_exc_func;
+DROP TABLE cursor_tab;
+-- Trigger abort under EXPLAIN ANALYZE: verify that buffer activity from a
+-- trigger that throws an error is still properly propagated.
+CREATE TEMP TABLE trig_err_tab (a int);
+CREATE TEMP TABLE trig_work_tab (a int, b char(200));
+INSERT INTO trig_work_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i;
+-- Warm local buffers so trig_work_tab reads become hits
+SELECT count(*) FROM trig_work_tab;
+ count
+-------
+ 500
+(1 row)
+
+CREATE FUNCTION trig_err_func() RETURNS trigger AS $$
+BEGIN
+ PERFORM count(*) FROM trig_work_tab;
+ RAISE EXCEPTION 'trigger error';
+ RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+CREATE TRIGGER trig_err BEFORE INSERT ON trig_err_tab
+ FOR EACH ROW EXECUTE FUNCTION trig_err_func();
+-- Measure how many local buffer hits a scan of trig_work_tab produces
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+SELECT count(*) FROM trig_work_tab;
+ count
+-------
+ 500
+(1 row)
+
+CREATE TEMP TABLE trig_serial_result AS
+SELECT local_blks_hit AS serial_hits FROM test_session_buffer_usage();
+-- Now trigger the same scan via a trigger that errors
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ INSERT INTO trig_err_tab VALUES (1);
+ERROR: trigger error
+CONTEXT: PL/pgSQL function trig_err_func() line 4 at RAISE
+-- The trigger scanned trig_work_tab but errored before InstrStopTrigger ran.
+-- InstrStopFinalize in the PG_CATCH ensures buffer data is still propagated.
+SELECT local_blks_hit >= s.serial_hits / 2
+ AS trigger_abort_buffers_propagated
+FROM test_session_buffer_usage(), trig_serial_result s;
+ trigger_abort_buffers_propagated
+----------------------------------
+ t
+(1 row)
+
+DROP TABLE trig_err_tab, trig_work_tab, trig_serial_result;
+DROP FUNCTION trig_err_func;
+-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort.
+--
+-- When a parallel worker aborts, InstrEndParallelQuery and
+-- ExecParallelReportInstrumentation never run, so the worker's buffer
+-- activity is never written to shared memory, despite the information having been
+-- captured by the res owner release instrumentation handling.
+CREATE TABLE par_abort_tab (a int, b char(200));
+INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+-- Warm shared buffers so all reads become hits
+SELECT count(*) FROM par_abort_tab;
+ count
+-------
+ 5000
+(1 row)
+
+-- Measure serial scan delta as a reference (leader reads all blocks)
+SET max_parallel_workers_per_gather = 0;
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+CREATE TABLE par_abort_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM test_session_buffer_usage();
+-- Now force parallel with leader NOT participating in scanning
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently
+SELECT test_session_buffer_usage_reset();
+ test_session_buffer_usage_reset
+---------------------------------
+
+(1 row)
+
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+CONTEXT: parallel worker
+RESET debug_parallel_query;
+-- Workers scanned the table but aborted before reporting stats back.
+-- The leader's delta should be much less than a serial scan, documenting
+-- that worker buffer activity is lost on abort.
+SELECT shared_blks_hit < s.serial_delta / 2
+ AS worker_abort_buffers_not_propagated
+FROM test_session_buffer_usage(), par_abort_serial_result s;
+ worker_abort_buffers_not_propagated
+-------------------------------------
+ t
+(1 row)
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+DROP TABLE par_abort_tab, par_abort_serial_result;
+-- Cleanup
+DROP EXTENSION test_session_buffer_usage;
diff --git a/src/test/modules/test_session_buffer_usage/meson.build b/src/test/modules/test_session_buffer_usage/meson.build
new file mode 100644
index 0000000000000..b96f67dc7fe37
--- /dev/null
+++ b/src/test/modules/test_session_buffer_usage/meson.build
@@ -0,0 +1,33 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+test_session_buffer_usage_sources = files(
+ 'test_session_buffer_usage.c',
+)
+
+if host_system == 'windows'
+ test_session_buffer_usage_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+ '--NAME', 'test_session_buffer_usage',
+ '--FILEDESC', 'test_session_buffer_usage - show buffer usage statistics for the current session',])
+endif
+
+test_session_buffer_usage = shared_module('test_session_buffer_usage',
+ test_session_buffer_usage_sources,
+ kwargs: pg_test_mod_args,
+)
+test_install_libs += test_session_buffer_usage
+
+test_install_data += files(
+ 'test_session_buffer_usage.control',
+ 'test_session_buffer_usage--1.0.sql',
+)
+
+tests += {
+ 'name': 'test_session_buffer_usage',
+ 'sd': meson.current_source_dir(),
+ 'bd': meson.current_build_dir(),
+ 'regress': {
+ 'sql': [
+ 'test_session_buffer_usage',
+ ],
+ },
+}
diff --git a/src/test/modules/test_session_buffer_usage/sql/test_session_buffer_usage.sql b/src/test/modules/test_session_buffer_usage/sql/test_session_buffer_usage.sql
new file mode 100644
index 0000000000000..daf2159c4a653
--- /dev/null
+++ b/src/test/modules/test_session_buffer_usage/sql/test_session_buffer_usage.sql
@@ -0,0 +1,245 @@
+LOAD 'test_session_buffer_usage';
+CREATE EXTENSION test_session_buffer_usage;
+
+-- Verify all columns are non-negative
+SELECT count(*) = 1 AS ok FROM test_session_buffer_usage()
+WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0
+ AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0
+ AND local_blks_hit >= 0 AND local_blks_read >= 0
+ AND local_blks_dirtied >= 0 AND local_blks_written >= 0
+ AND temp_blks_read >= 0 AND temp_blks_written >= 0
+ AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0
+ AND local_blk_read_time >= 0 AND local_blk_write_time >= 0
+ AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0;
+
+-- Verify counters increase after buffer activity
+SELECT test_session_buffer_usage_reset();
+
+CREATE TEMP TABLE test_buf_activity (id int, data text);
+INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i;
+SELECT count(*) FROM test_buf_activity;
+
+SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased
+FROM test_session_buffer_usage();
+
+DROP TABLE test_buf_activity;
+
+-- Parallel query test
+CREATE TABLE par_dc_tab (a int, b char(200));
+INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+
+SELECT count(*) FROM par_dc_tab;
+
+-- Measure serial scan delta (leader does all the work)
+SET max_parallel_workers_per_gather = 0;
+
+SELECT test_session_buffer_usage_reset();
+SELECT count(*) FROM par_dc_tab;
+
+CREATE TEMP TABLE dc_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM test_session_buffer_usage();
+
+-- Measure parallel scan delta with leader NOT participating in scanning.
+-- Workers do all table scanning; leader only runs the Gather node.
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+
+SELECT test_session_buffer_usage_reset();
+SELECT count(*) FROM par_dc_tab;
+
+-- Confirm we got a similar hit counter through parallel worker accumulation
+SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2
+ AS leader_buffers_match
+FROM test_session_buffer_usage(), dc_serial_result s;
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+
+DROP TABLE par_dc_tab, dc_serial_result;
+
+--
+-- Abort/exception tests: verify buffer usage survives various error paths.
+--
+
+-- Rolled-back divide-by-zero under EXPLAIN ANALYZE
+CREATE TEMP TABLE exc_tab (a int, b char(20));
+
+SELECT test_session_buffer_usage_reset();
+
+EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 FROM ins;
+
+SELECT local_blks_dirtied > 0 AS exception_buffers_visible
+FROM test_session_buffer_usage();
+
+DROP TABLE exc_tab;
+
+-- Unique constraint violation in regular query
+CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20));
+INSERT INTO unique_tab VALUES (1, 'first');
+
+SELECT test_session_buffer_usage_reset();
+INSERT INTO unique_tab VALUES (1, 'duplicate');
+
+SELECT local_blks_hit > 0 AS unique_violation_buffers_visible
+FROM test_session_buffer_usage();
+
+DROP TABLE unique_tab;
+
+-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION)
+CREATE TEMP TABLE subxact_tab (a int, b char(20));
+
+CREATE FUNCTION subxact_exc_func() RETURNS text AS $$
+BEGIN
+ BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a)
+ SELECT a / 0 FROM ins';
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT test_session_buffer_usage_reset();
+SELECT subxact_exc_func();
+
+SELECT local_blks_dirtied > 0 AS subxact_buffers_visible
+FROM test_session_buffer_usage();
+
+DROP FUNCTION subxact_exc_func;
+DROP TABLE subxact_tab;
+
+-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking
+CREATE TEMP TABLE cursor_tab (a int, b char(200));
+INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i;
+
+CREATE FUNCTION cursor_exc_func() RETURNS text AS $$
+DECLARE
+ rec record;
+ cnt int := 0;
+BEGIN
+ BEGIN
+ FOR rec IN SELECT * FROM cursor_tab LOOP
+ cnt := cnt + 1;
+ IF cnt = 250 THEN
+ PERFORM 1 / 0;
+ END IF;
+ END LOOP;
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught after ' || cnt || ' rows';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT test_session_buffer_usage_reset();
+SELECT cursor_exc_func();
+
+SELECT local_blks_hit + local_blks_read > 0
+ AS cursor_subxact_buffers_visible
+FROM test_session_buffer_usage();
+
+DROP FUNCTION cursor_exc_func;
+DROP TABLE cursor_tab;
+
+-- Trigger abort under EXPLAIN ANALYZE: verify that buffer activity from a
+-- trigger that throws an error is still properly propagated.
+CREATE TEMP TABLE trig_err_tab (a int);
+CREATE TEMP TABLE trig_work_tab (a int, b char(200));
+INSERT INTO trig_work_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i;
+
+-- Warm local buffers so trig_work_tab reads become hits
+SELECT count(*) FROM trig_work_tab;
+
+CREATE FUNCTION trig_err_func() RETURNS trigger AS $$
+BEGIN
+ PERFORM count(*) FROM trig_work_tab;
+ RAISE EXCEPTION 'trigger error';
+ RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE TRIGGER trig_err BEFORE INSERT ON trig_err_tab
+ FOR EACH ROW EXECUTE FUNCTION trig_err_func();
+
+-- Measure how many local buffer hits a scan of trig_work_tab produces
+SELECT test_session_buffer_usage_reset();
+SELECT count(*) FROM trig_work_tab;
+
+CREATE TEMP TABLE trig_serial_result AS
+SELECT local_blks_hit AS serial_hits FROM test_session_buffer_usage();
+
+-- Now trigger the same scan via a trigger that errors
+SELECT test_session_buffer_usage_reset();
+EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ INSERT INTO trig_err_tab VALUES (1);
+
+-- The trigger scanned trig_work_tab but errored before InstrStopTrigger ran.
+-- InstrStopFinalize in the PG_CATCH ensures buffer data is still propagated.
+SELECT local_blks_hit >= s.serial_hits / 2
+ AS trigger_abort_buffers_propagated
+FROM test_session_buffer_usage(), trig_serial_result s;
+
+DROP TABLE trig_err_tab, trig_work_tab, trig_serial_result;
+DROP FUNCTION trig_err_func;
+
+-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort.
+--
+-- When a parallel worker aborts, InstrEndParallelQuery and
+-- ExecParallelReportInstrumentation never run, so the worker's buffer
+-- activity is never written to shared memory, despite the information having been
+-- captured by the res owner release instrumentation handling.
+CREATE TABLE par_abort_tab (a int, b char(200));
+INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+
+-- Warm shared buffers so all reads become hits
+SELECT count(*) FROM par_abort_tab;
+
+-- Measure serial scan delta as a reference (leader reads all blocks)
+SET max_parallel_workers_per_gather = 0;
+
+SELECT test_session_buffer_usage_reset();
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+
+CREATE TABLE par_abort_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM test_session_buffer_usage();
+
+-- Now force parallel with leader NOT participating in scanning
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently
+
+SELECT test_session_buffer_usage_reset();
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+
+RESET debug_parallel_query;
+
+-- Workers scanned the table but aborted before reporting stats back.
+-- The leader's delta should be much less than a serial scan, documenting
+-- that worker buffer activity is lost on abort.
+SELECT shared_blks_hit < s.serial_delta / 2
+ AS worker_abort_buffers_not_propagated
+FROM test_session_buffer_usage(), par_abort_serial_result s;
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+
+DROP TABLE par_abort_tab, par_abort_serial_result;
+
+-- Cleanup
+DROP EXTENSION test_session_buffer_usage;
diff --git a/src/test/modules/test_session_buffer_usage/test_session_buffer_usage--1.0.sql b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage--1.0.sql
new file mode 100644
index 0000000000000..e9833be470ae5
--- /dev/null
+++ b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage--1.0.sql
@@ -0,0 +1,31 @@
+/* src/test/modules/test_session_buffer_usage/test_session_buffer_usage--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_session_buffer_usage" to load this file. \quit
+
+CREATE FUNCTION test_session_buffer_usage(
+ OUT shared_blks_hit bigint,
+ OUT shared_blks_read bigint,
+ OUT shared_blks_dirtied bigint,
+ OUT shared_blks_written bigint,
+ OUT local_blks_hit bigint,
+ OUT local_blks_read bigint,
+ OUT local_blks_dirtied bigint,
+ OUT local_blks_written bigint,
+ OUT temp_blks_read bigint,
+ OUT temp_blks_written bigint,
+ OUT shared_blk_read_time double precision,
+ OUT shared_blk_write_time double precision,
+ OUT local_blk_read_time double precision,
+ OUT local_blk_write_time double precision,
+ OUT temp_blk_read_time double precision,
+ OUT temp_blk_write_time double precision
+)
+RETURNS record
+AS 'MODULE_PATHNAME', 'test_session_buffer_usage'
+LANGUAGE C PARALLEL RESTRICTED;
+
+CREATE FUNCTION test_session_buffer_usage_reset()
+RETURNS void
+AS 'MODULE_PATHNAME', 'test_session_buffer_usage_reset'
+LANGUAGE C PARALLEL RESTRICTED;
diff --git a/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.c b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.c
new file mode 100644
index 0000000000000..50eb1a2ffe621
--- /dev/null
+++ b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.c
@@ -0,0 +1,95 @@
+/*-------------------------------------------------------------------------
+ *
+ * test_session_buffer_usage.c
+ * show buffer usage statistics for the current session
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ * src/test/modules/test_session_buffer_usage/test_session_buffer_usage.c
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "executor/instrument.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
+
+PG_MODULE_MAGIC_EXT(
+ .name = "test_session_buffer_usage",
+ .version = PG_VERSION
+);
+
+#define NUM_BUFFER_USAGE_COLUMNS 16
+
+PG_FUNCTION_INFO_V1(test_session_buffer_usage);
+PG_FUNCTION_INFO_V1(test_session_buffer_usage_reset);
+
+#define HAVE_INSTR_STACK 1 /* Change to 0 when testing before stack
+ * change */
+
+/*
+ * SQL function: test_session_buffer_usage()
+ *
+ * Returns a single row with all BufferUsage counters accumulated since the
+ * start of the session. Excludes any usage not yet added to the top of the
+ * stack (e.g. if this gets called inside a statement that also had buffer
+ * activity).
+ */
+Datum
+test_session_buffer_usage(PG_FUNCTION_ARGS)
+{
+ TupleDesc tupdesc;
+ Datum values[NUM_BUFFER_USAGE_COLUMNS];
+ bool nulls[NUM_BUFFER_USAGE_COLUMNS];
+ BufferUsage *usage;
+
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ memset(nulls, 0, sizeof(nulls));
+
+#if HAVE_INSTR_STACK
+ usage = &instr_top.bufusage;
+#else
+ usage = &pgBufferUsage;
+#endif
+
+ values[0] = Int64GetDatum(usage->shared_blks_hit);
+ values[1] = Int64GetDatum(usage->shared_blks_read);
+ values[2] = Int64GetDatum(usage->shared_blks_dirtied);
+ values[3] = Int64GetDatum(usage->shared_blks_written);
+ values[4] = Int64GetDatum(usage->local_blks_hit);
+ values[5] = Int64GetDatum(usage->local_blks_read);
+ values[6] = Int64GetDatum(usage->local_blks_dirtied);
+ values[7] = Int64GetDatum(usage->local_blks_written);
+ values[8] = Int64GetDatum(usage->temp_blks_read);
+ values[9] = Int64GetDatum(usage->temp_blks_written);
+ values[10] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time));
+ values[11] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time));
+ values[12] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time));
+ values[13] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time));
+ values[14] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time));
+ values[15] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time));
+
+ PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
+}
+
+/*
+ * SQL function: test_session_buffer_usage_reset()
+ *
+ * Resets all BufferUsage counters on the top instrumentation stack to zero.
+ * Useful in tests to avoid the baseline/delta pattern.
+ */
+Datum
+test_session_buffer_usage_reset(PG_FUNCTION_ARGS)
+{
+#if HAVE_INSTR_STACK
+ memset(&instr_top.bufusage, 0, sizeof(BufferUsage));
+#else
+ memset(&pgBufferUsage, 0, sizeof(BufferUsage));
+#endif
+
+ PG_RETURN_VOID();
+}
diff --git a/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.control b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.control
new file mode 100644
index 0000000000000..41cfb15a7650a
--- /dev/null
+++ b/src/test/modules/test_session_buffer_usage/test_session_buffer_usage.control
@@ -0,0 +1,5 @@
+# test_session_buffer_usage extension
+comment = 'show buffer usage statistics for the current session'
+default_version = '1.0'
+module_pathname = '$libdir/test_session_buffer_usage'
+relocatable = true
diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out
index 7c1f26b182cb0..f630acd5f54fc 100644
--- a/src/test/regress/expected/explain.out
+++ b/src/test/regress/expected/explain.out
@@ -822,3 +822,298 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove
(9 rows)
reset work_mem;
+-- Test parallel bitmap heap scan reports per-worker heap block stats.
+CREATE FUNCTION check_parallel_bitmap_heap_scan() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+BEGIN
+ SET LOCAL enable_seqscan = off;
+ SET LOCAL enable_indexscan = off;
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL min_parallel_index_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1 WHERE hundred > 1' INTO plan_json;
+
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL AND node->>'Node Type' != 'Bitmap Heap Scan' LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+
+ RETURN COALESCE((node->>'Exact Heap Blocks')::int, 0) > 0;
+END;
+$$ LANGUAGE plpgsql;
+SELECT check_parallel_bitmap_heap_scan() AS parallel_bitmap_instrumentation;
+ parallel_bitmap_instrumentation
+---------------------------------
+ t
+(1 row)
+
+DROP FUNCTION check_parallel_bitmap_heap_scan;
+-- Test parallel index-only scan reports per-worker index search stats.
+CREATE FUNCTION check_parallel_indexonly_scan() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+BEGIN
+ SET LOCAL enable_seqscan = off;
+ SET LOCAL enable_bitmapscan = off;
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_index_scan_size = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1 WHERE thousand > 95' INTO plan_json;
+
+ -- Drill down to the Index Only Scan node
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL AND node->>'Node Type' != 'Index Only Scan' LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+
+ RETURN COALESCE((node->>'Index Searches')::int, 0) > 0;
+END;
+$$ LANGUAGE plpgsql;
+SELECT check_parallel_indexonly_scan() AS parallel_indexonly_instrumentation;
+ parallel_indexonly_instrumentation
+------------------------------------
+ t
+(1 row)
+
+DROP FUNCTION check_parallel_indexonly_scan;
+-- Test parallel query reports similar buffer stats to a serial run
+CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$
+DECLARE
+ plan_json json;
+ serial_buffers int;
+ parallel_buffers int;
+ node json;
+BEGIN
+ -- Serial --
+ SET LOCAL max_parallel_workers_per_gather = 0;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ serial_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ -- Parallel --
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ parallel_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1));
+END;
+$$ LANGUAGE plpgsql;
+SELECT * FROM check_parallel_explain_buffers();
+ ratio
+-------
+ 1
+(1 row)
+
+DROP FUNCTION check_parallel_explain_buffers;
+-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL
+-- EXCEPTION blocks, even after subtransaction rollback.
+CREATE TEMP TABLE explain_exc_tab (a int, b char(20));
+INSERT INTO explain_exc_tab VALUES (0, 'zzz');
+CREATE FUNCTION explain_exc_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT explain_exc_func()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+SELECT check_explain_exception_buffers() AS exception_buffers_visible;
+ exception_buffers_visible
+---------------------------
+ t
+(1 row)
+
+-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation)
+CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT check_explain_exception_buffers()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible;
+ exception_buffers_nested_visible
+----------------------------------
+ t
+(1 row)
+
+DROP FUNCTION check_explain_exception_buffers_nested;
+DROP FUNCTION check_explain_exception_buffers;
+DROP FUNCTION explain_exc_func;
+DROP TABLE explain_exc_tab;
+-- Cursor instrumentation test.
+-- Verify that buffer usage is correctly tracked through cursor execution paths.
+-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun
+-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down
+-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor
+-- scan should be correctly reported.
+CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1;
+CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$
+DECLARE
+ plan_json json;
+ node json;
+ direct_buf int;
+ noscroll_buf int;
+ scroll_buf int;
+BEGIN
+ -- Direct scan: get leaf Seq Scan node buffers as baseline
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT * FROM cursor_buf_test' INTO plan_json;
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+ direct_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_noscroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ noscroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Scrollable cursor path: ExecShutdownNode is skipped
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_scroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ scroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Both cursor paths should report buffer counts about as high as
+ -- the direct scan (same data plus minor catalog overhead), and not
+ -- double-counted (< 2x the direct scan)
+ RETURN QUERY SELECT
+ (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2),
+ (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2);
+END;
+$$ LANGUAGE plpgsql;
+SELECT * FROM check_cursor_explain_buffers();
+ noscroll_ok | scroll_ok
+-------------+-----------
+ t | t
+(1 row)
+
+DROP FUNCTION check_cursor_explain_buffers;
+DROP FUNCTION cursor_noscroll_scan;
+DROP FUNCTION cursor_scroll_scan;
+DROP TABLE cursor_buf_test;
+-- Test trigger instrumentation.
+CREATE TEMP TABLE trig_test_tab (a int);
+CREATE TEMP TABLE trig_work_tab (a int);
+INSERT INTO trig_work_tab VALUES (1);
+CREATE FUNCTION trig_test_func() RETURNS trigger AS $$
+BEGIN
+ PERFORM * FROM trig_work_tab;
+ RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+CREATE TRIGGER trig_test_trig
+ BEFORE INSERT ON trig_test_tab
+ FOR EACH ROW EXECUTE FUNCTION trig_test_func();
+CREATE FUNCTION check_trigger_explain_buffers() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ trig json;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ INSERT INTO trig_test_tab VALUES (1)' INTO plan_json;
+ trig := plan_json->0->'Triggers'->0;
+ RETURN COALESCE((trig->>'Calls')::int, 0) > 0;
+END;
+$$ LANGUAGE plpgsql;
+SELECT check_trigger_explain_buffers() AS trigger_buffers_visible;
+ trigger_buffers_visible
+-------------------------
+ t
+(1 row)
+
+DROP FUNCTION check_trigger_explain_buffers;
+DROP TRIGGER trig_test_trig ON trig_test_tab;
+DROP FUNCTION trig_test_func;
+DROP TABLE trig_test_tab;
+DROP TABLE trig_work_tab;
diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql
index ebdab42604beb..74f605739f156 100644
--- a/src/test/regress/sql/explain.sql
+++ b/src/test/regress/sql/explain.sql
@@ -188,3 +188,292 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove
-- Test tuplestore storage usage in Window aggregate (memory and disk case, final result is disk)
select explain_filter('explain (analyze,buffers off,costs off) select sum(n) over(partition by m) from (SELECT n < 3 as m, n from generate_series(1,2500) a(n))');
reset work_mem;
+
+-- Test parallel bitmap heap scan reports per-worker heap block stats.
+CREATE FUNCTION check_parallel_bitmap_heap_scan() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+BEGIN
+ SET LOCAL enable_seqscan = off;
+ SET LOCAL enable_indexscan = off;
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL min_parallel_index_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1 WHERE hundred > 1' INTO plan_json;
+
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL AND node->>'Node Type' != 'Bitmap Heap Scan' LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+
+ RETURN COALESCE((node->>'Exact Heap Blocks')::int, 0) > 0;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT check_parallel_bitmap_heap_scan() AS parallel_bitmap_instrumentation;
+
+DROP FUNCTION check_parallel_bitmap_heap_scan;
+
+-- Test parallel index-only scan reports per-worker index search stats.
+CREATE FUNCTION check_parallel_indexonly_scan() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+BEGIN
+ SET LOCAL enable_seqscan = off;
+ SET LOCAL enable_bitmapscan = off;
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_index_scan_size = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1 WHERE thousand > 95' INTO plan_json;
+
+ -- Drill down to the Index Only Scan node
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL AND node->>'Node Type' != 'Index Only Scan' LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+
+ RETURN COALESCE((node->>'Index Searches')::int, 0) > 0;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT check_parallel_indexonly_scan() AS parallel_indexonly_instrumentation;
+
+DROP FUNCTION check_parallel_indexonly_scan;
+
+-- Test parallel query reports similar buffer stats to a serial run
+CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$
+DECLARE
+ plan_json json;
+ serial_buffers int;
+ parallel_buffers int;
+ node json;
+BEGIN
+ -- Serial --
+ SET LOCAL max_parallel_workers_per_gather = 0;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ serial_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ -- Parallel --
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ parallel_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1));
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT * FROM check_parallel_explain_buffers();
+
+DROP FUNCTION check_parallel_explain_buffers;
+
+-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL
+-- EXCEPTION blocks, even after subtransaction rollback.
+CREATE TEMP TABLE explain_exc_tab (a int, b char(20));
+INSERT INTO explain_exc_tab VALUES (0, 'zzz');
+
+CREATE FUNCTION explain_exc_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT explain_exc_func()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT check_explain_exception_buffers() AS exception_buffers_visible;
+
+-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation)
+CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT check_explain_exception_buffers()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible;
+
+DROP FUNCTION check_explain_exception_buffers_nested;
+DROP FUNCTION check_explain_exception_buffers;
+DROP FUNCTION explain_exc_func;
+DROP TABLE explain_exc_tab;
+
+-- Cursor instrumentation test.
+-- Verify that buffer usage is correctly tracked through cursor execution paths.
+-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun
+-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down
+-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor
+-- scan should be correctly reported.
+
+CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1;
+
+CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$
+DECLARE
+ plan_json json;
+ node json;
+ direct_buf int;
+ noscroll_buf int;
+ scroll_buf int;
+BEGIN
+ -- Direct scan: get leaf Seq Scan node buffers as baseline
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT * FROM cursor_buf_test' INTO plan_json;
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+ direct_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_noscroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ noscroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Scrollable cursor path: ExecShutdownNode is skipped
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_scroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ scroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Both cursor paths should report buffer counts about as high as
+ -- the direct scan (same data plus minor catalog overhead), and not
+ -- double-counted (< 2x the direct scan)
+ RETURN QUERY SELECT
+ (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2),
+ (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2);
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT * FROM check_cursor_explain_buffers();
+
+DROP FUNCTION check_cursor_explain_buffers;
+DROP FUNCTION cursor_noscroll_scan;
+DROP FUNCTION cursor_scroll_scan;
+DROP TABLE cursor_buf_test;
+
+-- Test trigger instrumentation.
+CREATE TEMP TABLE trig_test_tab (a int);
+CREATE TEMP TABLE trig_work_tab (a int);
+INSERT INTO trig_work_tab VALUES (1);
+
+CREATE FUNCTION trig_test_func() RETURNS trigger AS $$
+BEGIN
+ PERFORM * FROM trig_work_tab;
+ RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE TRIGGER trig_test_trig
+ BEFORE INSERT ON trig_test_tab
+ FOR EACH ROW EXECUTE FUNCTION trig_test_func();
+
+CREATE FUNCTION check_trigger_explain_buffers() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ trig json;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ INSERT INTO trig_test_tab VALUES (1)' INTO plan_json;
+ trig := plan_json->0->'Triggers'->0;
+ RETURN COALESCE((trig->>'Calls')::int, 0) > 0;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT check_trigger_explain_buffers() AS trigger_buffers_visible;
+
+DROP FUNCTION check_trigger_explain_buffers;
+DROP TRIGGER trig_test_trig ON trig_test_tab;
+DROP FUNCTION trig_test_func;
+DROP TABLE trig_test_tab;
+DROP TABLE trig_work_tab;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index c72f6c595730a..7393926e34d97 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1355,6 +1355,7 @@ InjectionPointSharedState
InjectionPointsCtl
InlineCodeBlock
InsertStmt
+InstrStackState
Instrumentation
Int128AggState
Int8TransTypeData
@@ -1822,6 +1823,7 @@ NextSampleBlock_function
NextSampleTuple_function
NextValueExpr
Node
+NodeInstrumentation
NodeTag
NonEmptyRange
NoneCompressorState
@@ -2476,6 +2478,7 @@ QueryCompletion
QueryDesc
QueryEnvironment
QueryInfo
+QueryInstrumentation
QueryItem
QueryItemType
QueryMode
@@ -3213,6 +3216,7 @@ TriggerDesc
TriggerEvent
TriggerFlags
TriggerInfo
+TriggerInstrumentation
TriggerTransition
TruncateStmt
TsmRoutine
@@ -3435,9 +3439,9 @@ WorkTableScan
WorkTableScanState
WorkerInfo
WorkerInfoData
-WorkerInstrumentation
WorkerJobDumpPtrType
WorkerJobRestorePtrType
+WorkerNodeInstrumentation
Working_State
WriteBufPtrType
WriteBytePtrType