From 9e8193a26229a2a578619fb3ee0687761aff2ea1 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 9 Mar 2026 07:15:26 +0900 Subject: [PATCH 01/32] Fix typo in stats_import.sql The test mentioned pg_stat_ext_exprs, but the correct catalog name is pg_stats_ext_exprs. Thinko in ba97bf9cb7b4. Discussion: https://postgr.es/m/CADkLM=eEhxJpSUP+eC=eMGZZsVOpnfKDvVkuCbsFg9CajYwDsA@mail.gmail.com --- src/test/regress/expected/stats_import.out | 2 +- src/test/regress/sql/stats_import.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/regress/expected/stats_import.out b/src/test/regress/expected/stats_import.out index 1f24e306f5b60..00a46353041c1 100644 --- a/src/test/regress/expected/stats_import.out +++ b/src/test/regress/expected/stats_import.out @@ -3421,7 +3421,7 @@ SELECT o.inherited, (0 rows) -- range_length_histogram, range_empty_frac, and range_bounds_histogram --- have been added to pg_stat_ext_exprs in PostgreSQL 19. When dumping +-- have been added to pg_stats_ext_exprs in PostgreSQL 19. When dumping -- expression statistics in a cluster with an older version, these fields -- are dumped as NULL, pg_restore_extended_stats() authorizing the partial -- restore state of the extended statistics data. This test emulates such diff --git a/src/test/regress/sql/stats_import.sql b/src/test/regress/sql/stats_import.sql index 61535a971dc1d..57363ab374843 100644 --- a/src/test/regress/sql/stats_import.sql +++ b/src/test/regress/sql/stats_import.sql @@ -2394,7 +2394,7 @@ SELECT o.inherited, o.statistics_name = 'test_mr_stat'; -- range_length_histogram, range_empty_frac, and range_bounds_histogram --- have been added to pg_stat_ext_exprs in PostgreSQL 19. When dumping +-- have been added to pg_stats_ext_exprs in PostgreSQL 19. When dumping -- expression statistics in a cluster with an older version, these fields -- are dumped as NULL, pg_restore_extended_stats() authorizing the partial -- restore state of the extended statistics data. 
This test emulates such From ccd7abaa456f34a41efe2e5198f0bb2227aa43d9 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 9 Mar 2026 08:46:06 +0900 Subject: [PATCH 02/32] Refactor tests for catalog diff comparisons in stats_import.sql The tests of stats_import.sql include a set of queries to do differential checks of the three statistics catalog relations, based on the comparison of a source relation and a target relation, used for the copy of the stats data with the restore functions: - pg_statistic - pg_stats_ext - pg_stats_ext_exprs This commit refactors the tests to reduce the bloat of such differential queries, by creating a set of objects that make the differential queries smaller: - View for a base relation type. - First function to retrieve stats data, that returns a type based on the view previously created. - Second function that checks the difference, based on two calls of the first function. This change leads to a nice reduction of stats_import.sql, with a larger effect on the output file. While on it, this adds some sanity checks for the three catalogs, to warn developers that the stats import facilities may need to be updated if any of the three catalogs change. These are rare in practice, see 918eee0c497c as one example. Another stylistic change is the use of the extended output format for the differential queries, so as we avoid long lines of output if a diff is caught. 
Author: Corey Huinker Discussion: https://postgr.es/m/CADkLM=eEhxJpSUP+eC=eMGZZsVOpnfKDvVkuCbsFg9CajYwDsA@mail.gmail.com --- src/test/regress/expected/stats_import.out | 501 ++++++++------------- src/test/regress/sql/stats_import.sql | 461 +++++++------------ 2 files changed, 357 insertions(+), 605 deletions(-) diff --git a/src/test/regress/expected/stats_import.out b/src/test/regress/expected/stats_import.out index 00a46353041c1..c7adb783da211 100644 --- a/src/test/regress/expected/stats_import.out +++ b/src/test/regress/expected/stats_import.out @@ -1,4 +1,158 @@ CREATE SCHEMA stats_import; +-- +-- Setup functions for set-difference convenience functions +-- +-- Test to detect any new columns added to pg_statistic. If any columns +-- are added, we may need to update pg_statistic_flat() and the facilities +-- we are testing. +SELECT COUNT(*) FROM pg_attribute + WHERE attrelid = 'pg_catalog.pg_statistic'::regclass AND + attnum > 0; + count +------- + 31 +(1 row) + +-- Create a view that is used purely for the type based on pg_statistic. +CREATE VIEW stats_import.pg_statistic_flat_t AS + SELECT + a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, + s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, + s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, + s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, + s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, + s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, + s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, + s.stavalues5::text AS sv5 + FROM pg_statistic s + JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum + WHERE FALSE; +-- Function to retrieve data used for diff comparisons between two +-- relations based on the contents of pg_statistic. 
+CREATE FUNCTION stats_import.pg_statistic_flat(p_relname text) +RETURNS SETOF stats_import.pg_statistic_flat_t +BEGIN ATOMIC + SELECT a.attname, s.stainherit, s.stanullfrac, s.stawidth, + s.stadistinct, s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, + s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, s.stacoll1, s.stacoll2, + s.stacoll3, s.stacoll4, s.stacoll5, s.stanumbers1, s.stanumbers2, + s.stanumbers3, s.stanumbers4, s.stanumbers5, s.stavalues1::text, + s.stavalues2::text, s.stavalues3::text, + s.stavalues4::text, s.stavalues5::text + FROM pg_statistic s + JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum + JOIN pg_class c ON c.oid = a.attrelid + WHERE c.relnamespace = 'stats_import'::regnamespace + AND c.relname = p_relname; +END; +-- Comparison function for pg_statistic. The two relations defined by +-- the function caller are compared. +CREATE FUNCTION stats_import.pg_statistic_get_difference(a text, b text) +RETURNS TABLE (relname text, stats stats_import.pg_statistic_flat_t) +BEGIN ATOMIC + WITH aset AS (SELECT * FROM stats_import.pg_statistic_flat(a)), + bset AS (SELECT * FROM stats_import.pg_statistic_flat(b)) + SELECT a AS relname, a_minus_b::stats_import.pg_statistic_flat_t + FROM (TABLE aset EXCEPT TABLE bset) AS a_minus_b + UNION ALL + SELECT b AS relname, b_minus_a::stats_import.pg_statistic_flat_t + FROM (TABLE bset EXCEPT TABLE aset) AS b_minus_a; +END; +-- Test to detect any new columns added to pg_stats_ext. If any columns +-- are added, we may need to update pg_stats_ext_flat() and the facilities +-- we are testing. +SELECT COUNT(*) FROM pg_attribute + WHERE attrelid = 'pg_catalog.pg_stats_ext'::regclass AND + attnum > 0; + count +------- + 15 +(1 row) + +-- Create a view that is used purely for the type based on pg_stats_ext. 
+CREATE VIEW stats_import.pg_stats_ext_flat_t AS + SELECT inherited, n_distinct, dependencies, most_common_vals, + most_common_freqs, most_common_base_freqs + FROM pg_stats_ext + WHERE FALSE; +-- Function to retrieve data used for diff comparisons between two +-- relations based on the contents of pg_stats_ext. +CREATE FUNCTION stats_import.pg_stats_ext_flat(p_statname text) +RETURNS SETOF stats_import.pg_stats_ext_flat_t +BEGIN ATOMIC + SELECT inherited, n_distinct, dependencies, most_common_vals, + most_common_freqs, most_common_base_freqs + FROM pg_stats_ext + WHERE statistics_schemaname = 'stats_import' + AND statistics_name = p_statname; +END; +-- Comparison function for pg_stats_ext. The two relations defined by +-- the function caller are compared. +CREATE FUNCTION stats_import.pg_stats_ext_get_difference(a text, b text) +RETURNS TABLE (statname text, stats stats_import.pg_stats_ext_flat_t) +BEGIN ATOMIC + WITH aset AS (SELECT * FROM stats_import.pg_stats_ext_flat(a)), + bset AS (SELECT * FROM stats_import.pg_stats_ext_flat(b)) + SELECT a AS relname, a_minus_b::stats_import.pg_stats_ext_flat_t + FROM (TABLE aset EXCEPT TABLE bset) AS a_minus_b + UNION ALL + SELECT b AS relname, b_minus_a::stats_import.pg_stats_ext_flat_t + FROM (TABLE bset EXCEPT TABLE aset) AS b_minus_a; +END; +-- Test to detect any new columns added to pg_stats_ext_exprs. If any columns +-- are added, we may need to update pg_stats_ext_exprs_flat() and the facilities +-- we are testing. +SELECT COUNT(*) FROM pg_attribute + WHERE attrelid = 'pg_catalog.pg_stats_ext_exprs'::regclass AND + attnum > 0; + count +------- + 20 +(1 row) + +-- Create a view that is used purely for the type based on pg_stats_ext_exprs. 
+CREATE VIEW stats_import.pg_stats_ext_exprs_flat_t AS + SELECT inherited, null_frac, avg_width, n_distinct, + most_common_vals::text AS most_common_vals, + most_common_freqs, histogram_bounds::text AS histogram_bounds, + correlation, most_common_elems::text AS most_common_elems, + most_common_elem_freqs, elem_count_histogram, + range_length_histogram::text AS range_length_histogram, + range_empty_frac, range_bounds_histogram::text AS range_bounds_histogram + FROM pg_stats_ext_exprs AS n + WHERE FALSE; +-- Function to retrieve data used for diff comparisons between two +-- relations based on the contents of pg_stats_ext_exprs. +CREATE FUNCTION stats_import.pg_stats_ext_exprs_flat(p_statname text) +RETURNS SETOF stats_import.pg_stats_ext_exprs_flat_t +BEGIN ATOMIC + SELECT inherited, null_frac, avg_width, n_distinct, + most_common_vals::text AS most_common_vals, + most_common_freqs, histogram_bounds::text AS histogram_bounds, + correlation, most_common_elems::text AS most_common_elems, + most_common_elem_freqs, elem_count_histogram, + range_length_histogram::text AS range_length_histogram, + range_empty_frac, range_bounds_histogram::text AS range_bounds_histogram + FROM pg_stats_ext_exprs AS n + WHERE n.statistics_schemaname = 'stats_import' AND + n.statistics_name = p_statname; +END; +-- Comparison function for pg_stats_ext_exprs. The two relations defined by +-- the function caller are compared. 
+CREATE FUNCTION stats_import.pg_stats_ext_exprs_get_difference(a text, b text) +RETURNS TABLE (statname text, stats stats_import.pg_stats_ext_exprs_flat_t) +BEGIN ATOMIC + WITH aset AS (SELECT * FROM stats_import.pg_stats_ext_exprs_flat(a)), + bset AS (SELECT * FROM stats_import.pg_stats_ext_exprs_flat(b)) + SELECT a AS relname, a_minus_b::stats_import.pg_stats_ext_exprs_flat_t + FROM (TABLE aset EXCEPT TABLE bset) AS a_minus_b + UNION ALL + SELECT b AS relname, b_minus_a::stats_import.pg_stats_ext_exprs_flat_t + FROM (TABLE bset EXCEPT TABLE aset) AS b_minus_a; +END; +-- +-- Schema setup. +-- CREATE TYPE stats_import.complex_type AS ( a integer, b real, @@ -1220,124 +1374,14 @@ ORDER BY c.relname; test_clone | 5 (4 rows) --- check test minus test_clone -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'test' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.test'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'test' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum 
-WHERE s.starelid = 'stats_import.test_clone'::regclass; - attname | stainherit | stanullfrac | stawidth | stadistinct | stakind1 | stakind2 | stakind3 | stakind4 | stakind5 | staop1 | staop2 | staop3 | staop4 | staop5 | stacoll1 | stacoll2 | stacoll3 | stacoll4 | stacoll5 | stanumbers1 | stanumbers2 | stanumbers3 | stanumbers4 | stanumbers5 | sv1 | sv2 | sv3 | sv4 | sv5 | direction ----------+------------+-------------+----------+-------------+----------+----------+----------+----------+----------+--------+--------+--------+--------+--------+----------+----------+----------+----------+----------+-------------+-------------+-------------+-------------+-------------+-----+-----+-----+-----+-----+----------- -(0 rows) - --- check test_clone minus test -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'test_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.test_clone'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'test_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = 
s.staattnum -WHERE s.starelid = 'stats_import.test'::regclass; - attname | stainherit | stanullfrac | stawidth | stadistinct | stakind1 | stakind2 | stakind3 | stakind4 | stakind5 | staop1 | staop2 | staop3 | staop4 | staop5 | stacoll1 | stacoll2 | stacoll3 | stacoll4 | stacoll5 | stanumbers1 | stanumbers2 | stanumbers3 | stanumbers4 | stanumbers5 | sv1 | sv2 | sv3 | sv4 | sv5 | direction ----------+------------+-------------+----------+-------------+----------+----------+----------+----------+----------+--------+--------+--------+--------+--------+----------+----------+----------+----------+----------+-------------+-------------+-------------+-------------+-------------+-----+-----+-----+-----+-----+----------- -(0 rows) - --- check is_odd minus is_odd_clone -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'is_odd' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.is_odd'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'is_odd' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = 
s.staattnum -WHERE s.starelid = 'stats_import.is_odd_clone'::regclass; - attname | stainherit | stanullfrac | stawidth | stadistinct | stakind1 | stakind2 | stakind3 | stakind4 | stakind5 | staop1 | staop2 | staop3 | staop4 | staop5 | stacoll1 | stacoll2 | stacoll3 | stacoll4 | stacoll5 | stanumbers1 | stanumbers2 | stanumbers3 | stanumbers4 | stanumbers5 | sv1 | sv2 | sv3 | sv4 | sv5 | direction ----------+------------+-------------+----------+-------------+----------+----------+----------+----------+----------+--------+--------+--------+--------+--------+----------+----------+----------+----------+----------+-------------+-------------+-------------+-------------+-------------+-----+-----+-----+-----+-----+----------- +SELECT relname, (stats).* +FROM stats_import.pg_statistic_get_difference('test', 'test_clone') +\gx (0 rows) --- check is_odd_clone minus is_odd -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'is_odd_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.is_odd_clone'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text 
AS sv5, 'is_odd_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.is_odd'::regclass; - attname | stainherit | stanullfrac | stawidth | stadistinct | stakind1 | stakind2 | stakind3 | stakind4 | stakind5 | staop1 | staop2 | staop3 | staop4 | staop5 | stacoll1 | stacoll2 | stacoll3 | stacoll4 | stacoll5 | stanumbers1 | stanumbers2 | stanumbers3 | stanumbers4 | stanumbers5 | sv1 | sv2 | sv3 | sv4 | sv5 | direction ----------+------------+-------------+----------+-------------+----------+----------+----------+----------+----------+--------+--------+--------+--------+--------+----------+----------+----------+----------+----------+-------------+-------------+-------------+-------------+-------------+-----+-----+-----+-----+-----+----------- +SELECT relname, (stats).* +FROM stats_import.pg_statistic_get_difference('is_odd', 'is_odd_clone') +\gx (0 rows) -- attribute stats exist before a clear, but not after @@ -3169,108 +3213,14 @@ AND e.statistics_name = 'test_stat'; test_stat | t (1 row) --- Set difference old MINUS new. -SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat' -EXCEPT -SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone'; - inherited | n_distinct | dependencies | most_common_vals | most_common_freqs | most_common_base_freqs ------------+------------+--------------+------------------+-------------------+------------------------ -(0 rows) - --- Set difference new MINUS old. 
-SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone' -EXCEPT -SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat'; - inherited | n_distinct | dependencies | most_common_vals | most_common_freqs | most_common_base_freqs ------------+------------+--------------+------------------+-------------------+------------------------ -(0 rows) - --- Set difference for exprs: old MINUS new. -SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat' -EXCEPT -SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone'; - inherited | null_frac | avg_width | n_distinct | most_common_vals | most_common_freqs | histogram_bounds | 
correlation | most_common_elems | most_common_elem_freqs | elem_count_histogram | range_length_histogram | range_empty_frac | range_bounds_histogram ------------+-----------+-----------+------------+------------------+-------------------+------------------+-------------+-------------------+------------------------+----------------------+------------------------+------------------+------------------------ +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_get_difference('test_stat', 'test_stat_clone') +\gx (0 rows) --- Set difference for exprs: new MINUS old. -SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone' -EXCEPT -SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat'; - inherited | null_frac | avg_width | n_distinct | most_common_vals | most_common_freqs | histogram_bounds | correlation | most_common_elems | most_common_elem_freqs | elem_count_histogram | range_length_histogram | range_empty_frac | range_bounds_histogram 
------------+-----------+-----------+------------+------------------+-------------------+------------------+-------------+-------------------+------------------------+----------------------+------------------------+------------------+------------------------ +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_exprs_get_difference('test_stat', 'test_stat_clone') +\gx (0 rows) ANALYZE stats_import.test_mr; @@ -3316,108 +3266,14 @@ AND e.statistics_name = 'test_mr_stat'; test_mr_stat | t (1 row) --- Set difference old MINUS new. -SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat' -EXCEPT -SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone'; - inherited | n_distinct | dependencies | most_common_vals | most_common_freqs | most_common_base_freqs ------------+------------+--------------+------------------+-------------------+------------------------ -(0 rows) - --- Set difference new MINUS old. 
-SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone' -EXCEPT -SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat'; - inherited | n_distinct | dependencies | most_common_vals | most_common_freqs | most_common_base_freqs ------------+------------+--------------+------------------+-------------------+------------------------ -(0 rows) - --- Set difference for exprs: old MINUS new. -SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat' -EXCEPT -SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone'; - inherited | null_frac | avg_width | n_distinct | most_common_vals | most_common_freqs | 
histogram_bounds | correlation | most_common_elems | most_common_elem_freqs | elem_count_histogram | range_length_histogram | range_empty_frac | range_bounds_histogram ------------+-----------+-----------+------------+------------------+-------------------+------------------+-------------+-------------------+------------------------+----------------------+------------------------+------------------+------------------------ +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_get_difference('test_mr_stat', 'test_mr_stat_clone') +\gx (0 rows) --- Set difference for exprs: new MINUS old. -SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone' -EXCEPT -SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat'; - inherited | null_frac | avg_width | n_distinct | most_common_vals | most_common_freqs | histogram_bounds | correlation | most_common_elems | most_common_elem_freqs | elem_count_histogram | range_length_histogram | range_empty_frac | range_bounds_histogram 
------------+-----------+-----------+------------+------------------+-------------------+------------------+-------------+-------------------+------------------------+----------------------+------------------------+------------------+------------------------ +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_exprs_get_difference('test_mr_stat', 'test_mr_stat_clone') +\gx (0 rows) -- range_length_histogram, range_empty_frac, and range_bounds_histogram @@ -3506,8 +3362,17 @@ SELECT COUNT(*) FROM stats_import.test_range_expr_null (1 row) DROP SCHEMA stats_import CASCADE; -NOTICE: drop cascades to 9 other objects -DETAIL: drop cascades to type stats_import.complex_type +NOTICE: drop cascades to 18 other objects +DETAIL: drop cascades to view stats_import.pg_statistic_flat_t +drop cascades to function stats_import.pg_statistic_flat(text) +drop cascades to function stats_import.pg_statistic_get_difference(text,text) +drop cascades to view stats_import.pg_stats_ext_flat_t +drop cascades to function stats_import.pg_stats_ext_flat(text) +drop cascades to function stats_import.pg_stats_ext_get_difference(text,text) +drop cascades to view stats_import.pg_stats_ext_exprs_flat_t +drop cascades to function stats_import.pg_stats_ext_exprs_flat(text) +drop cascades to function stats_import.pg_stats_ext_exprs_get_difference(text,text) +drop cascades to type stats_import.complex_type drop cascades to table stats_import.test drop cascades to table stats_import.test_mr drop cascades to table stats_import.part_parent diff --git a/src/test/regress/sql/stats_import.sql b/src/test/regress/sql/stats_import.sql index 57363ab374843..0518bbf6f4256 100644 --- a/src/test/regress/sql/stats_import.sql +++ b/src/test/regress/sql/stats_import.sql @@ -1,5 +1,157 @@ CREATE SCHEMA stats_import; +-- +-- Setup functions for set-difference convenience functions +-- + +-- Test to detect any new columns added to pg_statistic. 
If any columns +-- are added, we may need to update pg_statistic_flat() and the facilities +-- we are testing. +SELECT COUNT(*) FROM pg_attribute + WHERE attrelid = 'pg_catalog.pg_statistic'::regclass AND + attnum > 0; + +-- Create a view that is used purely for the type based on pg_statistic. +CREATE VIEW stats_import.pg_statistic_flat_t AS + SELECT + a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, + s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, + s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, + s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, + s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, + s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, + s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, + s.stavalues5::text AS sv5 + FROM pg_statistic s + JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum + WHERE FALSE; + +-- Function to retrieve data used for diff comparisons between two +-- relations based on the contents of pg_statistic. +CREATE FUNCTION stats_import.pg_statistic_flat(p_relname text) +RETURNS SETOF stats_import.pg_statistic_flat_t +BEGIN ATOMIC + SELECT a.attname, s.stainherit, s.stanullfrac, s.stawidth, + s.stadistinct, s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, + s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, s.stacoll1, s.stacoll2, + s.stacoll3, s.stacoll4, s.stacoll5, s.stanumbers1, s.stanumbers2, + s.stanumbers3, s.stanumbers4, s.stanumbers5, s.stavalues1::text, + s.stavalues2::text, s.stavalues3::text, + s.stavalues4::text, s.stavalues5::text + FROM pg_statistic s + JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum + JOIN pg_class c ON c.oid = a.attrelid + WHERE c.relnamespace = 'stats_import'::regnamespace + AND c.relname = p_relname; +END; + +-- Comparison function for pg_statistic. The two relations defined by +-- the function caller are compared. 
+CREATE FUNCTION stats_import.pg_statistic_get_difference(a text, b text) +RETURNS TABLE (relname text, stats stats_import.pg_statistic_flat_t) +BEGIN ATOMIC + WITH aset AS (SELECT * FROM stats_import.pg_statistic_flat(a)), + bset AS (SELECT * FROM stats_import.pg_statistic_flat(b)) + SELECT a AS relname, a_minus_b::stats_import.pg_statistic_flat_t + FROM (TABLE aset EXCEPT TABLE bset) AS a_minus_b + UNION ALL + SELECT b AS relname, b_minus_a::stats_import.pg_statistic_flat_t + FROM (TABLE bset EXCEPT TABLE aset) AS b_minus_a; +END; + +-- Test to detect any new columns added to pg_stats_ext. If any columns +-- are added, we may need to update pg_stats_ext_flat() and the facilities +-- we are testing. +SELECT COUNT(*) FROM pg_attribute + WHERE attrelid = 'pg_catalog.pg_stats_ext'::regclass AND + attnum > 0; + +-- Create a view that is used purely for the type based on pg_stats_ext. +CREATE VIEW stats_import.pg_stats_ext_flat_t AS + SELECT inherited, n_distinct, dependencies, most_common_vals, + most_common_freqs, most_common_base_freqs + FROM pg_stats_ext + WHERE FALSE; + +-- Function to retrieve data used for diff comparisons between two +-- relations based on the contents of pg_stats_ext. +CREATE FUNCTION stats_import.pg_stats_ext_flat(p_statname text) +RETURNS SETOF stats_import.pg_stats_ext_flat_t +BEGIN ATOMIC + SELECT inherited, n_distinct, dependencies, most_common_vals, + most_common_freqs, most_common_base_freqs + FROM pg_stats_ext + WHERE statistics_schemaname = 'stats_import' + AND statistics_name = p_statname; +END; + +-- Comparison function for pg_stats_ext. The two relations defined by +-- the function caller are compared. 
+CREATE FUNCTION stats_import.pg_stats_ext_get_difference(a text, b text) +RETURNS TABLE (statname text, stats stats_import.pg_stats_ext_flat_t) +BEGIN ATOMIC + WITH aset AS (SELECT * FROM stats_import.pg_stats_ext_flat(a)), + bset AS (SELECT * FROM stats_import.pg_stats_ext_flat(b)) + SELECT a AS relname, a_minus_b::stats_import.pg_stats_ext_flat_t + FROM (TABLE aset EXCEPT TABLE bset) AS a_minus_b + UNION ALL + SELECT b AS relname, b_minus_a::stats_import.pg_stats_ext_flat_t + FROM (TABLE bset EXCEPT TABLE aset) AS b_minus_a; +END; + +-- Test to detect any new columns added to pg_stats_ext_exprs. If any columns +-- are added, we may need to update pg_stats_ext_exprs_flat() and the facilities +-- we are testing. +SELECT COUNT(*) FROM pg_attribute + WHERE attrelid = 'pg_catalog.pg_stats_ext_exprs'::regclass AND + attnum > 0; + +-- Create a view that is used purely for the type based on pg_stats_ext_exprs. +CREATE VIEW stats_import.pg_stats_ext_exprs_flat_t AS + SELECT inherited, null_frac, avg_width, n_distinct, + most_common_vals::text AS most_common_vals, + most_common_freqs, histogram_bounds::text AS histogram_bounds, + correlation, most_common_elems::text AS most_common_elems, + most_common_elem_freqs, elem_count_histogram, + range_length_histogram::text AS range_length_histogram, + range_empty_frac, range_bounds_histogram::text AS range_bounds_histogram + FROM pg_stats_ext_exprs AS n + WHERE FALSE; + +-- Function to retrieve data used for diff comparisons between two +-- relations based on the contents of pg_stats_ext_exprs. 
+CREATE FUNCTION stats_import.pg_stats_ext_exprs_flat(p_statname text) +RETURNS SETOF stats_import.pg_stats_ext_exprs_flat_t +BEGIN ATOMIC + SELECT inherited, null_frac, avg_width, n_distinct, + most_common_vals::text AS most_common_vals, + most_common_freqs, histogram_bounds::text AS histogram_bounds, + correlation, most_common_elems::text AS most_common_elems, + most_common_elem_freqs, elem_count_histogram, + range_length_histogram::text AS range_length_histogram, + range_empty_frac, range_bounds_histogram::text AS range_bounds_histogram + FROM pg_stats_ext_exprs AS n + WHERE n.statistics_schemaname = 'stats_import' AND + n.statistics_name = p_statname; +END; + +-- Comparison function for pg_stats_ext_exprs. The two relations defined by +-- the function caller are compared. +CREATE FUNCTION stats_import.pg_stats_ext_exprs_get_difference(a text, b text) +RETURNS TABLE (statname text, stats stats_import.pg_stats_ext_exprs_flat_t) +BEGIN ATOMIC + WITH aset AS (SELECT * FROM stats_import.pg_stats_ext_exprs_flat(a)), + bset AS (SELECT * FROM stats_import.pg_stats_ext_exprs_flat(b)) + SELECT a AS relname, a_minus_b::stats_import.pg_stats_ext_exprs_flat_t + FROM (TABLE aset EXCEPT TABLE bset) AS a_minus_b + UNION ALL + SELECT b AS relname, b_minus_a::stats_import.pg_stats_ext_exprs_flat_t + FROM (TABLE bset EXCEPT TABLE aset) AS b_minus_a; +END; + +-- +-- Schema setup. 
+-- CREATE TYPE stats_import.complex_type AS ( a integer, b real, @@ -884,113 +1036,13 @@ AND c.relname IN ('test', 'test_clone', 'is_odd', 'is_odd_clone') GROUP BY c.relname ORDER BY c.relname; --- check test minus test_clone -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'test' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.test'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'test' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.test_clone'::regclass; - --- check test_clone minus test -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - 
s.stavalues5::text AS sv5, 'test_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.test_clone'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'test_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.test'::regclass; - --- check is_odd minus is_odd_clone -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'is_odd' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.is_odd'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - 
s.stavalues5::text AS sv5, 'is_odd' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.is_odd_clone'::regclass; - --- check is_odd_clone minus is_odd -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'is_odd_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.is_odd_clone'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'is_odd_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.is_odd'::regclass; +SELECT relname, (stats).* +FROM stats_import.pg_statistic_get_difference('test', 'test_clone') +\gx + +SELECT relname, (stats).* +FROM stats_import.pg_statistic_get_difference('is_odd', 'is_odd_clone') +\gx -- attribute stats exist before a clear, but not after SELECT COUNT(*) @@ -2171,96 +2223,14 @@ CROSS JOIN LATERAL ( WHERE e.statistics_schemaname = 'stats_import' AND e.statistics_name = 'test_stat'; --- Set difference old MINUS new. 
-SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat' -EXCEPT -SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone'; --- Set difference new MINUS old. -SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone' -EXCEPT -SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat'; - --- Set difference for exprs: old MINUS new. 
-SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat' -EXCEPT -SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone'; - --- Set difference for exprs: new MINUS old. 
-SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone' -EXCEPT -SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat'; +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_get_difference('test_stat', 'test_stat_clone') +\gx + +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_exprs_get_difference('test_stat', 'test_stat_clone') +\gx + ANALYZE stats_import.test_mr; @@ -2302,96 +2272,13 @@ CROSS JOIN LATERAL ( WHERE e.statistics_schemaname = 'stats_import' AND e.statistics_name = 'test_mr_stat'; --- Set difference old MINUS new. 
-SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat' -EXCEPT -SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone'; --- Set difference new MINUS old. -SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone' -EXCEPT -SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat'; - --- Set difference for exprs: old MINUS new. 
-SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat' -EXCEPT -SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone'; - --- Set difference for exprs: new MINUS old. 
-SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone' -EXCEPT -SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat'; +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_get_difference('test_mr_stat', 'test_mr_stat_clone') +\gx + +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_exprs_get_difference('test_mr_stat', 'test_mr_stat_clone') +\gx -- range_length_histogram, range_empty_frac, and range_bounds_histogram -- have been added to pg_stats_ext_exprs in PostgreSQL 19. 
When dumping From 4da2afd01f938af35c1a52bbf6bc40baa52462f6 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 9 Mar 2026 13:46:27 +0900 Subject: [PATCH 03/32] Fix size underestimation of DSA pagemap for odd-sized segments When make_new_segment() creates an odd-sized segment, the pagemap was only sized based on a number of usable_pages entries, forgetting that a segment also contains metadata pages, and that the FreePageManager uses absolute page indices that cover the entire segment. This miscalculation could cause accesses to pagemap entries to be out of bounds. During subsequent reuse of the allocated segment, allocations landing on pages with indices higher than usable_pages could cause out-of-bounds pagemap reads and/or writes. On write, 'span' pointers are stored into the data area, corrupting the allocated objects. On read (aka during a dsa_free), garbage is interpreted as a span pointer, typically crashing the server in dsa_get_address(). The normal geometric path correctly sizes the pagemap for all pages in the segment. The odd-sized path needs to do the same, but it works forward from usable_pages rather than backward from total_size. This commit fixes the sizing of the odd-sized case by adding pagemap entries for the metadata pages after the initial metadata_bytes calculation, using an integer ceiling division to compute the exact number of additional entries needed in one go, avoiding any iteration in the calculation. An assertion is added in the code path for odd-sized segments, ensuring that the pagemap includes the metadata area, and that the result is appropriately sized. This problem would show up depending on the size requested for the allocation of a DSA segment. The reporter has noticed this issue when a parallel hash join makes a DSA allocation large enough to trigger the odd-sized segment path, but it could happen for anything that does a DSA allocation. 
A regression test is added to test_dsa, down to v17 where the test module has been introduced. This adds a set of cheap tests to check the problem, the new assertion being useful for this purpose. Sami has proposed a test that took a longer time than what I have done here; the test committed is faster and good enough to check the odd-sized allocation path. Author: Paul Bunn Reviewed-by: Sami Imseih Reviewed-by: Chao Li Reviewed-by: Michael Paquier Discussion: https://postgr.es/m/044401dcabac$fe432490$fac96db0$@icloud.com Backpatch-through: 14 --- src/backend/utils/mmgr/dsa.c | 29 ++++++++++++++ .../modules/test_dsa/expected/test_dsa.out | 16 ++++++++ src/test/modules/test_dsa/sql/test_dsa.sql | 7 ++++ src/test/modules/test_dsa/test_dsa--1.0.sql | 4 ++ src/test/modules/test_dsa/test_dsa.c | 40 +++++++++++++++++++ 5 files changed, 96 insertions(+) diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index ce9ede4c19697..4b4f1e1965ba3 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -2196,6 +2196,8 @@ make_new_segment(dsa_area *area, size_t requested_pages) /* See if that is enough... */ if (requested_pages > usable_pages) { + size_t total_requested_pages PG_USED_FOR_ASSERTS_ONLY; + /* * We'll make an odd-sized segment, working forward from the requested * number of pages. @@ -2206,10 +2208,37 @@ make_new_segment(dsa_area *area, size_t requested_pages) MAXALIGN(sizeof(FreePageManager)) + usable_pages * sizeof(dsa_pointer); + /* + * We must also account for pagemap entries needed to cover the + * metadata pages themselves. The pagemap must track all pages in the + * segment, including the pages occupied by metadata. + * + * This formula uses integer ceiling division to compute the exact + * number of additional entries needed. 
The divisor (FPM_PAGE_SIZE - + * sizeof(dsa_pointer)) accounts for the fact that each metadata page + * consumes one pagemap entry of sizeof(dsa_pointer) bytes, leaving + * only (FPM_PAGE_SIZE - sizeof(dsa_pointer)) net bytes per metadata + * page. + */ + metadata_bytes += + ((metadata_bytes + (FPM_PAGE_SIZE - sizeof(dsa_pointer)) - 1) / + (FPM_PAGE_SIZE - sizeof(dsa_pointer))) * + sizeof(dsa_pointer); + /* Add padding up to next page boundary. */ if (metadata_bytes % FPM_PAGE_SIZE != 0) metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE); total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE; + total_requested_pages = total_size / FPM_PAGE_SIZE; + + /* + * Verify that we allocated enough pagemap entries for metadata and + * usable pages. This reverse-engineers the new calculation of + * "metadata_bytes" done based on the new "requested_pages" for an + * odd-sized segment. + */ + Assert((metadata_bytes - MAXALIGN(sizeof(dsa_segment_header)) - + MAXALIGN(sizeof(FreePageManager))) / sizeof(dsa_pointer) >= total_requested_pages); /* Is that too large for dsa_pointer's addressing scheme? */ if (total_size > DSA_MAX_SEGMENT_SIZE) diff --git a/src/test/modules/test_dsa/expected/test_dsa.out b/src/test/modules/test_dsa/expected/test_dsa.out index 266010e77fe9e..4b53a7de4a443 100644 --- a/src/test/modules/test_dsa/expected/test_dsa.out +++ b/src/test/modules/test_dsa/expected/test_dsa.out @@ -11,3 +11,19 @@ SELECT test_dsa_resowners(); (1 row) +-- Test allocations across a pre-defined range of pages. This covers enough +-- range to check for the case of odd-sized segments, without making the test +-- too slow. +SELECT test_dsa_allocate(1001, 2000, 100); + test_dsa_allocate +------------------- + +(1 row) + +-- Larger size with odd-sized segment. 
+SELECT test_dsa_allocate(6501, 6600, 100); + test_dsa_allocate +------------------- + +(1 row) + diff --git a/src/test/modules/test_dsa/sql/test_dsa.sql b/src/test/modules/test_dsa/sql/test_dsa.sql index c3d8db9437206..99b4a60dd14ca 100644 --- a/src/test/modules/test_dsa/sql/test_dsa.sql +++ b/src/test/modules/test_dsa/sql/test_dsa.sql @@ -2,3 +2,10 @@ CREATE EXTENSION test_dsa; SELECT test_dsa_basic(); SELECT test_dsa_resowners(); + +-- Test allocations across a pre-defined range of pages. This covers enough +-- range to check for the case of odd-sized segments, without making the test +-- too slow. +SELECT test_dsa_allocate(1001, 2000, 100); +-- Larger size with odd-sized segment. +SELECT test_dsa_allocate(6501, 6600, 100); diff --git a/src/test/modules/test_dsa/test_dsa--1.0.sql b/src/test/modules/test_dsa/test_dsa--1.0.sql index 2904cb23525e3..3ee2e44cc0068 100644 --- a/src/test/modules/test_dsa/test_dsa--1.0.sql +++ b/src/test/modules/test_dsa/test_dsa--1.0.sql @@ -10,3 +10,7 @@ CREATE FUNCTION test_dsa_basic() CREATE FUNCTION test_dsa_resowners() RETURNS pg_catalog.void AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION test_dsa_allocate(int, int, int) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_dsa/test_dsa.c b/src/test/modules/test_dsa/test_dsa.c index ed2a07c962fc6..edcab105de621 100644 --- a/src/test/modules/test_dsa/test_dsa.c +++ b/src/test/modules/test_dsa/test_dsa.c @@ -16,6 +16,7 @@ #include "storage/dsm_registry.h" #include "storage/lwlock.h" #include "utils/dsa.h" +#include "utils/freepage.h" #include "utils/resowner.h" PG_MODULE_MAGIC; @@ -120,3 +121,42 @@ test_dsa_resowners(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * test_dsa_allocate + * + * Test DSA allocation across a range of sizes to exercise the pagemap + * sizing logic in make_new_segment(). 
A fresh DSA is created for each + * iteration so that each allocation triggers a new segment creation, + * including the odd-sized segment path. + */ +PG_FUNCTION_INFO_V1(test_dsa_allocate); +Datum +test_dsa_allocate(PG_FUNCTION_ARGS) +{ + int start_num_pages = PG_GETARG_INT32(0); + int end_num_pages = PG_GETARG_INT32(1); + int step = PG_GETARG_INT32(2); + size_t usable_pages; + int *tranche_id; + bool found; + dsa_area *a; + dsa_pointer dp; + + if (start_num_pages > end_num_pages) + elog(ERROR, "incorrect start and end parameters"); + + tranche_id = GetNamedDSMSegment("test_dsa", sizeof(int), + init_tranche, &found, NULL); + + for (usable_pages = start_num_pages; usable_pages < end_num_pages; usable_pages += step) + { + a = dsa_create(*tranche_id); + dp = dsa_allocate(a, usable_pages * FPM_PAGE_SIZE); + + dsa_free(a, dp); + dsa_detach(a); + } + + PG_RETURN_VOID(); +} From 173aa8c5e89130c757c029262f10fc537b6f68ae Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Mon, 9 Mar 2026 18:23:36 +0900 Subject: [PATCH 04/32] doc: Document IF NOT EXISTS option for ALTER FOREIGN TABLE ADD COLUMN. Commit 2cd40adb85d added the IF NOT EXISTS option to ALTER TABLE ADD COLUMN. This also enabled IF NOT EXISTS for ALTER FOREIGN TABLE ADD COLUMN, but the ALTER FOREIGN TABLE documentation was not updated to mention it. This commit updates the documentation to describe the IF NOT EXISTS option for ALTER FOREIGN TABLE ADD COLUMN. While updating that section, also this commit clarifies that the COLUMN keyword is optional in ALTER FOREIGN TABLE ADD/DROP COLUMN. Previously, part of the documentation could be read as if COLUMN were required. This commit adds regression tests covering these ALTER FOREIGN TABLE syntaxes. Backpatch to all supported versions. 
Suggested-by: Fujii Masao Author: Chao Li Reviewed-by: Robert Treat Reviewed-by: Fujii Masao Discussion: https://postgr.es/m/CAHGQGwFk=rrhrwGwPtQxBesbT4DzSZ86Q3ftcwCu3AR5bOiXLw@mail.gmail.com Backpatch-through: 14 --- doc/src/sgml/ref/alter_foreign_table.sgml | 8 +++++--- src/test/regress/expected/foreign_data.out | 13 +++++++++++++ src/test/regress/sql/foreign_data.sql | 7 +++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/ref/alter_foreign_table.sgml b/doc/src/sgml/ref/alter_foreign_table.sgml index e6d99e99016e7..228067f087cee 100644 --- a/doc/src/sgml/ref/alter_foreign_table.sgml +++ b/doc/src/sgml/ref/alter_foreign_table.sgml @@ -32,7 +32,7 @@ ALTER FOREIGN TABLE [ IF EXISTS ] namewhere action is one of: - ADD [ COLUMN ] column_name data_type [ COLLATE collation ] [ column_constraint [ ... ] ] + ADD [ COLUMN ] [ IF NOT EXISTS ] column_name data_type [ COLLATE collation ] [ column_constraint [ ... ] ] DROP [ COLUMN ] [ IF EXISTS ] column_name [ RESTRICT | CASCADE ] ALTER [ COLUMN ] column_name [ SET DATA ] TYPE data_type [ COLLATE collation ] ALTER [ COLUMN ] column_name SET DEFAULT expression @@ -67,11 +67,13 @@ ALTER FOREIGN TABLE [ IF EXISTS ] name - ADD COLUMN + ADD [ COLUMN ] [ IF NOT EXISTS ] This form adds a new column to the foreign table, using the same syntax as CREATE FOREIGN TABLE. + If IF NOT EXISTS is specified and a column already + exists with this name, no error is thrown. Unlike the case when adding a column to a regular table, nothing happens to the underlying storage: this action simply declares that some new column is now accessible through the foreign table. @@ -80,7 +82,7 @@ ALTER FOREIGN TABLE [ IF EXISTS ] name - DROP COLUMN [ IF EXISTS ] + DROP [ COLUMN ] [ IF EXISTS ] This form drops a column from a foreign table. 
diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out index cce49e509abe4..6af54d9803f6d 100644 --- a/src/test/regress/expected/foreign_data.out +++ b/src/test/regress/expected/foreign_data.out @@ -828,10 +828,13 @@ COMMENT ON COLUMN ft1.c1 IS NULL; ALTER FOREIGN TABLE ft1 ADD COLUMN c4 integer; ALTER FOREIGN TABLE ft1 ADD COLUMN c5 integer DEFAULT 0; ALTER FOREIGN TABLE ft1 ADD COLUMN c6 integer; +ALTER FOREIGN TABLE ft1 ADD COLUMN IF NOT EXISTS c6 integer; +NOTICE: column "c6" of relation "ft1" already exists, skipping ALTER FOREIGN TABLE ft1 ADD COLUMN c7 integer NOT NULL; ALTER FOREIGN TABLE ft1 ADD COLUMN c8 integer; ALTER FOREIGN TABLE ft1 ADD COLUMN c9 integer; ALTER FOREIGN TABLE ft1 ADD COLUMN c10 integer OPTIONS (p1 'v1'); +ALTER FOREIGN TABLE ft1 ADD c11 integer; ALTER FOREIGN TABLE ft1 ALTER COLUMN c4 SET DEFAULT 0; ALTER FOREIGN TABLE ft1 ALTER COLUMN c5 DROP DEFAULT; ALTER FOREIGN TABLE ft1 ALTER COLUMN c6 SET NOT NULL; @@ -863,6 +866,7 @@ ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 SET STORAGE PLAIN; c8 | text | | | | (p2 'V2') | plain | | c9 | integer | | | | | plain | | c10 | integer | | | | (p1 'v1') | plain | | + c11 | integer | | | | | plain | | Check constraints: "ft1_c2_check" CHECK (c2 <> ''::text) "ft1_c3_check" CHECK (c3 >= '01-01-1994'::date AND c3 <= '01-31-1994'::date) @@ -897,6 +901,7 @@ ERROR: column "no_column" of relation "ft1" does not exist ALTER FOREIGN TABLE ft1 DROP COLUMN IF EXISTS no_column; NOTICE: column "no_column" of relation "ft1" does not exist, skipping ALTER FOREIGN TABLE ft1 DROP COLUMN c9; +ALTER FOREIGN TABLE ft1 DROP c11; ALTER FOREIGN TABLE ft1 ADD COLUMN c11 serial; ALTER FOREIGN TABLE ft1 SET SCHEMA foreign_schema; ALTER FOREIGN TABLE ft1 SET TABLESPACE ts; -- ERROR @@ -931,6 +936,8 @@ ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c4 integer; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD 
COLUMN c6 integer; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN IF NOT EXISTS c6 integer; +NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c7 integer NOT NULL; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c8 integer; @@ -939,6 +946,8 @@ ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c9 integer; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c10 integer OPTIONS (p1 'v1'); NOTICE: relation "doesnt_exist_ft1" does not exist, skipping +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD c11 integer; +NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ALTER COLUMN c6 SET NOT NULL; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ALTER COLUMN c7 DROP NOT NULL; @@ -960,10 +969,14 @@ ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 OWNER TO regress_test_role; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 OPTIONS (DROP delimiter, SET quote '~', ADD escape '@'); NOTICE: relation "doesnt_exist_ft1" does not exist, skipping +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP COLUMN no_column; +NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP COLUMN IF EXISTS no_column; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP COLUMN c9; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP c11; +NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 SET SCHEMA foreign_schema; 
NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 RENAME c1 TO foreign_column_1; diff --git a/src/test/regress/sql/foreign_data.sql b/src/test/regress/sql/foreign_data.sql index aa147b14a90a0..084d5559e098e 100644 --- a/src/test/regress/sql/foreign_data.sql +++ b/src/test/regress/sql/foreign_data.sql @@ -383,10 +383,12 @@ COMMENT ON COLUMN ft1.c1 IS NULL; ALTER FOREIGN TABLE ft1 ADD COLUMN c4 integer; ALTER FOREIGN TABLE ft1 ADD COLUMN c5 integer DEFAULT 0; ALTER FOREIGN TABLE ft1 ADD COLUMN c6 integer; +ALTER FOREIGN TABLE ft1 ADD COLUMN IF NOT EXISTS c6 integer; ALTER FOREIGN TABLE ft1 ADD COLUMN c7 integer NOT NULL; ALTER FOREIGN TABLE ft1 ADD COLUMN c8 integer; ALTER FOREIGN TABLE ft1 ADD COLUMN c9 integer; ALTER FOREIGN TABLE ft1 ADD COLUMN c10 integer OPTIONS (p1 'v1'); +ALTER FOREIGN TABLE ft1 ADD c11 integer; ALTER FOREIGN TABLE ft1 ALTER COLUMN c4 SET DEFAULT 0; ALTER FOREIGN TABLE ft1 ALTER COLUMN c5 DROP DEFAULT; @@ -419,6 +421,7 @@ ALTER FOREIGN TABLE ft1 OPTIONS (DROP delimiter, SET quote '~', ADD escape '@'); ALTER FOREIGN TABLE ft1 DROP COLUMN no_column; -- ERROR ALTER FOREIGN TABLE ft1 DROP COLUMN IF EXISTS no_column; ALTER FOREIGN TABLE ft1 DROP COLUMN c9; +ALTER FOREIGN TABLE ft1 DROP c11; ALTER FOREIGN TABLE ft1 ADD COLUMN c11 serial; ALTER FOREIGN TABLE ft1 SET SCHEMA foreign_schema; ALTER FOREIGN TABLE ft1 SET TABLESPACE ts; -- ERROR @@ -430,10 +433,12 @@ ALTER FOREIGN TABLE foreign_schema.ft1 RENAME TO foreign_table_1; -- alter noexisting table ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c4 integer; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c6 integer; +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN IF NOT EXISTS c6 integer; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c7 integer NOT NULL; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c8 integer; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c9 integer; ALTER 
FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c10 integer OPTIONS (p1 'v1'); +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD c11 integer; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ALTER COLUMN c6 SET NOT NULL; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ALTER COLUMN c7 DROP NOT NULL; @@ -447,8 +452,10 @@ ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP CONSTRAINT IF EXISTS no_cons ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP CONSTRAINT ft1_c1_check; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 OWNER TO regress_test_role; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 OPTIONS (DROP delimiter, SET quote '~', ADD escape '@'); +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP COLUMN no_column; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP COLUMN IF EXISTS no_column; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP COLUMN c9; +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP c11; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 SET SCHEMA foreign_schema; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 RENAME c1 TO foreign_column_1; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 RENAME TO foreign_table_1; From 2799e29fb8b1346bd6eab625ce8ad508eab5dc81 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Mon, 9 Mar 2026 09:46:36 +0100 Subject: [PATCH 05/32] Move comment back to better place Commit f014b1b9bb8 inserted some new code in between existing code and a trailing comment. Move the comment back to near the code it belongs to. 
--- configure | 2 +- configure.ac | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configure b/configure index 4aaaf92ba0a12..42621ecd05189 100755 --- a/configure +++ b/configure @@ -5341,6 +5341,7 @@ if test x"$pgac_cv_prog_CC_cflags__Werror_vla" = x"yes"; then fi + # -Wvla is not applicable for C++ # On macOS, complain about usage of symbols newer than the deployment target { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} supports -Werror=unguarded-availability-new, for CFLAGS" >&5 @@ -5433,7 +5434,6 @@ if test x"$pgac_cv_prog_CXX_cxxflags__Werror_unguarded_availability_new" = x"yes fi - # -Wvla is not applicable for C++ { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} supports -Wendif-labels, for CFLAGS" >&5 $as_echo_n "checking whether ${CC} supports -Wendif-labels, for CFLAGS... " >&6; } diff --git a/configure.ac b/configure.ac index 9bc457bac87a2..61ec895d23cf3 100644 --- a/configure.ac +++ b/configure.ac @@ -549,10 +549,10 @@ if test "$GCC" = yes -a "$ICC" = no; then AC_SUBST(PERMIT_DECLARATION_AFTER_STATEMENT) # Really don't want VLAs to be used in our dialect of C PGAC_PROG_CC_CFLAGS_OPT([-Werror=vla]) + # -Wvla is not applicable for C++ # On macOS, complain about usage of symbols newer than the deployment target PGAC_PROG_CC_CFLAGS_OPT([-Werror=unguarded-availability-new]) PGAC_PROG_CXX_CFLAGS_OPT([-Werror=unguarded-availability-new]) - # -Wvla is not applicable for C++ PGAC_PROG_CC_CFLAGS_OPT([-Wendif-labels]) PGAC_PROG_CXX_CFLAGS_OPT([-Wendif-labels]) PGAC_PROG_CC_CFLAGS_OPT([-Wmissing-format-attribute]) From 06d83022628e6bfa9c4bd1feabe2b41e0bdc1310 Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Mon, 9 Mar 2026 15:10:03 +0530 Subject: [PATCH 06/32] Remove trailing period from errmsg in subscriptioncmds.c. 
Author: Sahitya Chandra Discussion: https://postgr.es/m/20260308142806.181309-1-sahityajb@gmail.com --- src/backend/commands/subscriptioncmds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 9e21d7a7df941..724637cff5bd4 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -2993,7 +2993,7 @@ check_pub_dead_tuple_retention(WalReceiverConn *wrconn) if (remote_in_recovery) ereport(ERROR, errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot enable retain_dead_tuples if the publisher is in recovery.")); + errmsg("cannot enable retain_dead_tuples if the publisher is in recovery")); ExecDropSingleTupleTableSlot(slot); From ffc226ab64d4ebdb089c278396d5df3d0a3f83b9 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 9 Mar 2026 06:36:42 -0400 Subject: [PATCH 07/32] Prevent restore of incremental backup from bloating VM fork. When I (rhaas) wrote the WAL summarizer code, I incorrectly believed that XLOG_SMGR_TRUNCATE truncates all forks to the same length. In fact, what other parts of the code do is compute the truncation length for the FSM and VM forks from the truncation length used for the main fork. But, because I was confused, I coded the WAL summarizer to set the limit block for the VM fork to the same value as for the main fork. (Incremental backup always copies FSM forks in full, so there is no similar issue in that case.) Doing that doesn't directly cause any data corruption, as far as I can see. However, it does create a serious risk of consuming a large amount of extra disk space, because pg_combinebackup's reconstruct.c believes that the reconstructed file should always be at least as long as the limit block value. 
We might want to be smarter about that at some point in the future, because it's always safe to omit all-zeroes blocks at the end of the last segment of a relation, and doing so could save disk space, but the current algorithm will rarely waste enough disk space to worry about unless we believe that a relation has been truncated to a length much longer than its actual length on disk, which is exactly what happens as a result of the problem mentioned in the previous paragraph. To fix, create a new visibilitymap helper function and use it to include the right limit block in the summary files. Incremental backups taken with existing summary files will still have this issue, but this should improve the situation going forward. Diagnosed-by: Oleg Tkachenko Diagnosed-by: Amul Sul Discussion: http://postgr.es/m/CAAJ_b97PqG89hvPNJ8cGwmk94gJ9KOf_pLsowUyQGZgJY32o9g@mail.gmail.com Discussion: http://postgr.es/m/6897DAF7-B699-41BF-A6FB-B818FCFFD585%40gmail.com Backpatch-through: 17 --- src/backend/access/heap/visibilitymap.c | 17 +++++++++++++ src/backend/postmaster/walsummarizer.c | 4 +++- .../pg_combinebackup/t/011_ib_truncation.pl | 24 ++++++++++++++++--- src/include/access/visibilitymap.h | 1 + 4 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 3047bd46def96..e21b96281a637 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -116,6 +116,8 @@ /* Mapping from heap block number to the right bit in the visibility map */ #define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE) +#define HEAPBLK_TO_MAPBLOCK_LIMIT(x) \ + (((x) + HEAPBLOCKS_PER_PAGE - 1) / HEAPBLOCKS_PER_PAGE) #define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE) #define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK) @@ -600,6 +602,21 @@ visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks) return 
newnblocks; } +/* + * visibilitymap_truncation_length - + * compute truncation length for visibility map + * + * Given a proposed truncation length for the main fork, compute the + * correct truncation length for the visibility map. Should return the + * same answer as visibilitymap_prepare_truncate(), but without modifying + * anything. + */ +BlockNumber +visibilitymap_truncation_length(BlockNumber nheapblocks) +{ + return HEAPBLK_TO_MAPBLOCK_LIMIT(nheapblocks); +} + /* * Read a visibility map page. * diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index 742137edad69f..e1aa102f41dce 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -23,6 +23,7 @@ #include "postgres.h" #include "access/timeline.h" +#include "access/visibilitymap.h" #include "access/xlog.h" #include "access/xlog_internal.h" #include "access/xlogrecovery.h" @@ -1351,7 +1352,8 @@ SummarizeSmgrRecord(XLogReaderState *xlogreader, BlockRefTable *brtab) MAIN_FORKNUM, xlrec->blkno); if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0) BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator, - VISIBILITYMAP_FORKNUM, xlrec->blkno); + VISIBILITYMAP_FORKNUM, + visibilitymap_truncation_length(xlrec->blkno)); } } diff --git a/src/bin/pg_combinebackup/t/011_ib_truncation.pl b/src/bin/pg_combinebackup/t/011_ib_truncation.pl index 47d84434452fb..c5e0124c04deb 100644 --- a/src/bin/pg_combinebackup/t/011_ib_truncation.pl +++ b/src/bin/pg_combinebackup/t/011_ib_truncation.pl @@ -1,7 +1,8 @@ # Copyright (c) 2025-2026, PostgreSQL Global Development Group # -# This test aims to validate that the calculated truncation block never exceeds -# the segment size. +# This test aims to validate two things: (1) that the calculated truncation +# block never exceeds the segment size and (2) that the correct limit block +# length is calculated for the VM fork. 
use strict; use warnings FATAL => 'all'; @@ -39,7 +40,7 @@ CREATE TABLE t ( id int, data text STORAGE PLAIN - ); + ) WITH (autovacuum_enabled = false); }); # The tuple size should be enough to prevent two tuples from being on the same @@ -83,6 +84,23 @@ $primary->backup('incr', backup_options => [ '--incremental', "$full_backup/backup_manifest" ]); +# We used to have a bug where the wrong limit block was calculated for the +# VM fork, so verify that the WAL summary records the correct VM fork +# truncation limit. We can't just check whether the restored VM fork is +# the right size on disk, because it's so small that the incremental backup +# code will send the entire file. +my $relfilenode = $primary->safe_psql('postgres', + "SELECT pg_relation_filenode('t');"); +my $vm_limits = $primary->safe_psql('postgres', + "SELECT string_agg(relblocknumber::text, ',') + FROM pg_available_wal_summaries() s, + pg_wal_summary_contents(s.tli, s.start_lsn, s.end_lsn) c + WHERE c.relfilenode = $relfilenode + AND c.relforknumber = 2 + AND c.is_limit_block;"); +is($vm_limits, '1', + 'WAL summary has correct VM fork truncation limit'); + # Combine full and incremental backups. Before the fix, this failed because # the INCREMENTAL file header contained an incorrect truncation_block value. 
my $restored = PostgreSQL::Test::Cluster->new('node2'); diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h index a0166c5b41035..52cde56be8651 100644 --- a/src/include/access/visibilitymap.h +++ b/src/include/access/visibilitymap.h @@ -45,5 +45,6 @@ extern uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer extern void visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen); extern BlockNumber visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks); +extern BlockNumber visibilitymap_truncation_length(BlockNumber nheapblocks); #endif /* VISIBILITYMAP_H */ From 8300d3ad4aa73dc6beec8dca7d9362dbc21c9a83 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 9 Mar 2026 08:16:30 -0400 Subject: [PATCH 08/32] Consider startup cost as a figure of merit for partial paths. Previously, the comments stated that there was no purpose to considering startup cost for partial paths, but this is not the case: it's perfectly reasonable to want a fast-start path for a plan that involves a LIMIT (perhaps over an aggregate, so that there is enough data being processed to justify parallel query but yet we don't want all the result rows). Accordingly, rewrite add_partial_path and add_partial_path_precheck to consider startup costs. This also fixes an independent bug in add_partial_path_precheck: commit e22253467942fdb100087787c3e1e3a8620c54b2 failed to update it to do anything with the new disabled_nodes field. That bug fix is formally separate from the rest of this patch and could be committed separately, but I think it makes more sense to fix both issues together, because then we can (as this commit does) just make add_partial_path_precheck do the cost comparisons in the same way as compare_path_costs_fuzzily, which hopefully reduces the chances of ending up with something that's still incorrect. 
This patch is based on earlier work on this topic by Tomas Vondra, but I have rewritten a great deal of it. Co-authored-by: Robert Haas Co-authored-by: Tomas Vondra Discussion: http://postgr.es/m/CA+TgmobRufbUSksBoxytGJS1P+mQY4rWctCk-d0iAUO6-k9Wrg@mail.gmail.com --- src/backend/optimizer/path/joinpath.c | 3 + src/backend/optimizer/util/pathnode.c | 165 +++++++++++------- src/include/optimizer/pathnode.h | 2 +- .../regress/expected/incremental_sort.out | 28 +-- src/test/regress/expected/join_hash.out | 13 +- src/test/regress/sql/join_hash.sql | 10 +- 6 files changed, 133 insertions(+), 88 deletions(-) diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index e0c00e26dd5d5..044560da7bf7a 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -1048,6 +1048,7 @@ try_partial_nestloop_path(PlannerInfo *root, initial_cost_nestloop(root, &workspace, jointype, nestloop_subtype, outer_path, inner_path, extra); if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes, + workspace.startup_cost, workspace.total_cost, pathkeys)) return; @@ -1237,6 +1238,7 @@ try_partial_mergejoin_path(PlannerInfo *root, extra); if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes, + workspace.startup_cost, workspace.total_cost, pathkeys)) return; @@ -1369,6 +1371,7 @@ try_partial_hashjoin_path(PlannerInfo *root, initial_cost_hashjoin(root, &workspace, jointype, hashclauses, outer_path, inner_path, extra, parallel_hash); if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes, + workspace.startup_cost, workspace.total_cost, NIL)) return; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index ef8ef6e89d377..c94e077000f5a 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -778,10 +778,9 @@ add_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, * * Because we don't consider parameterized paths 
here, we also don't * need to consider the row counts as a measure of quality: every path will - * produce the same number of rows. Neither do we need to consider startup - * costs: parallelism is only used for plans that will be run to completion. - * Therefore, this routine is much simpler than add_path: it needs to - * consider only disabled nodes, pathkeys and total cost. + * produce the same number of rows. However, we do need to consider the + * startup costs: this partial path could be used beneath a Limit node, + * so a fast-start plan could be correct. * * As with add_path, we pfree paths that are found to be dominated by * another partial path; this requires that there be no other references to @@ -819,52 +818,41 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path) /* Compare pathkeys. */ keyscmp = compare_pathkeys(new_path->pathkeys, old_path->pathkeys); - /* Unless pathkeys are incompatible, keep just one of the two paths. */ + /* + * Unless pathkeys are incompatible, see if one of the paths dominates + * the other (both in startup and total cost). It may happen that one + * path has lower startup cost, the other has lower total cost. + */ if (keyscmp != PATHKEYS_DIFFERENT) { - if (unlikely(new_path->disabled_nodes != old_path->disabled_nodes)) + PathCostComparison costcmp; + + /* + * Do a fuzzy cost comparison with standard fuzziness limit. + */ + costcmp = compare_path_costs_fuzzily(new_path, old_path, + STD_FUZZ_FACTOR); + if (costcmp == COSTS_BETTER1) { - if (new_path->disabled_nodes > old_path->disabled_nodes) - accept_new = false; - else + if (keyscmp != PATHKEYS_BETTER2) remove_old = true; } - else if (new_path->total_cost > old_path->total_cost - * STD_FUZZ_FACTOR) + else if (costcmp == COSTS_BETTER2) { - /* New path costs more; keep it only if pathkeys are better. 
*/ if (keyscmp != PATHKEYS_BETTER1) accept_new = false; } - else if (old_path->total_cost > new_path->total_cost - * STD_FUZZ_FACTOR) + else if (costcmp == COSTS_EQUAL) { - /* Old path costs more; keep it only if pathkeys are better. */ - if (keyscmp != PATHKEYS_BETTER2) + if (keyscmp == PATHKEYS_BETTER1) remove_old = true; - } - else if (keyscmp == PATHKEYS_BETTER1) - { - /* Costs are about the same, new path has better pathkeys. */ - remove_old = true; - } - else if (keyscmp == PATHKEYS_BETTER2) - { - /* Costs are about the same, old path has better pathkeys. */ - accept_new = false; - } - else if (old_path->total_cost > new_path->total_cost * 1.0000000001) - { - /* Pathkeys are the same, and the old path costs more. */ - remove_old = true; - } - else - { - /* - * Pathkeys are the same, and new path isn't materially - * cheaper. - */ - accept_new = false; + else if (keyscmp == PATHKEYS_BETTER2) + accept_new = false; + else if (compare_path_costs_fuzzily(new_path, old_path, + 1.0000000001) == COSTS_BETTER1) + remove_old = true; + else + accept_new = false; } } @@ -915,16 +903,16 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path) * add_partial_path_precheck * Check whether a proposed new partial path could possibly get accepted. * - * Unlike add_path_precheck, we can ignore startup cost and parameterization, - * since they don't matter for partial paths (see add_partial_path). But - * we do want to make sure we don't add a partial path if there's already - * a complete path that dominates it, since in that case the proposed path - * is surely a loser. + * Unlike add_path_precheck, we can ignore parameterization, since it doesn't + * matter for partial paths (see add_partial_path). But we do want to make + * sure we don't add a partial path if there's already a complete path that + * dominates it, since in that case the proposed path is surely a loser. 
*/ bool add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, - Cost total_cost, List *pathkeys) + Cost startup_cost, Cost total_cost, List *pathkeys) { + bool consider_startup = parent_rel->consider_startup; ListCell *p1; /* @@ -934,25 +922,81 @@ add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, * is clearly superior to some existing partial path -- at least, modulo * final cost computations. If so, we definitely want to consider it. * - * Unlike add_path(), we always compare pathkeys here. This is because we - * expect partial_pathlist to be very short, and getting a definitive - * answer at this stage avoids the need to call add_path_precheck. + * Unlike add_path(), we never try to exit this loop early. This is + * because we expect partial_pathlist to be very short, and getting a + * definitive answer at this stage avoids the need to call + * add_path_precheck. */ foreach(p1, parent_rel->partial_pathlist) { Path *old_path = (Path *) lfirst(p1); + PathCostComparison costcmp; PathKeysComparison keyscmp; - keyscmp = compare_pathkeys(pathkeys, old_path->pathkeys); - if (keyscmp != PATHKEYS_DIFFERENT) + /* + * First, compare costs and disabled nodes. This logic should be + * identical to compare_path_costs_fuzzily, except that one of the + * paths hasn't been created yet, and the fuzz factor is always + * STD_FUZZ_FACTOR. 
+ */ + if (unlikely(old_path->disabled_nodes != disabled_nodes)) + { + if (disabled_nodes < old_path->disabled_nodes) + costcmp = COSTS_BETTER1; + else + costcmp = COSTS_BETTER2; + } + else if (total_cost > old_path->total_cost * STD_FUZZ_FACTOR) { - if (total_cost > old_path->total_cost * STD_FUZZ_FACTOR && - keyscmp != PATHKEYS_BETTER1) - return false; - if (old_path->total_cost > total_cost * STD_FUZZ_FACTOR && - keyscmp != PATHKEYS_BETTER2) - return true; + if (consider_startup && + old_path->startup_cost > startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_DIFFERENT; + else + costcmp = COSTS_BETTER2; + } + else if (old_path->total_cost > total_cost * STD_FUZZ_FACTOR) + { + if (consider_startup && + startup_cost > old_path->startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_DIFFERENT; + else + costcmp = COSTS_BETTER1; } + else if (startup_cost > old_path->startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_BETTER2; + else if (old_path->startup_cost > startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_BETTER1; + else + costcmp = COSTS_EQUAL; + + /* + * If one path wins on startup cost and the other on total cost, we + * can't say for sure which is better. + */ + if (costcmp == COSTS_DIFFERENT) + continue; + + /* + * If the two paths have different pathkeys, we can't say for sure + * which is better. + */ + keyscmp = compare_pathkeys(pathkeys, old_path->pathkeys); + if (keyscmp == PATHKEYS_DIFFERENT) + continue; + + /* + * If the existing path is cheaper and the pathkeys are equal or + * worse, the new path is not interesting. + */ + if (costcmp == COSTS_BETTER2 && keyscmp != PATHKEYS_BETTER1) + return false; + + /* + * If the new path is cheaper and the pathkeys are equal or better, it + * is definitely interesting. + */ + if (costcmp == COSTS_BETTER1 && keyscmp != PATHKEYS_BETTER2) + return true; } /* @@ -960,14 +1004,9 @@ add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, * clearly good enough that it might replace one. 
Compare it to * non-parallel plans. If it loses even before accounting for the cost of * the Gather node, we should definitely reject it. - * - * Note that we pass the total_cost to add_path_precheck twice. This is - * because it's never advantageous to consider the startup cost of a - * partial path; the resulting plans, if run in parallel, will be run to - * completion. */ - if (!add_path_precheck(parent_rel, disabled_nodes, total_cost, total_cost, - pathkeys, NULL)) + if (!add_path_precheck(parent_rel, disabled_nodes, startup_cost, + total_cost, pathkeys, NULL)) return false; return true; diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index cf8a654fa5368..938510400cc53 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -55,7 +55,7 @@ extern bool add_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, List *pathkeys, Relids required_outer); extern void add_partial_path(RelOptInfo *parent_rel, Path *new_path); extern bool add_partial_path_precheck(RelOptInfo *parent_rel, - int disabled_nodes, + int disabled_nodes, Cost startup_cost, Cost total_cost, List *pathkeys); extern Path *create_seqscan_path(PlannerInfo *root, RelOptInfo *rel, diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out index fdec5b9ba52a9..1e6e020fea836 100644 --- a/src/test/regress/expected/incremental_sort.out +++ b/src/test/regress/expected/incremental_sort.out @@ -1450,21 +1450,23 @@ explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1 set enable_incremental_sort = on; explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1; - QUERY PLAN ----------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------- Limit -> Incremental Sort Sort Key: a, b, (sum(c)) Presorted Key: a, b - -> GroupAggregate + -> Finalize 
GroupAggregate Group Key: a, b -> Gather Merge Workers Planned: 2 - -> Incremental Sort - Sort Key: a, b - Presorted Key: a - -> Parallel Index Scan using t_a_idx on t -(12 rows) + -> Partial GroupAggregate + Group Key: a, b + -> Incremental Sort + Sort Key: a, b + Presorted Key: a + -> Parallel Index Scan using t_a_idx on t +(14 rows) -- Incremental sort vs. set operations with varno 0 set enable_hashagg to off; @@ -1580,8 +1582,8 @@ from tenk1 t1 join tenk1 t2 on t1.unique1 = t2.unique2 join tenk1 t3 on t2.unique1 = t3.unique1 order by count(*); - QUERY PLAN ------------------------------------------------------------------------------------------------ + QUERY PLAN +---------------------------------------------------------------------------------------------------- Sort Sort Key: (count(*)) -> Finalize Aggregate @@ -1591,10 +1593,10 @@ order by count(*); -> Parallel Hash Join Hash Cond: (t2.unique1 = t3.unique1) -> Parallel Hash Join - Hash Cond: (t1.unique1 = t2.unique2) - -> Parallel Index Only Scan using tenk1_unique1 on tenk1 t1 + Hash Cond: (t2.unique2 = t1.unique1) + -> Parallel Index Scan using tenk1_unique2 on tenk1 t2 -> Parallel Hash - -> Parallel Index Scan using tenk1_unique2 on tenk1 t2 + -> Parallel Index Only Scan using tenk1_unique1 on tenk1 t1 -> Parallel Hash -> Parallel Index Only Scan using tenk1_unique1 on tenk1 t3 (15 rows) diff --git a/src/test/regress/expected/join_hash.out b/src/test/regress/expected/join_hash.out index 4749f6ed70d5f..bc7cc76467ffa 100644 --- a/src/test/regress/expected/join_hash.out +++ b/src/test/regress/expected/join_hash.out @@ -76,8 +76,8 @@ insert into extremely_skewed update pg_class set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192 where relname = 'extremely_skewed'; --- Make a relation with a couple of enormous tuples. -create table wide as select generate_series(1, 2) as id, rpad('', 320000, 'x') as t; +-- Make a relation with several enormous tuples. 
+create table wide as select generate_series(1, 3) as id, rpad('', 320000, 'x') as t; alter table wide set (parallel_workers = 2); -- The "optimal" case: the hash table fits in memory; we plan for 1 -- batch, we stick to that number, and peak memory usage stays within @@ -922,7 +922,7 @@ set work_mem = '128kB'; set hash_mem_multiplier = 1.0; explain (costs off) select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); QUERY PLAN ---------------------------------------------------------------- Finalize Aggregate @@ -934,10 +934,11 @@ explain (costs off) -> Parallel Seq Scan on wide -> Parallel Hash -> Parallel Seq Scan on wide wide_1 -(9 rows) + Filter: (id < 3) +(10 rows) select length(max(s.t)) -from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); length -------- 320000 @@ -947,7 +948,7 @@ select final > 1 as multibatch from hash_join_batches( $$ select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); $$); multibatch ------------ diff --git a/src/test/regress/sql/join_hash.sql b/src/test/regress/sql/join_hash.sql index 49d3fd6185629..53db1754bb261 100644 --- a/src/test/regress/sql/join_hash.sql +++ b/src/test/regress/sql/join_hash.sql @@ -83,8 +83,8 @@ update pg_class set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192 where relname = 'extremely_skewed'; --- Make a relation with a couple of enormous tuples. -create table wide as select generate_series(1, 2) as id, rpad('', 320000, 'x') as t; +-- Make a relation with several enormous tuples. 
+create table wide as select generate_series(1, 3) as id, rpad('', 320000, 'x') as t; alter table wide set (parallel_workers = 2); -- The "optimal" case: the hash table fits in memory; we plan for 1 @@ -496,14 +496,14 @@ set work_mem = '128kB'; set hash_mem_multiplier = 1.0; explain (costs off) select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); select length(max(s.t)) -from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); select final > 1 as multibatch from hash_join_batches( $$ select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); $$); rollback to settings; From 91f33a2ae92a68ac89b36eb21e0c5c903d03a142 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 9 Mar 2026 09:48:26 -0400 Subject: [PATCH 09/32] Replace get_relation_info_hook with build_simple_rel_hook. For a long time, PostgreSQL has had a get_relation_info_hook which plugins can use to editorialize on the information that get_relation_info obtains from the catalogs. However, this hook is only called for baserels of type RTE_RELATION, and there is potential utility in a similar call back for other types of RTEs. This might have had utility even before commit 4020b370f214315b8c10430301898ac21658143f added pgs_mask to RelOptInfo, but it certainly has utility now. So, move the callback up one level, deleting get_relation_info_hook and adding build_simple_rel_hook instead. 
The new callback is called just slightly later than before and with slightly different arguments, but it should be fairly straightforward to adjust existing code that currently uses get_relation_info_hook: the values previously available as relationObjectId and inhparent are now available via rte->relid and rte->inh, and calls where rte->rtekind != RTE_RELATION can be ignored if desired. Reviewed-by: Alexandra Wang Discussion: http://postgr.es/m/CA%2BTgmoYg8uUWyco7Pb3HYLMBRQoO6Zh9hwgm27V39Pb6Pdf%3Dug%40mail.gmail.com --- src/backend/optimizer/util/plancat.c | 14 -------------- src/backend/optimizer/util/relnode.c | 15 +++++++++++++++ src/include/optimizer/pathnode.h | 6 ++++++ src/include/optimizer/plancat.h | 8 -------- 4 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index d63e7390be764..b2fbd6a082bbc 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -57,9 +57,6 @@ /* GUC parameter */ int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION; -/* Hook for plugins to get control in get_relation_info() */ -get_relation_info_hook_type get_relation_info_hook = NULL; - typedef struct NotnullHashEntry { Oid relid; /* OID of the relation */ @@ -571,17 +568,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, set_relation_partition_info(root, rel, relation); table_close(relation, NoLock); - - /* - * Allow a plugin to editorialize on the info we obtained from the - * catalogs. Actions might include altering the assumed relation size, - * removing an index, or adding a hypothetical index to the indexlist. - * - * An extension can also modify rel->pgs_mask here to control path - * generation. 
- */ - if (get_relation_info_hook) - (*get_relation_info_hook) (root, relationObjectId, inhparent, rel); } /* diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index d21b4d3bb3563..91bcda34a3786 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -47,6 +47,9 @@ typedef struct JoinHashEntry RelOptInfo *join_rel; } JoinHashEntry; +/* Hook for plugins to get control in build_simple_rel() */ +build_simple_rel_hook_type build_simple_rel_hook = NULL; + /* Hook for plugins to get control during joinrel setup */ joinrel_setup_hook_type joinrel_setup_hook = NULL; @@ -394,6 +397,18 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) break; } + /* + * Allow a plugin to editorialize on the new RelOptInfo. This could + * involve editorializing on the information which get_relation_info + * obtained from the catalogs, such as altering the assumed relation size, + * removing an index, or adding a hypothetical index to the indexlist. + * + * An extension can also modify rel->pgs_mask here to control path + * generation. + */ + if (build_simple_rel_hook) + (*build_simple_rel_hook) (root, rel, rte); + /* * Apply the parent's quals to the child, with appropriate substitution of * variables. If any resulting clause is reduced to constant FALSE or diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 938510400cc53..da2d9b384b596 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -17,6 +17,12 @@ #include "nodes/bitmapset.h" #include "nodes/pathnodes.h" +/* Hook for plugins to get control in build_simple_rel() */ +typedef void (*build_simple_rel_hook_type) (PlannerInfo *root, + RelOptInfo *rel, + RangeTblEntry *rte); +extern PGDLLIMPORT build_simple_rel_hook_type build_simple_rel_hook; + /* * Everything in subpaths or partial_subpaths will become part of the * Append node's subpaths list. 
Partial and non-partial subpaths can be diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index 8d7cc6d9886b1..09baf1a691643 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -17,14 +17,6 @@ #include "nodes/pathnodes.h" #include "utils/relcache.h" -/* Hook for plugins to get control in get_relation_info() */ -typedef void (*get_relation_info_hook_type) (PlannerInfo *root, - Oid relationObjectId, - bool inhparent, - RelOptInfo *rel); -extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook; - - extern void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel); From 7c8280eeb5872f5c2663b562a9c6fcf8ec8a4b82 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Mon, 9 Mar 2026 11:37:46 -0500 Subject: [PATCH 10/32] pg_{dump,restore}: Refactor handling of conflicting options. This commit makes use of the function added by commit b2898baaf7 for these applications' handling of conflicting options. It doesn't fix any bugs, but it does trim several lines of code. 
Author: Jian He Reviewed-by: Steven Niu Reviewed-by: Zsolt Parragi Discussion: https://postgr.es/m/CACJufxHDYn%2B3-2jR_kwYB0U7UrNP%2B0EPvAWzBBD5EfUzzr1uiw%40mail.gmail.com --- src/bin/pg_dump/pg_dump.c | 69 ++++++------- src/bin/pg_dump/pg_dumpall.c | 2 +- src/bin/pg_dump/pg_restore.c | 146 ++++++++++------------------ src/bin/pg_dump/t/001_basic.pl | 26 ++--- src/bin/pg_dump/t/002_pg_dump.pl | 4 +- src/bin/pg_dump/t/007_pg_dumpall.pl | 8 +- 6 files changed, 100 insertions(+), 155 deletions(-) diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 8bde1b382defd..137161aa5e059 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -826,52 +826,39 @@ main(int argc, char **argv) if (dopt.column_inserts && dopt.dump_inserts == 0) dopt.dump_inserts = DUMP_DEFAULT_ROWS_PER_INSERT; - /* reject conflicting "-only" options */ - if (data_only && schema_only) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "-a/--data-only"); - if (schema_only && statistics_only) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "--statistics-only"); - if (data_only && statistics_only) - pg_fatal("options %s and %s cannot be used together", - "-a/--data-only", "--statistics-only"); - - /* reject conflicting "-only" and "no-" options */ - if (data_only && no_data) - pg_fatal("options %s and %s cannot be used together", - "-a/--data-only", "--no-data"); - if (schema_only && no_schema) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "--no-schema"); - if (statistics_only && no_statistics) - pg_fatal("options %s and %s cannot be used together", - "--statistics-only", "--no-statistics"); - - /* reject conflicting "no-" options */ - if (with_statistics && no_statistics) - pg_fatal("options %s and %s cannot be used together", - "--statistics", "--no-statistics"); - - /* reject conflicting "-only" options */ - if (data_only && with_statistics) - pg_fatal("options %s and %s cannot 
be used together", - "-a/--data-only", "--statistics"); - if (schema_only && with_statistics) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "--statistics"); - - if (schema_only && foreign_servers_include_patterns.head != NULL) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "--include-foreign-data"); + /* *-only options are incompatible with each other */ + check_mut_excl_opts(data_only, "-a/--data-only", + schema_only, "-s/--schema-only", + statistics_only, "--statistics-only"); + + /* --no-* and *-only for same thing are incompatible */ + check_mut_excl_opts(data_only, "-a/--data-only", + no_data, "--no-data"); + check_mut_excl_opts(schema_only, "-s/--schema-only", + no_schema, "--no-schema"); + check_mut_excl_opts(statistics_only, "--statistics-only", + no_statistics, "--no-statistics"); + + /* --statistics and --no-statistics are incompatible */ + check_mut_excl_opts(with_statistics, "--statistics", + no_statistics, "--no-statistics"); + + /* --statistics is incompatible with *-only (except --statistics-only) */ + check_mut_excl_opts(with_statistics, "--statistics", + data_only, "-a/--data-only", + schema_only, "-s/--schema-only"); + + /* --include-foreign-data is incompatible with --schema-only */ + check_mut_excl_opts(foreign_servers_include_patterns.head, "--include-foreign-data", + schema_only, "-s/--schema-only"); if (numWorkers > 1 && foreign_servers_include_patterns.head != NULL) pg_fatal("option %s is not supported with parallel backup", "--include-foreign-data"); - if (data_only && dopt.outputClean) - pg_fatal("options %s and %s cannot be used together", - "-c/--clean", "-a/--data-only"); + /* --clean is incompatible with --data-only */ + check_mut_excl_opts(dopt.outputClean, "-c/--clean", + data_only, "-a/--data-only"); if (dopt.if_exists && !dopt.outputClean) pg_fatal("option %s requires option %s", diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index 
4ded902095288..b29eaa819388c 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -421,7 +421,7 @@ main(int argc, char *argv[]) exit_nicely(1); } - /* --exclude_database is incompatible with global *-only options */ + /* --exclude-database is incompatible with global *-only options */ check_mut_excl_opts(database_exclude_patterns.head, "--exclude-database", globals_only, "-g/--globals-only", roles_only, "-r/--roles-only", diff --git a/src/bin/pg_dump/pg_restore.c b/src/bin/pg_dump/pg_restore.c index 752d859e264b8..fb44c0cfdfe4a 100644 --- a/src/bin/pg_dump/pg_restore.c +++ b/src/bin/pg_dump/pg_restore.c @@ -385,31 +385,20 @@ main(int argc, char **argv) if (!opts->cparams.dbname && !opts->filename && !opts->tocSummary) pg_fatal("one of -d/--dbname and -f/--file must be specified"); - if (db_exclude_patterns.head != NULL && globals_only) - { - pg_log_error("option %s cannot be used together with %s", - "--exclude-database", "-g/--globals-only"); - pg_log_error_hint("Try \"%s --help\" for more information.", progname); - exit_nicely(1); - } + /* --exclude-database and --globals-only are incompatible */ + check_mut_excl_opts(db_exclude_patterns.head, "--exclude-database", + globals_only, "-g/--globals-only"); /* Should get at most one of -d and -f, else user is confused */ - if (opts->cparams.dbname) - { - if (opts->filename) - { - pg_log_error("options %s and %s cannot be used together", - "-d/--dbname", "-f/--file"); - pg_log_error_hint("Try \"%s --help\" for more information.", progname); - exit_nicely(1); - } + check_mut_excl_opts(opts->cparams.dbname, "-d/--dbname", + opts->filename, "-f/--file"); - if (opts->restrict_key) - pg_fatal("options %s and %s cannot be used together", - "-d/--dbname", "--restrict-key"); + /* --dbname and --restrict-key are incompatible */ + check_mut_excl_opts(opts->cparams.dbname, "-d/--dbname", + opts->restrict_key, "--restrict-key"); + if (opts->cparams.dbname) opts->useDB = 1; - } else { /* @@ -423,85 
+412,54 @@ main(int argc, char **argv) pg_fatal("invalid restrict key"); } - /* reject conflicting "-only" options */ - if (data_only && schema_only) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "-a/--data-only"); - if (schema_only && statistics_only) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "--statistics-only"); - if (data_only && statistics_only) - pg_fatal("options %s and %s cannot be used together", - "-a/--data-only", "--statistics-only"); - - /* reject conflicting "-only" and "no-" options */ - if (data_only && no_data) - pg_fatal("options %s and %s cannot be used together", - "-a/--data-only", "--no-data"); - if (schema_only && no_schema) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "--no-schema"); - if (statistics_only && no_statistics) - pg_fatal("options %s and %s cannot be used together", - "--statistics-only", "--no-statistics"); - - /* reject conflicting "no-" options */ - if (with_statistics && no_statistics) - pg_fatal("options %s and %s cannot be used together", - "--statistics", "--no-statistics"); - - /* reject conflicting "only-" options */ - if (data_only && with_statistics) - pg_fatal("options %s and %s cannot be used together", - "-a/--data-only", "--statistics"); - if (schema_only && with_statistics) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "--statistics"); - - if (data_only && opts->dropSchema) - pg_fatal("options %s and %s cannot be used together", - "-c/--clean", "-a/--data-only"); - - if (opts->single_txn && opts->txn_size > 0) - pg_fatal("options %s and %s cannot be used together", - "-1/--single-transaction", "--transaction-size"); - - if (opts->single_txn && globals_only) - pg_fatal("options %s and %s cannot be used together when restoring an archive created by pg_dumpall", - "--single-transaction", "-g/--globals-only"); - - if (opts->txn_size && globals_only) - pg_fatal("options %s and %s cannot 
be used together when restoring an archive created by pg_dumpall", - "--transaction-size", "-g/--globals-only"); - - if (opts->exit_on_error && globals_only) - pg_fatal("options %s and %s cannot be used together when restoring an archive created by pg_dumpall", - "--exit-on-error", "-g/--globals-only"); - - if (data_only && globals_only) - pg_fatal("options %s and %s cannot be used together", - "-a/--data-only", "-g/--globals-only"); - if (schema_only && globals_only) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "-g/--globals-only"); - if (statistics_only && globals_only) - pg_fatal("options %s and %s cannot be used together", - "--statistics-only", "-g/--globals-only"); - if (with_statistics && globals_only) - pg_fatal("options %s and %s cannot be used together", - "--statistics", "-g/--globals-only"); - - if (no_globals && globals_only) - pg_fatal("options %s and %s cannot be used together", - "--no-globals", "-g/--globals-only"); + /* *-only options are incompatible with each other */ + check_mut_excl_opts(data_only, "-a/--data-only", + globals_only, "-g/--globals-only", + schema_only, "-s/--schema-only", + statistics_only, "--statistics-only"); + + /* --no-* and *-only for same thing are incompatible */ + check_mut_excl_opts(data_only, "-a/--data-only", + no_data, "--no-data"); + check_mut_excl_opts(globals_only, "-g/--globals-only", + no_globals, "--no-globals"); + check_mut_excl_opts(schema_only, "-s/--schema-only", + no_schema, "--no-schema"); + check_mut_excl_opts(statistics_only, "--statistics-only", + no_statistics, "--no-statistics"); + + /* --statistics and --no-statistics are incompatible */ + check_mut_excl_opts(with_statistics, "--statistics", + no_statistics, "--no-statistics"); + + /* --statistics is incompatible with *-only (except --statistics-only) */ + check_mut_excl_opts(with_statistics, "--statistics", + data_only, "-a/--data-only", + globals_only, "-g/--globals-only", + schema_only, "-s/--schema-only"); + + 
/* --clean and --data-only are incompatible */ + check_mut_excl_opts(opts->dropSchema, "-c/--clean", + data_only, "-a/--data-only"); + + /* + * --globals-only, --single-transaction, and --transaction-size are + * incompatible. + */ + check_mut_excl_opts(globals_only, "-g/--globals-only", + opts->single_txn, "-1/--single-transaction", + opts->txn_size, "--transaction-size"); + + /* --exit-on-error and --globals-only are incompatible */ + check_mut_excl_opts(opts->exit_on_error, "--exit-on-error", + globals_only, "-g/--globals-only"); /* * -C is not compatible with -1, because we can't create a database inside * a transaction block. */ - if (opts->createDB && opts->single_txn) - pg_fatal("options %s and %s cannot be used together", - "-C/--create", "-1/--single-transaction"); + check_mut_excl_opts(opts->createDB, "-C/--create", + opts->single_txn, "-1/--single-transaction"); /* Can't do single-txn mode with multiple connections */ if (opts->single_txn && numWorkers > 1) diff --git a/src/bin/pg_dump/t/001_basic.pl b/src/bin/pg_dump/t/001_basic.pl index 67131a674f446..2f5eb48e7b86c 100644 --- a/src/bin/pg_dump/t/001_basic.pl +++ b/src/bin/pg_dump/t/001_basic.pl @@ -46,8 +46,8 @@ command_fails_like( [ 'pg_dump', '-s', '-a' ], - qr/\Qpg_dump: error: options -s\/--schema-only and -a\/--data-only cannot be used together\E/, - 'pg_dump: options -s/--schema-only and -a/--data-only cannot be used together' + qr/\Qpg_dump: error: options -a\/--data-only and -s\/--schema-only cannot be used together\E/, + 'pg_dump: options -a/--data-only and -s/--schema-only cannot be used together' ); command_fails_like( @@ -64,8 +64,8 @@ command_fails_like( [ 'pg_dump', '-s', '--include-foreign-data=xxx' ], - qr/\Qpg_dump: error: options -s\/--schema-only and --include-foreign-data cannot be used together\E/, - 'pg_dump: options -s/--schema-only and --include-foreign-data cannot be used together' + qr/\Qpg_dump: error: options --include-foreign-data and -s\/--schema-only cannot be used 
together\E/, + 'pg_dump: options --include-foreign-data and -s/--schema-only cannot be used together' ); command_fails_like( @@ -87,8 +87,8 @@ command_fails_like( [ 'pg_restore', '-s', '-a', '-f -' ], - qr/\Qpg_restore: error: options -s\/--schema-only and -a\/--data-only cannot be used together\E/, - 'pg_restore: options -s/--schema-only and -a/--data-only cannot be used together' + qr/\Qpg_restore: error: options -a\/--data-only and -s\/--schema-only cannot be used together\E/, + 'pg_restore: options -a/--data-only and -s/--schema-only cannot be used together' ); command_fails_like( @@ -300,8 +300,8 @@ command_fails_like( [ 'pg_restore', '--exclude-database=foo', '--globals-only', '-d', 'xxx' ], - qr/\Qpg_restore: error: option --exclude-database cannot be used together with -g\/--globals-only\E/, - 'pg_restore: option --exclude-database cannot be used together with -g/--globals-only' + qr/\Qpg_restore: error: options --exclude-database and -g\/--globals-only cannot be used together\E/, + 'pg_restore: options --exclude-database and -g/--globals-only cannot be used together' ); command_fails_like( @@ -312,14 +312,14 @@ command_fails_like( [ 'pg_restore', '--schema-only', '--globals-only', '-d', 'xxx' ], - qr/\Qpg_restore: error: options -s\/--schema-only and -g\/--globals-only cannot be used together\E/, - 'pg_restore: error: options -s/--schema-only and -g/--globals-only cannot be used together' + qr/\Qpg_restore: error: options -g\/--globals-only and -s\/--schema-only cannot be used together\E/, + 'pg_restore: error: options -g/--globals-only and -s/--schema-only cannot be used together' ); command_fails_like( [ 'pg_restore', '--statistics-only', '--globals-only', '-d', 'xxx' ], - qr/\Qpg_restore: error: options --statistics-only and -g\/--globals-only cannot be used together\E/, - 'pg_restore: error: options --statistics-only and -g/--globals-only cannot be used together' + qr/\Qpg_restore: error: options -g\/--globals-only and --statistics-only cannot be used 
together\E/, + 'pg_restore: error: options -g/--globals-only and --statistics-only cannot be used together' ); command_fails_like( @@ -339,6 +339,6 @@ 'pg_restore', '--globals-only', '--no-globals', '-d', 'xxx', 'dumpdir' ], - qr/\Qpg_restore: error: options --no-globals and -g\/--globals-only cannot be used together\E/, + qr/\Qpg_restore: error: options -g\/--globals-only and --no-globals cannot be used together\E/, 'options --no-globals and --globals-only cannot be used together'); done_testing(); diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index e7cc998cfbad2..6d1d38128fcf7 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -5077,8 +5077,8 @@ '--schema-only', '--statistics', ], - qr/\Qpg_dump: error: options -s\/--schema-only and --statistics cannot be used together\E/, - 'cannot use --schema-only and --statistics together'); + qr/\Qpg_dump: error: options --statistics and -s\/--schema-only cannot be used together\E/, + 'cannot use --statistics and --schema-only together'); command_fails_like( [ diff --git a/src/bin/pg_dump/t/007_pg_dumpall.pl b/src/bin/pg_dump/t/007_pg_dumpall.pl index c16c27d7387c2..22f11a13a9a68 100644 --- a/src/bin/pg_dump/t/007_pg_dumpall.pl +++ b/src/bin/pg_dump/t/007_pg_dumpall.pl @@ -520,7 +520,7 @@ '--schema-only', '--file' => "$tempdir/error_test.sql", ], - qr/\Qpg_restore: error: options -s\/--schema-only and -g\/--globals-only cannot be used together\E/, + qr/\Qpg_restore: error: options -g\/--globals-only and -s\/--schema-only cannot be used together\E/, 'When --globals-only and --schema-only are used together'); # report an error when --globals-only and --statistics-only are used together @@ -533,7 +533,7 @@ '--statistics-only', '--file' => "$tempdir/error_test.sql", ], - qr/\Qpg_restore: error: options --statistics-only and -g\/--globals-only cannot be used together\E/, + qr/\Qpg_restore: error: options -g\/--globals-only and --statistics-only cannot be used 
together\E/, 'When --globals-only and --statistics-only are used together'); # report an error when --globals-only and --statistics are used together @@ -572,7 +572,7 @@ '--single-transaction', '--file' => "$tempdir/error_test.sql", ], - qr/\Qpg_restore: error: options --single-transaction and -g\/--globals-only cannot be used together\E/, + qr/\Qpg_restore: error: options -g\/--globals-only and -1\/--single-transaction cannot be used together\E/, 'When --globals-only and --single-transaction are used together'); # report an error when --globals-only and --transaction-size are used together @@ -585,7 +585,7 @@ '--transaction-size' => '100', '--file' => "$tempdir/error_test.sql", ], - qr/\Qpg_restore: error: options --transaction-size and -g\/--globals-only cannot be used together\E/, + qr/\Qpg_restore: error: options -g\/--globals-only and --transaction-size cannot be used together\E/, 'When --globals-only and --transaction-size are used together'); # verify map.dat preamble exists From 6307b096e2599edfe238816118b3f365a73fd12a Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 10 Mar 2026 07:05:32 +0900 Subject: [PATCH 11/32] Fix misuse of "volatile" in xml.c What should be used is not "volatile foo *ptr" but "foo *volatile ptr". The incorrect (former) style means that what the pointer variable points to is volatile. The correct (latter) style means that the pointer variable itself needs to be treated as volatile. The latter style is required to ensure a consistent treatment of these variables after a longjmp with the TRY/CATCH blocks. Some casts can be removed thanks to this change. Issue introduced by 2e947217474c, so no backpatch is required. A similar set of issues has been fixed in 93001888d85c for contrib/xml2/.
Author: ChangAo Chen Discussion: https://postgr.es/m/tencent_5BE8DAD985EE140ED62EA728C8D4E1311F0A@qq.com --- src/backend/utils/adt/xml.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index 2c8d5a81b751d..79f6cf7b4fa76 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -529,7 +529,7 @@ xmltext(PG_FUNCTION_ARGS) #ifdef USE_LIBXML text *arg = PG_GETARG_TEXT_PP(0); text *result; - volatile xmlChar *xmlbuf = NULL; + xmlChar *volatile xmlbuf = NULL; PgXmlErrorContext *xmlerrcxt; /* First we gotta spin up some error handling. */ @@ -544,19 +544,19 @@ xmltext(PG_FUNCTION_ARGS) "could not allocate xmlChar"); result = cstring_to_text_with_len((const char *) xmlbuf, - xmlStrlen((const xmlChar *) xmlbuf)); + xmlStrlen(xmlbuf)); } PG_CATCH(); { if (xmlbuf) - xmlFree((xmlChar *) xmlbuf); + xmlFree(xmlbuf); pg_xml_done(xmlerrcxt, true); PG_RE_THROW(); } PG_END_TRY(); - xmlFree((xmlChar *) xmlbuf); + xmlFree(xmlbuf); pg_xml_done(xmlerrcxt, false); PG_RETURN_XML_P(result); @@ -4247,7 +4247,7 @@ xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt) } else { - volatile xmlChar *str = NULL; + xmlChar *volatile str = NULL; PG_TRY(); { @@ -4267,7 +4267,7 @@ xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt) PG_FINALLY(); { if (str) - xmlFree((xmlChar *) str); + xmlFree(str); } PG_END_TRY(); } From 03facc1211b0ff1550f41bcd4da09329080c30f9 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 10 Mar 2026 12:00:05 +0900 Subject: [PATCH 12/32] Switch to FATAL error for missing checkpoint record without backup_label Crash recovery started without a backup_label previously crashed with a PANIC if the checkpoint record could not be found. This commit lowers the report generated to be a FATAL instead. 
With recovery methods being more imaginative these days, this should provide more flexibility when handling PostgreSQL recovery processing in the event of a driver error, similarly to 15f68cebdcec. An extra benefit of this change is that it becomes possible to add a test to check that a FATAL is hit with an expected error message pattern. With the recovery code becoming more complicated over the last couple of years, I suspect that this will be beneficial to cover in the long-term. The original PANIC behavior has been introduced in the early days of crash recovery, as of 4d14fe0048cf (PANIC did not exist yet, the code used STOP). Author: Nitin Jadhav Discussion: https://postgr.es/m/CAMm1aWZbQ-Acp_xAxC7mX9uZZMH8+NpfepY9w=AOxbBVT9E=uA@mail.gmail.com --- src/backend/access/transam/xlogrecovery.c | 2 +- src/test/recovery/meson.build | 1 + .../t/052_checkpoint_segment_missing.pl | 59 +++++++++++++++++++ 3 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 src/test/recovery/t/052_checkpoint_segment_missing.pl diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index d55a534b13883..6d2c4a86b9600 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -735,7 +735,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, * can't read the last checkpoint because this allows us to * simplify processing around checkpoints.
*/ - ereport(PANIC, + ereport(FATAL, errmsg("could not locate a valid checkpoint record at %X/%08X", LSN_FORMAT_ARGS(CheckPointLoc))); } diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build index 8d20488952e69..36d789720a3c8 100644 --- a/src/test/recovery/meson.build +++ b/src/test/recovery/meson.build @@ -60,6 +60,7 @@ tests += { 't/049_wait_for_lsn.pl', 't/050_redo_segment_missing.pl', 't/051_effective_wal_level.pl', + 't/052_checkpoint_segment_missing.pl', ], }, } diff --git a/src/test/recovery/t/052_checkpoint_segment_missing.pl b/src/test/recovery/t/052_checkpoint_segment_missing.pl new file mode 100644 index 0000000000000..da54d141f0dea --- /dev/null +++ b/src/test/recovery/t/052_checkpoint_segment_missing.pl @@ -0,0 +1,59 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group +# +# Verify crash recovery behavior when the WAL segment containing the +# checkpoint record referenced by pg_controldata is missing. This +# checks the code path where there is no backup_label file, where the +# startup process should fail with FATAL and log a message about the +# missing checkpoint record. + +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +my $node = PostgreSQL::Test::Cluster->new('testnode'); +$node->init; +$node->append_conf('postgresql.conf', 'log_checkpoints = on'); +$node->start; + +# Force a checkpoint so as pg_controldata points to a checkpoint record we +# can target. +$node->safe_psql('postgres', 'CHECKPOINT;'); + +# Retrieve the checkpoint LSN and derive the WAL segment name. +my $checkpoint_walfile = $node->safe_psql('postgres', + "SELECT pg_walfile_name(checkpoint_lsn) FROM pg_control_checkpoint()"); + +ok($checkpoint_walfile ne '', + "derived checkpoint WAL file name: $checkpoint_walfile"); + +# Stop the node. +$node->stop('immediate'); + +# Remove the WAL segment containing the checkpoint record. +my $walpath = $node->data_dir . 
"/pg_wal/$checkpoint_walfile"; +ok(-f $walpath, "checkpoint WAL file exists before deletion: $walpath"); + +unlink $walpath + or die "could not remove WAL file $walpath: $!"; + +ok(!-e $walpath, "checkpoint WAL file removed: $walpath"); + +# Use run_log instead of node->start because this test expects that +# the server ends with an error during recovery. +run_log( + [ + 'pg_ctl', + '--pgdata' => $node->data_dir, + '--log' => $node->logfile, + 'start', + ]); + +# Confirm that recovery has failed as expected. +my $logfile = slurp_file($node->logfile()); +ok( $logfile =~ + qr/FATAL: .* could not locate a valid checkpoint record at .*/, + "FATAL logged for missing checkpoint record (no backup_label path)"); + +done_testing(); From 0fbfd37cefb7eb30c0fa8a158751c19ddeddf1f0 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 10 Mar 2026 08:33:55 -0400 Subject: [PATCH 13/32] Allow extensions to mark an individual index as disabled. Up until now, the only way for a loadable module to disable the use of a particular index was to use build_simple_rel_hook (or, previous to yesterday's commit, get_relation_info_hook) to remove it from the index list. While that works, it has some disadvantages. First, the index becomes invisible for all purposes, and can no longer be used for optimizations such as self-join elimination or left join removal, which can severely degrade the resulting plan. Second, if the module attempts to compel the use of a certain index by removing all other indexes from the index list and disabling other scan types, but the planner is unable to use the chosen index for some reason, it will fall back to a sequential scan, because that is only disabled, whereas the other indexes are, from the planner's point of view, completely gone. While this situation ideally shouldn't occur, it's hard for a loadable module to be completely sure whether the planner will view a certain index as usable for a certain query. 
If it isn't, it may be better to fall back to a scan using a disabled index rather than falling back to an also-disabled sequential scan. Reviewed-by: Alexandra Wang Discussion: http://postgr.es/m/CA%2BTgmoYS4ZCVAF2jTce%3DbMP0Oq_db_srocR4cZyO0OBp9oUoGg%40mail.gmail.com --- src/backend/optimizer/util/pathnode.c | 8 ++++++++ src/include/nodes/pathnodes.h | 2 ++ 2 files changed, 10 insertions(+) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index c94e077000f5a..96cc72a776b8d 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1122,6 +1122,14 @@ create_index_path(PlannerInfo *root, cost_index(pathnode, root, loop_count, partial_path); + /* + * cost_index will set disabled_nodes to 1 if this rel is not allowed to + * use index scans in general, but it doesn't have the IndexOptInfo to + * know whether this specific index has been disabled. + */ + if (index->disabled) + pathnode->path.disabled_nodes = 1; + return pathnode; } diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index c175ee95b68c5..27758ec16fe66 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -1412,6 +1412,8 @@ typedef struct IndexOptInfo bool nullsnotdistinct; /* is uniqueness enforced immediately? */ bool immediate; + /* true if paths using this index should be marked disabled */ + bool disabled; /* true if index doesn't really exist */ bool hypothetical; From 8080f44f96a978ce94f7e6b44df1158880525e01 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 10 Mar 2026 13:56:52 +0100 Subject: [PATCH 14/32] Rename grammar nonterminal to simplify reuse A list of expressions with optional AS-labels is useful in a few different places. Right now, this is available as xml_attribute_list because it was first used in the XMLATTRIBUTES construct, but it is already used elsewhere, and there are other possible future uses. 
To reduce possible confusion going forward, rename it to labeled_expr_list (like existing expr_list plus ColLabel). Discussion: https://www.postgresql.org/message-id/flat/a855795d-e697-4fa5-8698-d20122126567@eisentraut.org --- src/backend/parser/gram.y | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 9cbe8eafc4545..19d8a29a35ecd 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -620,8 +620,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type opt_provider security_label -%type xml_attribute_el -%type xml_attribute_list xml_attributes +%type labeled_expr +%type labeled_expr_list xml_attributes %type xml_root_version opt_xml_root_standalone %type xmlexists_argument %type document_or_content @@ -16317,7 +16317,7 @@ func_expr_common_subexpr: COERCE_SQL_SYNTAX, @1); } - | XMLFOREST '(' xml_attribute_list ')' + | XMLFOREST '(' labeled_expr_list ')' { $$ = makeXmlExpr(IS_XMLFOREST, NULL, $3, NIL, @1); } @@ -16542,14 +16542,14 @@ opt_xml_root_standalone: ',' STANDALONE_P YES_P { $$ = makeIntConst(XML_STANDALONE_OMITTED, -1); } ; -xml_attributes: XMLATTRIBUTES '(' xml_attribute_list ')' { $$ = $3; } +xml_attributes: XMLATTRIBUTES '(' labeled_expr_list ')' { $$ = $3; } ; -xml_attribute_list: xml_attribute_el { $$ = list_make1($1); } - | xml_attribute_list ',' xml_attribute_el { $$ = lappend($1, $3); } +labeled_expr_list: labeled_expr { $$ = list_make1($1); } + | labeled_expr_list ',' labeled_expr { $$ = lappend($1, $3); } ; -xml_attribute_el: a_expr AS ColLabel +labeled_expr: a_expr AS ColLabel { $$ = makeNode(ResTarget); $$->name = $3; From 59bae234352d10535da7e655bcd7bc8a1339f57f Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Tue, 10 Mar 2026 22:55:11 +0900 Subject: [PATCH 15/32] Remove duplicate initialization in initialize_brin_buildstate(). 
Commit dae761a added initialization of some BrinBuildState fields in initialize_brin_buildstate(). Later, commit b437571 inadvertently added the same initialization again. This commit removes that redundant initialization. No behavioral change is intended. Author: Chao Li Reviewed-by: Shinya Kato Discussion: https://postgr.es/m/CAEoWx2nmrca6-9SNChDvRYD6+r==fs9qg5J93kahS7vpoq8QVg@mail.gmail.com --- src/backend/access/brin/brin.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 146ee97a47dc4..1909c3254b5ba 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -1689,9 +1689,6 @@ initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap, state->bs_leader = NULL; state->bs_worker_id = 0; state->bs_sortstate = NULL; - state->bs_context = CurrentMemoryContext; - state->bs_emptyTuple = NULL; - state->bs_emptyTupleLen = 0; /* Remember the memory context to use for an empty tuple, if needed. */ state->bs_context = CurrentMemoryContext; From a198c26dede50a1fd4c22fb134b53ce8d4f8f5e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Tue, 10 Mar 2026 16:00:19 +0100 Subject: [PATCH 16/32] pg_dumpall: simplify coding of dropDBs() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's no need for a StringInfo when all you want is a string being constructed in a single pass. 
Author: Álvaro Herrera Reported-by: Ranier Vilela Reviewed-by: Yang Yuanzhuo <1197620467@qq.com> Reviewed-by: Michael Paquier Reviewed-by: Andrew Dunstan Discussion: https://postgr.es/m/CAEudQAq2wyXZRdsh+wVHcOrungPU+_aQeQU12wbcgrmE0bQovA@mail.gmail.com --- src/bin/pg_dump/pg_dumpall.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index b29eaa819388c..3d2a1d27aefdd 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -1833,7 +1833,6 @@ dropDBs(PGconn *conn) for (i = 0; i < PQntuples(res); i++) { char *dbname = PQgetvalue(res, i, 0); - PQExpBuffer delQry = createPQExpBuffer(); /* * Skip "postgres" and "template1"; dumpDatabases() will deal with @@ -1846,15 +1845,14 @@ dropDBs(PGconn *conn) { if (archDumpFormat == archNull) { - appendPQExpBuffer(delQry, "DROP DATABASE %s%s;\n", - if_exists ? "IF EXISTS " : "", - fmtId(dbname)); - fprintf(OPF, "%s", delQry->data); + fprintf(OPF, "DROP DATABASE %s%s;\n", + if_exists ? "IF EXISTS " : "", + fmtId(dbname)); } else { - appendPQExpBuffer(delQry, "DROP DATABASE IF EXISTS %s;\n", - fmtId(dbname)); + char *stmt = psprintf("DROP DATABASE IF EXISTS %s;\n", + fmtId(dbname)); ArchiveEntry(fout, nilCatalogId, /* catalog ID */ @@ -1862,10 +1860,9 @@ dropDBs(PGconn *conn) ARCHIVE_OPTS(.tag = psprintf("DATABASE %s", fmtId(dbname)), .description = "DROP_GLOBAL", .section = SECTION_PRE_DATA, - .createStmt = delQry->data)); + .createStmt = stmt)); + pg_free(stmt); } - - destroyPQExpBuffer(delQry); } } From f4a4ce52c0d1565c13c436ea17960d22787ea752 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 10 Mar 2026 10:06:09 -0400 Subject: [PATCH 17/32] heapam: Don't mimic MarkBufferDirtyHint() in inplace updates Previously heap_inplace_update_and_unlock() used an operation order similar to MarkBufferDirty(), to reduce the number of different approaches used for updating buffers. 
However, in an upcoming patch, MarkBufferDirtyHint() will switch to using the update protocol used by most other places (enabled by hint bits only being set while holding a share-exclusive lock). Luckily it's pretty easy to adjust heap_inplace_update_and_unlock(). As a comment already foresaw, we can use the normal order, with the slight change of updating the buffer contents after WAL logging. Reviewed-by: Heikki Linnakangas Reviewed-by: Noah Misch Discussion: https://postgr.es/m/5ubipyssiju5twkb7zgqwdr7q2vhpkpmuelxfpanetlk6ofnop@hvxb4g2amb2d --- src/backend/access/heap/heapam.c | 35 ++++++++++++-------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index a231563f0dfec..1ecc83308510c 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -6613,11 +6613,11 @@ heap_inplace_update_and_unlock(Relation relation, /*---------- * NO EREPORT(ERROR) from here till changes are complete * - * Our buffer lock won't stop a reader having already pinned and checked - * visibility for this tuple. Hence, we write WAL first, then mutate the - * buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(), - * checkpoint delay makes that acceptable. With the usual order of - * changes, a crash after memcpy() and before XLogInsert() could allow + * Our exclusive buffer lock won't stop a reader having already pinned and + * checked visibility for this tuple. With the usual order of changes + * (i.e. updating the buffer contents before WAL logging), a reader could + * observe our not-yet-persistent update to relfrozenxid and update + * datfrozenxid based on that. 
A crash in that moment could allow * datfrozenxid to overtake relfrozenxid: * * ["D" is a VACUUM (ONLY_DATABASE_STATS)] @@ -6629,21 +6629,15 @@ heap_inplace_update_and_unlock(Relation relation, * [crash] * [recovery restores datfrozenxid w/o relfrozenxid] * - * Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint(). - * Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack. - * The stack copy facilitates a FPI of the post-mutation block before we - * accept other sessions seeing it. DELAY_CHKPT_START allows us to - * XLogInsert() before MarkBufferDirty(). Since XLogSaveBufferForHint() - * can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START. - * This function, however, likely could avoid it with the following order - * of operations: MarkBufferDirty(), XLogInsert(), memcpy(). Opt to use - * DELAY_CHKPT_START here, too, as a way to have fewer distinct code - * patterns to analyze. Inplace update isn't so frequent that it should - * pursue the small optimization of skipping DELAY_CHKPT_START. - */ - Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + * We avoid that by using a temporary copy of the buffer to hide our + * change from other backends until the change has been WAL-logged. We + * apply our change to the temporary copy and WAL-log it, before modifying + * the real page. That way any action a reader of the in-place-updated + * value takes will be WAL logged after this change. 
+ */ START_CRIT_SECTION(); - MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + MarkBufferDirty(buffer); /* XLOG stuff */ if (RelationNeedsWAL(relation)) @@ -6692,8 +6686,6 @@ heap_inplace_update_and_unlock(Relation relation, memcpy(dst, src, newlen); - MarkBufferDirty(buffer); - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); /* @@ -6702,7 +6694,6 @@ heap_inplace_update_and_unlock(Relation relation, */ AtInplace_Inval(); - MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; END_CRIT_SECTION(); UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock); From a596d27d807974778513158cb0eafc76bae33d97 Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Tue, 10 Mar 2026 11:36:38 -0700 Subject: [PATCH 18/32] Fix grammar in short description of effective_wal_level. Align with the convention of using third-person singular (e.g., "Shows" instead of "Show") for GUC parameter descriptions. Author: Kyotaro Horiguchi Discussion: https://postgr.es/m/20260210.143752.1113524465620875233.horikyota.ntt@gmail.com --- src/backend/utils/misc/guc_parameters.dat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 5ee84a639d828..a5a0edf2534aa 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -806,7 +806,7 @@ }, { name => 'effective_wal_level', type => 'enum', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', - short_desc => 'Show effective WAL level.', + short_desc => 'Shows effective WAL level.', flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', variable => 'effective_wal_level', boot_val => 'WAL_LEVEL_REPLICA', From ac58465e0618941842439eb3f5a2cf8bebd5a3f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Tue, 10 Mar 2026 19:56:39 +0100 Subject: [PATCH 19/32] Introduce the REPACK command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit REPACK absorbs the functionality 
of VACUUM FULL and CLUSTER in a single command. Because this functionality is completely different from regular VACUUM, having it separate from VACUUM makes it easier for users to understand; as for CLUSTER, the term is heavily overloaded in the IT world and even in Postgres itself, so it's good that we can avoid it. We retain those older commands, but de-emphasize them in the documentation, in favor of REPACK; the difference between VACUUM FULL and CLUSTER (namely, the fact that tuples are written in a specific ordering) is neatly absorbed as two different modes of REPACK. This allows us to introduce further functionality in the future that works regardless of whether an ordering is being applied, such as (and especially) a concurrent mode. Author: Antonin Houska Reviewed-by: Mihail Nikalayeu Reviewed-by: Álvaro Herrera Reviewed-by: Robert Treat Reviewed-by: Euler Taveira Reviewed-by: Matheus Alcantara Reviewed-by: Junwang Zhao Reviewed-by: jian he Discussion: https://postgr.es/m/82651.1720540558@antos Discussion: https://postgr.es/m/202507262156.sb455angijk6@alvherre.pgsql --- doc/src/sgml/monitoring.sgml | 226 +++++- doc/src/sgml/ref/allfiles.sgml | 1 + doc/src/sgml/ref/cluster.sgml | 99 +-- doc/src/sgml/ref/repack.sgml | 330 +++++++++ doc/src/sgml/ref/vacuum.sgml | 33 +- doc/src/sgml/reference.sgml | 1 + src/backend/access/heap/heapam_handler.c | 32 +- src/backend/catalog/index.c | 2 +- src/backend/catalog/system_views.sql | 29 +- src/backend/commands/cluster.c | 889 +++++++++++++++-------- src/backend/commands/vacuum.c | 6 +- src/backend/parser/gram.y | 86 ++- src/backend/tcop/utility.c | 29 +- src/backend/utils/adt/pgstatfuncs.c | 4 +- src/bin/psql/tab-complete.in.c | 43 +- src/include/catalog/catversion.h | 2 +- src/include/commands/cluster.h | 8 +- src/include/commands/progress.h | 50 +- src/include/nodes/parsenodes.h | 35 +- src/include/parser/kwlist.h | 1 + src/include/tcop/cmdtaglist.h | 1 + src/include/utils/backend_progress.h | 2 +- 
src/test/regress/expected/cluster.out | 137 +++- src/test/regress/expected/rules.out | 72 +- src/test/regress/sql/cluster.sql | 72 +- src/tools/pgindent/typedefs.list | 2 + 26 files changed, 1650 insertions(+), 542 deletions(-) create mode 100644 doc/src/sgml/ref/repack.sgml diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index b3d5355068801..cc014564c9704 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -413,6 +413,14 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser + + pg_stat_progress_repackpg_stat_progress_repack + One row for each backend running + REPACK, showing current progress. See + . + + + pg_stat_progress_basebackuppg_stat_progress_basebackup One row for each WAL sender process streaming a base backup, @@ -5796,9 +5804,9 @@ FROM pg_stat_get_backend_idset() AS backendid; PostgreSQL has the ability to report the progress of certain commands during command execution. Currently, the only commands which support progress reporting are ANALYZE, - CLUSTER, - CREATE INDEX, VACUUM, - COPY, + COPY, CREATE INDEX, + REPACK (and its obsolete spelling CLUSTER), + VACUUM, and (i.e., replication command that issues to take a base backup). @@ -6731,6 +6739,218 @@ FROM pg_stat_get_backend_idset() AS backendid; + + REPACK Progress Reporting + + + pg_stat_progress_repack + + + + Whenever REPACK is running, + the pg_stat_progress_repack view will contain a + row for each backend that is currently running the command. The tables + below describe the information that will be reported and provide + information about how to interpret it. + + + + <structname>pg_stat_progress_repack</structname> View + + + + + Column Type + + + Description + + + + + + + + pid integer + + + Process ID of backend. + + + + + + datid oid + + + OID of the database to which this backend is connected. + + + + + + datname name + + + Name of the database to which this backend is connected. 
+ + + + + + relid oid + + + OID of the table being repacked. + + + + + + phase text + + + Current processing phase. See . + + + + + + repack_index_relid oid + + + If the table is being scanned using an index, this is the OID of the + index being used; otherwise, it is zero. + + + + + + heap_tuples_scanned bigint + + + Number of heap tuples scanned. + This counter only advances when the phase is + seq scanning heap, + index scanning heap + or writing new heap. + + + + + + heap_tuples_written bigint + + + Number of heap tuples written. + This counter only advances when the phase is + seq scanning heap, + index scanning heap + or writing new heap. + + + + + + heap_blks_total bigint + + + Total number of heap blocks in the table. This number is reported + as of the beginning of seq scanning heap. + + + + + + heap_blks_scanned bigint + + + Number of heap blocks scanned. This counter only advances when the + phase is seq scanning heap. + + + + + + index_rebuild_count bigint + + + Number of indexes rebuilt. This counter only advances when the phase + is rebuilding index. + + + + +
+ + + REPACK Phases + + + + + + Phase + Description + + + + + + initializing + + The command is preparing to begin scanning the heap. This phase is + expected to be very brief. + + + + seq scanning heap + + The command is currently scanning the table using a sequential scan. + + + + index scanning heap + + REPACK is currently scanning the table using an index scan. + + + + sorting tuples + + REPACK is currently sorting tuples. + + + + writing new heap + + REPACK is currently writing the new heap. + + + + swapping relation files + + The command is currently swapping newly-built files into place. + + + + rebuilding index + + The command is currently rebuilding an index. + + + + performing final cleanup + + The command is performing final cleanup. When this phase is + completed, REPACK will end. + + + + +
+
+ VACUUM Progress Reporting diff --git a/doc/src/sgml/ref/allfiles.sgml b/doc/src/sgml/ref/allfiles.sgml index e167406c74490..141ada9c50a05 100644 --- a/doc/src/sgml/ref/allfiles.sgml +++ b/doc/src/sgml/ref/allfiles.sgml @@ -167,6 +167,7 @@ Complete list of usable sgml source files in this directory. + diff --git a/doc/src/sgml/ref/cluster.sgml b/doc/src/sgml/ref/cluster.sgml index 0b47460080b92..17778e9471c42 100644 --- a/doc/src/sgml/ref/cluster.sgml +++ b/doc/src/sgml/ref/cluster.sgml @@ -33,50 +33,9 @@ CLUSTER [ ( option [, ...] ) ] [ Description - CLUSTER instructs PostgreSQL - to cluster the table specified - by table_name - based on the index specified by - index_name. The index must - already have been defined on - table_name. - - - - When a table is clustered, it is physically reordered - based on the index information. Clustering is a one-time operation: - when the table is subsequently updated, the changes are - not clustered. That is, no attempt is made to store new or - updated rows according to their index order. (If one wishes, one can - periodically recluster by issuing the command again. Also, setting - the table's fillfactor storage parameter to less than - 100% can aid in preserving cluster ordering during updates, since updated - rows are kept on the same page if enough space is available there.) - - - - When a table is clustered, PostgreSQL - remembers which index it was clustered by. The form - CLUSTER table_name - reclusters the table using the same index as before. You can also - use the CLUSTER or SET WITHOUT CLUSTER - forms of ALTER TABLE to set the index to be used for - future cluster operations, or to clear any previous setting. - - - - CLUSTER without a - table_name reclusters all the - previously-clustered tables in the current database that the calling user - has privileges for. This form of CLUSTER cannot be - executed inside a transaction block. - - - - When a table is being clustered, an ACCESS - EXCLUSIVE lock is acquired on it. 
This prevents any other - database operations (both reads and writes) from operating on the - table until the CLUSTER is finished. + The CLUSTER command is equivalent to + with an USING INDEX + clause. See there for more details. @@ -136,63 +95,12 @@ CLUSTER [ ( option [, ...] ) ] [ - - In cases where you are accessing single rows randomly - within a table, the actual order of the data in the - table is unimportant. However, if you tend to access some - data more than others, and there is an index that groups - them together, you will benefit from using CLUSTER. - If you are requesting a range of indexed values from a table, or a - single indexed value that has multiple rows that match, - CLUSTER will help because once the index identifies the - table page for the first row that matches, all other rows - that match are probably already on the same table page, - and so you save disk accesses and speed up the query. - - - - CLUSTER can re-sort the table using either an index scan - on the specified index, or (if the index is a b-tree) a sequential - scan followed by sorting. It will attempt to choose the method that - will be faster, based on planner cost parameters and available statistical - information. - - While CLUSTER is running, the is temporarily changed to pg_catalog, pg_temp. - - When an index scan is used, a temporary copy of the table is created that - contains the table data in the index order. Temporary copies of each - index on the table are created as well. Therefore, you need free space on - disk at least equal to the sum of the table size and the index sizes. - - - - When a sequential scan and sort is used, a temporary sort file is - also created, so that the peak temporary space requirement is as much - as double the table size, plus the index sizes. This method is often - faster than the index scan method, but if the disk space requirement is - intolerable, you can disable this choice by temporarily setting to off. 
- - - - It is advisable to set to - a reasonably large value (but not more than the amount of RAM you can - dedicate to the CLUSTER operation) before clustering. - - - - Because the planner records statistics about the ordering of - tables, it is advisable to run ANALYZE - on the newly clustered table. - Otherwise, the planner might make poor choices of query plans. - - Because CLUSTER remembers which indexes are clustered, one can cluster the tables one wants clustered manually the first time, @@ -270,6 +178,7 @@ CLUSTER index_name ON See Also + diff --git a/doc/src/sgml/ref/repack.sgml b/doc/src/sgml/ref/repack.sgml new file mode 100644 index 0000000000000..8ccf7c7a417b5 --- /dev/null +++ b/doc/src/sgml/ref/repack.sgml @@ -0,0 +1,330 @@ + + + + + REPACK + + + + REPACK + 7 + SQL - Language Statements + + + + REPACK + rewrite a table to reclaim disk space + + + + +REPACK [ ( option [, ...] ) ] [ table_and_columns [ USING INDEX [ index_name ] ] ] +REPACK [ ( option [, ...] ) ] USING INDEX + +where option can be one of: + + VERBOSE [ boolean ] + ANALYZE [ boolean ] + +and table_and_columns is: + + table_name [ ( column_name [, ...] ) ] + + + + + Description + + + REPACK reclaims storage occupied by dead + tuples. Unlike VACUUM, it does so by rewriting the + entire contents of the table specified + by table_name into a new disk + file with no extra space (except for the space guaranteed by + the fillfactor storage parameter), allowing unused space + to be returned to the operating system. + + + + Without + a table_name, REPACK + processes every table and materialized view in the current database that + the current user has the MAINTAIN privilege on. This + form of REPACK cannot be executed inside a transaction + block. + + + + If a USING INDEX clause is specified, the rows are + physically reordered based on information from an index. Please see the + notes on clustering below. + + + + When a table is being repacked, an ACCESS EXCLUSIVE lock + is acquired on it. 
This prevents any other database operations (both reads + and writes) from operating on the table until the REPACK + is finished. + + + + Notes on Clustering + + + If the USING INDEX clause is specified, the rows in + the table are stored in the order that the index specifies; + clustering, because rows are physically clustered + afterwards. + If an index name is specified in the command, the order implied by that + index is used, and that index is configured as the index to cluster on. + (This also applies to an index given to the CLUSTER + command.) + If no index name is specified, then the index that has + been configured as the index to cluster on is used; an + error is thrown if none has. + An index can be set manually using ALTER TABLE ... CLUSTER ON, + and reset with ALTER TABLE ... SET WITHOUT CLUSTER. + + + + If no table name is specified in REPACK USING INDEX, + all tables which have a clustering index defined and which the calling + user has privileges for are processed. + + + + Clustering is a one-time operation: when the table is + subsequently updated, the changes are not clustered. That is, no attempt + is made to store new or updated rows according to their index order. (If + one wishes, one can periodically recluster by issuing the command again. + Also, setting the table's fillfactor storage parameter + to less than 100% can aid in preserving cluster ordering during updates, + since updated rows are kept on the same page if enough space is available + there.) + + + + In cases where you are accessing single rows randomly within a table, the + actual order of the data in the table is unimportant. However, if you tend + to access some data more than others, and there is an index that groups + them together, you will benefit from using clustering. 
If + you are requesting a range of indexed values from a table, or a single + indexed value that has multiple rows that match, + clustering will help because once the index identifies the + table page for the first row that matches, all other rows that match are + probably already on the same table page, and so you save disk accesses and + speed up the query. + + + + REPACK can re-sort the table using either an index scan + on the specified index (if the index is a b-tree), or a sequential scan + followed by sorting. It will attempt to choose the method that will be + faster, based on planner cost parameters and available statistical + information. + + + + Because the planner records statistics about the ordering of tables, it is + advisable to + run ANALYZE on the + newly repacked table. Otherwise, the planner might make poor choices of + query plans. + + + + + Notes on Resources + + + When an index scan or a sequential scan without sort is used, a temporary + copy of the table is created that contains the table data in the index + order. Temporary copies of each index on the table are created as well. + Therefore, you need free space on disk at least equal to the sum of the + table size and the index sizes. + + + + When a sequential scan and sort is used, a temporary sort file is also + created, so that the peak temporary space requirement is as much as double + the table size, plus the index sizes. This method is often faster than + the index scan method, but if the disk space requirement is intolerable, + you can disable this choice by temporarily setting + to off. + + + + It is advisable to set to a + reasonably large value (but not more than the amount of RAM you can + dedicate to the REPACK operation) before repacking. + + + + + + + Parameters + + + + table_name + + + The name (possibly schema-qualified) of a table. + + + + + + column_name + + + The name of a specific column to analyze. Defaults to all columns. 
+ If a column list is specified, ANALYZE must also + be specified. + + + + + + index_name + + + The name of an index. + + + + + + VERBOSE + + + Prints a progress report as each table is repacked + at INFO level. + + + + + + ANALYZE + ANALYSE + + + Applies on the table after repacking. This is + currently only supported when a single (non-partitioned) table is specified. + + + + + + boolean + + + Specifies whether the selected option should be turned on or off. + You can write TRUE, ON, or + 1 to enable the option, and FALSE, + OFF, or 0 to disable it. The + boolean value can also + be omitted, in which case TRUE is assumed. + + + + + + + + Notes + + + To repack a table, one must have the MAINTAIN privilege + on the table. + + + + While REPACK is running, the is temporarily changed to pg_catalog, + pg_temp. + + + + Each backend running REPACK will report its progress + in the pg_stat_progress_repack view. See + for details. + + + + Repacking a partitioned table repacks each of its partitions. If an index + is specified, each partition is repacked using the partition of that + index. REPACK on a partitioned table cannot be executed + inside a transaction block. 
+ + + + + + Examples + + + Repack the table employees: + +REPACK employees; + + + + + Repack the table employees on the basis of its + index employees_ind (Since index is used here, this is + effectively clustering): + +REPACK employees USING INDEX employees_ind; + + + + + Repack the table cases on physical ordering, + running an ANALYZE on the given columns once + repacking is done, showing informational messages: + +REPACK (ANALYZE, VERBOSE) cases (district, case_nr); + + + + + Repack all tables in the database on which you have + the MAINTAIN privilege: + +REPACK; + + + + + Repack all tables for which a clustering index has previously been + configured on which you have the MAINTAIN privilege, + showing informational messages: + +REPACK (VERBOSE) USING INDEX; + + + + + + + Compatibility + + + There is no REPACK statement in the SQL standard. + + + + + See Also + + + + + + + diff --git a/doc/src/sgml/ref/vacuum.sgml b/doc/src/sgml/ref/vacuum.sgml index 6d0fdd43cfb31..ac5d083d468e6 100644 --- a/doc/src/sgml/ref/vacuum.sgml +++ b/doc/src/sgml/ref/vacuum.sgml @@ -25,7 +25,6 @@ VACUUM [ ( option [, ...] ) ] [ where option can be one of: - FULL [ boolean ] FREEZE [ boolean ] VERBOSE [ boolean ] ANALYZE [ boolean ] @@ -39,6 +38,7 @@ VACUUM [ ( option [, ...] ) ] [ boolean ] ONLY_DATABASE_STATS [ boolean ] BUFFER_USAGE_LIMIT size + FULL [ boolean ] and table_and_columns is: @@ -95,20 +95,6 @@ VACUUM [ ( option [, ...] ) ] [ Parameters - - FULL - - - Selects full vacuum, which can reclaim more - space, but takes much longer and exclusively locks the table. - This method also requires extra disk space, since it writes a - new copy of the table and doesn't release the old copy until - the operation is complete. Usually this should only be used when a - significant amount of space needs to be reclaimed from within the table. - - - - FREEZE @@ -362,6 +348,23 @@ VACUUM [ ( option [, ...] 
) ] [ + + FULL + + + This option, which is deprecated, makes VACUUM + behave like REPACK without a + USING INDEX clause. + This method of compacting the table takes much longer than + VACUUM and exclusively locks the table. + This method also requires extra disk space, since it writes a + new copy of the table and doesn't release the old copy until + the operation is complete. Usually this should only be used when a + significant amount of space needs to be reclaimed from within the table. + + + + boolean diff --git a/doc/src/sgml/reference.sgml b/doc/src/sgml/reference.sgml index 2cf02c37b17bd..d9fdbb5d254cc 100644 --- a/doc/src/sgml/reference.sgml +++ b/doc/src/sgml/reference.sgml @@ -195,6 +195,7 @@ &refreshMaterializedView; &reindex; &releaseSavepoint; + &repack; &reset; &revoke; &rollback; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 3ff36f59bf869..5137d2510ea4c 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -741,13 +741,13 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, if (OldIndex != NULL && !use_sort) { const int ci_index[] = { - PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_INDEX_RELID + PROGRESS_REPACK_PHASE, + PROGRESS_REPACK_INDEX_RELID }; int64 ci_val[2]; /* Set phase and OIDOldIndex to columns */ - ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP; + ci_val[0] = PROGRESS_REPACK_PHASE_INDEX_SCAN_HEAP; ci_val[1] = RelationGetRelid(OldIndex); pgstat_progress_update_multi_param(2, ci_index, ci_val); @@ -759,15 +759,15 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, else { /* In scan-and-sort mode and also VACUUM FULL, set phase */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP); + pgstat_progress_update_param(PROGRESS_REPACK_PHASE, + PROGRESS_REPACK_PHASE_SEQ_SCAN_HEAP); tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL); heapScan = 
(HeapScanDesc) tableScan; indexScan = NULL; /* Set total heap blocks */ - pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS, + pgstat_progress_update_param(PROGRESS_REPACK_TOTAL_HEAP_BLKS, heapScan->rs_nblocks); } @@ -809,7 +809,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, * is manually updated to the correct value when the table * scan finishes. */ - pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED, + pgstat_progress_update_param(PROGRESS_REPACK_HEAP_BLKS_SCANNED, heapScan->rs_nblocks); break; } @@ -825,7 +825,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, */ if (prev_cblock != heapScan->rs_cblock) { - pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED, + pgstat_progress_update_param(PROGRESS_REPACK_HEAP_BLKS_SCANNED, (heapScan->rs_cblock + heapScan->rs_nblocks - heapScan->rs_startblock @@ -926,14 +926,14 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, * In scan-and-sort mode, report increase in number of tuples * scanned */ - pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, + pgstat_progress_update_param(PROGRESS_REPACK_HEAP_TUPLES_SCANNED, *num_tuples); } else { const int ct_index[] = { - PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, - PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN + PROGRESS_REPACK_HEAP_TUPLES_SCANNED, + PROGRESS_REPACK_HEAP_TUPLES_WRITTEN }; int64 ct_val[2]; @@ -966,14 +966,14 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, double n_tuples = 0; /* Report that we are now sorting tuples */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_PHASE_SORT_TUPLES); + pgstat_progress_update_param(PROGRESS_REPACK_PHASE, + PROGRESS_REPACK_PHASE_SORT_TUPLES); tuplesort_performsort(tuplesort); /* Report that we are now writing new heap */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP); + pgstat_progress_update_param(PROGRESS_REPACK_PHASE, + 
PROGRESS_REPACK_PHASE_WRITE_NEW_HEAP); for (;;) { @@ -991,7 +991,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, values, isnull, rwstate); /* Report n_tuples */ - pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN, + pgstat_progress_update_param(PROGRESS_REPACK_HEAP_TUPLES_WRITTEN, n_tuples); } diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 43de42ce39e28..5ee6389d39c47 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -4077,7 +4077,7 @@ reindex_relation(const ReindexStmt *stmt, Oid relid, int flags, Assert(!ReindexIsProcessingIndex(indexOid)); /* Set index rebuild count */ - pgstat_progress_update_param(PROGRESS_CLUSTER_INDEX_REBUILD_COUNT, + pgstat_progress_update_param(PROGRESS_REPACK_INDEX_REBUILD_COUNT, i); i++; } diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index ecb7c996e8646..339c016e510c7 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1311,14 +1311,15 @@ CREATE VIEW pg_stat_progress_vacuum AS FROM pg_stat_get_progress_info('VACUUM') AS S LEFT JOIN pg_database D ON S.datid = D.oid; -CREATE VIEW pg_stat_progress_cluster AS +CREATE VIEW pg_stat_progress_repack AS SELECT S.pid AS pid, S.datid AS datid, D.datname AS datname, S.relid AS relid, CASE S.param1 WHEN 1 THEN 'CLUSTER' - WHEN 2 THEN 'VACUUM FULL' + WHEN 2 THEN 'REPACK' + WHEN 3 THEN 'VACUUM FULL' END AS command, CASE S.param2 WHEN 0 THEN 'initializing' WHEN 1 THEN 'seq scanning heap' @@ -1329,15 +1330,35 @@ CREATE VIEW pg_stat_progress_cluster AS WHEN 6 THEN 'rebuilding index' WHEN 7 THEN 'performing final cleanup' END AS phase, - CAST(S.param3 AS oid) AS cluster_index_relid, + CAST(S.param3 AS oid) AS repack_index_relid, S.param4 AS heap_tuples_scanned, S.param5 AS heap_tuples_written, S.param6 AS heap_blks_total, S.param7 AS heap_blks_scanned, S.param8 AS index_rebuild_count - FROM 
pg_stat_get_progress_info('CLUSTER') AS S + FROM pg_stat_get_progress_info('REPACK') AS S LEFT JOIN pg_database D ON S.datid = D.oid; +-- This view is as the one above, except for renaming a column and avoiding +-- 'REPACK' as a command name to report. +CREATE VIEW pg_stat_progress_cluster AS + SELECT + pid, + datid, + datname, + relid, + CASE WHEN command IN ('CLUSTER', 'VACUUM FULL') THEN command + WHEN repack_index_relid = 0 THEN 'VACUUM FULL' + ELSE 'CLUSTER' END AS command, + phase, + repack_index_relid AS cluster_index_relid, + heap_tuples_scanned, + heap_tuples_written, + heap_blks_total, + heap_blks_scanned, + index_rebuild_count + FROM pg_stat_progress_repack; + CREATE VIEW pg_stat_progress_create_index AS SELECT S.pid AS pid, S.datid AS datid, D.datname AS datname, diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 60a4617a5853f..3bfaa6636997e 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -1,9 +1,8 @@ /*------------------------------------------------------------------------- * * cluster.c - * CLUSTER a table on an index. This is now also used for VACUUM FULL. - * - * There is hardly anything left of Paul Brown's original implementation... + * REPACK a table; formerly known as CLUSTER. VACUUM FULL also uses + * parts of this code. 
* * * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group @@ -67,27 +66,35 @@ typedef struct Oid indexOid; } RelToCluster; - -static void cluster_multiple_rels(List *rtcs, ClusterParams *params); +static bool cluster_rel_recheck(RepackCommand cmd, Relation OldHeap, + Oid indexOid, Oid userid, int options); static void rebuild_relation(Relation OldHeap, Relation index, bool verbose); static void copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verbose, bool *pSwapToastByContent, TransactionId *pFreezeXid, MultiXactId *pCutoffMulti); -static List *get_tables_to_cluster(MemoryContext cluster_context); -static List *get_tables_to_cluster_partitioned(MemoryContext cluster_context, - Oid indexOid); -static bool cluster_is_permitted_for_relation(Oid relid, Oid userid); +static List *get_tables_to_repack(RepackCommand cmd, bool usingindex, + MemoryContext permcxt); +static List *get_tables_to_repack_partitioned(RepackCommand cmd, + Oid relid, bool rel_is_index, + MemoryContext permcxt); +static bool repack_is_permitted_for_relation(RepackCommand cmd, + Oid relid, Oid userid); +static Relation process_single_relation(RepackStmt *stmt, + ClusterParams *params); +static Oid determine_clustered_index(Relation rel, bool usingindex, + const char *indexname); +static const char *RepackCommandAsString(RepackCommand cmd); -/*--------------------------------------------------------------------------- - * This cluster code allows for clustering multiple tables at once. Because +/* + * The repack code allows for processing multiple tables at once. Because * of this, we cannot just run everything on a single transaction, or we * would be forced to acquire exclusive locks on all the tables being * clustered, simultaneously --- very likely leading to deadlock. * - * To solve this we follow a similar strategy to VACUUM code, - * clustering each relation in a separate transaction. 
For this to work, - * we need to: + * To solve this we follow a similar strategy to VACUUM code, processing each + * relation in a separate transaction. For this to work, we need to: + * * - provide a separate memory context so that we can pass information in * a way that survives across transactions * - start a new transaction every time a new relation is clustered @@ -98,197 +105,177 @@ static bool cluster_is_permitted_for_relation(Oid relid, Oid userid); * * The single-relation case does not have any such overhead. * - * We also allow a relation to be specified without index. In that case, - * the indisclustered bit will be looked up, and an ERROR will be thrown - * if there is no index with the bit set. - *--------------------------------------------------------------------------- + * We also allow a relation to be repacked following an index, but without + * naming a specific one. In that case, the indisclustered bit will be + * looked up, and an ERROR will be thrown if no so-marked index is found. */ void -cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel) +ExecRepack(ParseState *pstate, RepackStmt *stmt, bool isTopLevel) { - ListCell *lc; ClusterParams params = {0}; - bool verbose = false; Relation rel = NULL; - Oid indexOid = InvalidOid; - MemoryContext cluster_context; + MemoryContext repack_context; List *rtcs; /* Parse option list */ - foreach(lc, stmt->params) + foreach_node(DefElem, opt, stmt->params) { - DefElem *opt = (DefElem *) lfirst(lc); - if (strcmp(opt->defname, "verbose") == 0) - verbose = defGetBoolean(opt); + params.options |= defGetBoolean(opt) ? CLUOPT_VERBOSE : 0; + else if (strcmp(opt->defname, "analyze") == 0 || + strcmp(opt->defname, "analyse") == 0) + params.options |= defGetBoolean(opt) ? 
CLUOPT_ANALYZE : 0; else ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("unrecognized %s option \"%s\"", - "CLUSTER", opt->defname), - parser_errposition(pstate, opt->location))); + errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized %s option \"%s\"", + RepackCommandAsString(stmt->command), + opt->defname), + parser_errposition(pstate, opt->location)); } - params.options = (verbose ? CLUOPT_VERBOSE : 0); - + /* + * If a single relation is specified, process it and we're done ... unless + * the relation is a partitioned table, in which case we fall through. + */ if (stmt->relation != NULL) { - /* This is the single-relation case. */ - Oid tableOid; - - /* - * Find, lock, and check permissions on the table. We obtain - * AccessExclusiveLock right away to avoid lock-upgrade hazard in the - * single-transaction case. - */ - tableOid = RangeVarGetRelidExtended(stmt->relation, - AccessExclusiveLock, - 0, - RangeVarCallbackMaintainsTable, - NULL); - rel = table_open(tableOid, NoLock); - - /* - * Reject clustering a remote temp table ... their local buffer - * manager is not going to cope. - */ - if (RELATION_IS_OTHER_TEMP(rel)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot cluster temporary tables of other sessions"))); - - if (stmt->indexname == NULL) - { - ListCell *index; - - /* We need to find the index that has indisclustered set. */ - foreach(index, RelationGetIndexList(rel)) - { - indexOid = lfirst_oid(index); - if (get_index_isclustered(indexOid)) - break; - indexOid = InvalidOid; - } - - if (!OidIsValid(indexOid)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("there is no previously clustered index for table \"%s\"", - stmt->relation->relname))); - } - else - { - /* - * The index is expected to be in the same namespace as the - * relation. 
- */ - indexOid = get_relname_relid(stmt->indexname, - rel->rd_rel->relnamespace); - if (!OidIsValid(indexOid)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("index \"%s\" for table \"%s\" does not exist", - stmt->indexname, stmt->relation->relname))); - } - - /* For non-partitioned tables, do what we came here to do. */ - if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) - { - cluster_rel(rel, indexOid, ¶ms); - /* cluster_rel closes the relation, but keeps lock */ - - return; - } + rel = process_single_relation(stmt, ¶ms); + if (rel == NULL) + return; /* all done */ } + /* + * Don't allow ANALYZE in the multiple-relation case for now. Maybe we + * can add support for this later. + */ + if (params.options & CLUOPT_ANALYZE) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot execute %s on multiple tables", + "REPACK (ANALYZE)")); + /* * By here, we know we are in a multi-table situation. In order to avoid * holding locks for too long, we want to process each table in its own * transaction. This forces us to disallow running inside a user * transaction block. */ - PreventInTransactionBlock(isTopLevel, "CLUSTER"); + PreventInTransactionBlock(isTopLevel, RepackCommandAsString(stmt->command)); /* Also, we need a memory context to hold our list of relations */ - cluster_context = AllocSetContextCreate(PortalContext, - "Cluster", - ALLOCSET_DEFAULT_SIZES); + repack_context = AllocSetContextCreate(PortalContext, + "Repack", + ALLOCSET_DEFAULT_SIZES); + + params.options |= CLUOPT_RECHECK; /* - * Either we're processing a partitioned table, or we were not given any - * table name at all. In either case, obtain a list of relations to - * process. - * - * In the former case, an index name must have been given, so we don't - * need to recheck its "indisclustered" bit, but we have to check that it - * is an index that we can cluster on. In the latter case, we set the - * option bit to have indisclustered verified. 
- * - * Rechecking the relation itself is necessary here in all cases. + * If we don't have a relation yet, determine a relation list. If we do, + * then it must be a partitioned table, and we want to process its + * partitions. */ - params.options |= CLUOPT_RECHECK; - if (rel != NULL) + if (rel == NULL) { - Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); - check_index_is_clusterable(rel, indexOid, AccessShareLock); - rtcs = get_tables_to_cluster_partitioned(cluster_context, indexOid); - - /* close relation, releasing lock on parent table */ - table_close(rel, AccessExclusiveLock); + Assert(stmt->indexname == NULL); + rtcs = get_tables_to_repack(stmt->command, stmt->usingindex, + repack_context); + params.options |= CLUOPT_RECHECK_ISCLUSTERED; } else { - rtcs = get_tables_to_cluster(cluster_context); - params.options |= CLUOPT_RECHECK_ISCLUSTERED; - } + Oid relid; + bool rel_is_index; - /* Do the job. */ - cluster_multiple_rels(rtcs, ¶ms); + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); - /* Start a new transaction for the cleanup work. */ - StartTransactionCommand(); + /* + * If USING INDEX was specified, resolve the index name now and pass + * it down. + */ + if (stmt->usingindex) + { + /* + * If no index name was specified when repacking a partitioned + * table, punt for now. Maybe we can improve this later. + */ + if (!stmt->indexname) + { + if (stmt->command == REPACK_COMMAND_CLUSTER) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("there is no previously clustered index for table \"%s\"", + RelationGetRelationName(rel))); + else + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + /*- translator: first %s is name of a SQL command, eg. 
REPACK */ + errmsg("cannot execute %s on partitioned table \"%s\" USING INDEX with no index name", + RepackCommandAsString(stmt->command), + RelationGetRelationName(rel))); + } - /* Clean up working storage */ - MemoryContextDelete(cluster_context); -} + relid = determine_clustered_index(rel, stmt->usingindex, + stmt->indexname); + if (!OidIsValid(relid)) + elog(ERROR, "unable to determine index to cluster on"); + check_index_is_clusterable(rel, relid, AccessExclusiveLock); -/* - * Given a list of relations to cluster, process each of them in a separate - * transaction. - * - * We expect to be in a transaction at start, but there isn't one when we - * return. - */ -static void -cluster_multiple_rels(List *rtcs, ClusterParams *params) -{ - ListCell *lc; + rel_is_index = true; + } + else + { + relid = RelationGetRelid(rel); + rel_is_index = false; + } + + rtcs = get_tables_to_repack_partitioned(stmt->command, + relid, rel_is_index, + repack_context); + + /* close parent relation, releasing lock on it */ + table_close(rel, AccessExclusiveLock); + rel = NULL; + } /* Commit to get out of starting transaction */ PopActiveSnapshot(); CommitTransactionCommand(); /* Cluster the tables, each in a separate transaction */ - foreach(lc, rtcs) + Assert(rel == NULL); + foreach_ptr(RelToCluster, rtc, rtcs) { - RelToCluster *rtc = (RelToCluster *) lfirst(lc); - Relation rel; - /* Start a new transaction for each relation. */ StartTransactionCommand(); + /* + * Open the target table, coping with the case where it has been + * dropped. 
+ */ + rel = try_table_open(rtc->tableOid, AccessExclusiveLock); + if (rel == NULL) + { + CommitTransactionCommand(); + continue; + } + /* functions in indexes may want a snapshot set */ PushActiveSnapshot(GetTransactionSnapshot()); - rel = table_open(rtc->tableOid, AccessExclusiveLock); - /* Process this table */ - cluster_rel(rel, rtc->indexOid, params); + cluster_rel(stmt->command, rel, rtc->indexOid, ¶ms); /* cluster_rel closes the relation, but keeps lock */ PopActiveSnapshot(); CommitTransactionCommand(); } + + /* Start a new transaction for the cleanup work. */ + StartTransactionCommand(); + + /* Clean up working storage */ + MemoryContextDelete(repack_context); } /* @@ -304,11 +291,14 @@ cluster_multiple_rels(List *rtcs, ClusterParams *params) * them incrementally while we load the table. * * If indexOid is InvalidOid, the table will be rewritten in physical order - * instead of index order. This is the new implementation of VACUUM FULL, - * and error messages should refer to the operation as VACUUM not CLUSTER. + * instead of index order. + * + * 'cmd' indicates which command is being executed, to be used for error + * messages. */ void -cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) +cluster_rel(RepackCommand cmd, Relation OldHeap, Oid indexOid, + ClusterParams *params) { Oid tableOid = RelationGetRelid(OldHeap); Oid save_userid; @@ -323,13 +313,8 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) /* Check for user-requested abort. 
*/ CHECK_FOR_INTERRUPTS(); - pgstat_progress_start_command(PROGRESS_COMMAND_CLUSTER, tableOid); - if (OidIsValid(indexOid)) - pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND, - PROGRESS_CLUSTER_COMMAND_CLUSTER); - else - pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND, - PROGRESS_CLUSTER_COMMAND_VACUUM_FULL); + pgstat_progress_start_command(PROGRESS_COMMAND_REPACK, tableOid); + pgstat_progress_update_param(PROGRESS_REPACK_COMMAND, cmd); /* * Switch to the table owner's userid, so that any index functions are run @@ -350,86 +335,40 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) * *must* skip the one on indisclustered since it would reject an attempt * to cluster a not-previously-clustered index. */ - if (recheck) - { - /* Check that the user still has privileges for the relation */ - if (!cluster_is_permitted_for_relation(tableOid, save_userid)) - { - relation_close(OldHeap, AccessExclusiveLock); - goto out; - } - - /* - * Silently skip a temp table for a remote session. Only doing this - * check in the "recheck" case is appropriate (which currently means - * somebody is executing a database-wide CLUSTER or on a partitioned - * table), because there is another check in cluster() which will stop - * any attempt to cluster remote temp tables by name. There is - * another check in cluster_rel which is redundant, but we leave it - * for extra safety. - */ - if (RELATION_IS_OTHER_TEMP(OldHeap)) - { - relation_close(OldHeap, AccessExclusiveLock); - goto out; - } - - if (OidIsValid(indexOid)) - { - /* - * Check that the index still exists - */ - if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid))) - { - relation_close(OldHeap, AccessExclusiveLock); - goto out; - } - - /* - * Check that the index is still the one with indisclustered set, - * if needed. 
- */ - if ((params->options & CLUOPT_RECHECK_ISCLUSTERED) != 0 && - !get_index_isclustered(indexOid)) - { - relation_close(OldHeap, AccessExclusiveLock); - goto out; - } - } - } + if (recheck && + !cluster_rel_recheck(cmd, OldHeap, indexOid, save_userid, + params->options)) + goto out; /* - * We allow VACUUM FULL, but not CLUSTER, on shared catalogs. CLUSTER - * would work in most respects, but the index would only get marked as - * indisclustered in the current database, leading to unexpected behavior - * if CLUSTER were later invoked in another database. + * We allow repacking shared catalogs only when not using an index. It + * would work to use an index in most respects, but the index would only + * get marked as indisclustered in the current database, leading to + * unexpected behavior if CLUSTER were later invoked in another database. */ if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared) ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot cluster a shared catalog"))); + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /*- translator: first %s is name of a SQL command, eg. REPACK */ + errmsg("cannot execute %s on a shared catalog", + RepackCommandAsString(cmd))); /* * Don't process temp tables of other backends ... their local buffer * manager is not going to cope. */ if (RELATION_IS_OTHER_TEMP(OldHeap)) - { - if (OidIsValid(indexOid)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot cluster temporary tables of other sessions"))); - else - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot vacuum temporary tables of other sessions"))); - } + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /*- translator: first %s is name of a SQL command, eg. 
REPACK */ + errmsg("cannot execute %s on temporary tables of other sessions", + RepackCommandAsString(cmd))); /* * Also check for active uses of the relation in the current transaction, * including open scans and pending AFTER trigger events. */ - CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM"); + CheckTableNotInUse(OldHeap, RepackCommandAsString(cmd)); /* Check heap and index are valid to cluster on */ if (OidIsValid(indexOid)) @@ -442,6 +381,24 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) else index = NULL; + /* + * When allow_system_table_mods is turned off, we disallow repacking a + * catalog on a particular index unless that's already the clustered index + * for that catalog. + * + * XXX We don't check for this in CLUSTER, because it's historically been + * allowed. + */ + if (cmd != REPACK_COMMAND_CLUSTER && + !allowSystemTableMods && OidIsValid(indexOid) && + IsCatalogRelation(OldHeap) && !index->rd_index->indisclustered) + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied: \"%s\" is a system catalog", + RelationGetRelationName(OldHeap)), + errdetail("System catalogs can only be clustered by the index they're already clustered on, if any, unless \"%s\" is enabled.", + "allow_system_table_mods")); + /* * Quietly ignore the request if this is a materialized view which has not * been populated from its query. No harm is done because there is no data @@ -482,6 +439,63 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) pgstat_progress_end_command(); } +/* + * Check if the table (and its index) still meets the requirements of + * cluster_rel(). 
+ */ +static bool +cluster_rel_recheck(RepackCommand cmd, Relation OldHeap, Oid indexOid, + Oid userid, int options) +{ + Oid tableOid = RelationGetRelid(OldHeap); + + /* Check that the user still has privileges for the relation */ + if (!repack_is_permitted_for_relation(cmd, tableOid, userid)) + { + relation_close(OldHeap, AccessExclusiveLock); + return false; + } + + /* + * Silently skip a temp table for a remote session. Only doing this check + * in the "recheck" case is appropriate (which currently means somebody is + * executing a database-wide CLUSTER or on a partitioned table), because + * there is another check in cluster() which will stop any attempt to + * cluster remote temp tables by name. There is another check in + * cluster_rel which is redundant, but we leave it for extra safety. + */ + if (RELATION_IS_OTHER_TEMP(OldHeap)) + { + relation_close(OldHeap, AccessExclusiveLock); + return false; + } + + if (OidIsValid(indexOid)) + { + /* + * Check that the index still exists + */ + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid))) + { + relation_close(OldHeap, AccessExclusiveLock); + return false; + } + + /* + * Check that the index is still the one with indisclustered set, if + * needed. 
+ */ + if ((options & CLUOPT_RECHECK_ISCLUSTERED) != 0 && + !get_index_isclustered(indexOid)) + { + relation_close(OldHeap, AccessExclusiveLock); + return false; + } + } + + return true; +} + /* * Verify that the specified heap and index are valid to cluster on * @@ -642,8 +656,8 @@ rebuild_relation(Relation OldHeap, Relation index, bool verbose) Assert(CheckRelationLockedByMe(OldHeap, AccessExclusiveLock, false) && (index == NULL || CheckRelationLockedByMe(index, AccessExclusiveLock, false))); - if (index) - /* Mark the correct index as clustered */ + /* for CLUSTER or REPACK USING INDEX, mark the index as the one to use */ + if (index != NULL) mark_index_clustered(OldHeap, RelationGetRelid(index), true); /* Remember info about rel before closing OldHeap */ @@ -958,20 +972,20 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb /* Log what we're doing */ if (OldIndex != NULL && !use_sort) ereport(elevel, - (errmsg("clustering \"%s.%s\" using index scan on \"%s\"", - nspname, - RelationGetRelationName(OldHeap), - RelationGetRelationName(OldIndex)))); + errmsg("repacking \"%s.%s\" using index scan on \"%s\"", + nspname, + RelationGetRelationName(OldHeap), + RelationGetRelationName(OldIndex))); else if (use_sort) ereport(elevel, - (errmsg("clustering \"%s.%s\" using sequential scan and sort", - nspname, - RelationGetRelationName(OldHeap)))); + errmsg("repacking \"%s.%s\" using sequential scan and sort", + nspname, + RelationGetRelationName(OldHeap))); else ereport(elevel, - (errmsg("vacuuming \"%s.%s\"", - nspname, - RelationGetRelationName(OldHeap)))); + errmsg("repacking \"%s.%s\" in physical order", + nspname, + RelationGetRelationName(OldHeap))); /* * Hand off the actual copying to AM specific function, the generic code @@ -1458,8 +1472,8 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, int i; /* Report that we are now swapping relation files */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - 
PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES); + pgstat_progress_update_param(PROGRESS_REPACK_PHASE, + PROGRESS_REPACK_PHASE_SWAP_REL_FILES); /* Zero out possible results from swapped_relation_files */ memset(mapped_tables, 0, sizeof(mapped_tables)); @@ -1509,14 +1523,14 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT; /* Report that we are now reindexing relations */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_PHASE_REBUILD_INDEX); + pgstat_progress_update_param(PROGRESS_REPACK_PHASE, + PROGRESS_REPACK_PHASE_REBUILD_INDEX); reindex_relation(NULL, OIDOldHeap, reindex_flags, &reindex_params); /* Report that we are now doing clean up */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP); + pgstat_progress_update_param(PROGRESS_REPACK_PHASE, + PROGRESS_REPACK_PHASE_FINAL_CLEANUP); /* * If the relation being rebuilt is pg_class, swap_relation_files() @@ -1632,123 +1646,386 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, } } - /* - * Get a list of tables that the current user has privileges on and - * have indisclustered set. Return the list in a List * of RelToCluster - * (stored in the specified memory context), each one giving the tableOid - * and the indexOid on which the table is already clustered. + * Determine which relations to process, when REPACK/CLUSTER is called + * without specifying a table name. The exact process depends on whether + * USING INDEX was given or not, and in any case we only return tables and + * materialized views that the current user has privileges to repack/cluster. + * + * If USING INDEX was given, we scan pg_index to find those that have + * indisclustered set; if it was not given, scan pg_class and return all + * tables. + * + * Return it as a list of RelToCluster in the given memory context. 
*/ static List * -get_tables_to_cluster(MemoryContext cluster_context) +get_tables_to_repack(RepackCommand cmd, bool usingindex, MemoryContext permcxt) { - Relation indRelation; + Relation catalog; TableScanDesc scan; - ScanKeyData entry; - HeapTuple indexTuple; - Form_pg_index index; - MemoryContext old_context; + HeapTuple tuple; List *rtcs = NIL; - /* - * Get all indexes that have indisclustered set and that the current user - * has the appropriate privileges for. - */ - indRelation = table_open(IndexRelationId, AccessShareLock); - ScanKeyInit(&entry, - Anum_pg_index_indisclustered, - BTEqualStrategyNumber, F_BOOLEQ, - BoolGetDatum(true)); - scan = table_beginscan_catalog(indRelation, 1, &entry); - while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + if (usingindex) { - RelToCluster *rtc; + ScanKeyData entry; - index = (Form_pg_index) GETSTRUCT(indexTuple); + /* + * For USING INDEX, scan pg_index to find those with indisclustered. + */ + catalog = table_open(IndexRelationId, AccessShareLock); + ScanKeyInit(&entry, + Anum_pg_index_indisclustered, + BTEqualStrategyNumber, F_BOOLEQ, + BoolGetDatum(true)); + scan = table_beginscan_catalog(catalog, 1, &entry); + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + RelToCluster *rtc; + Form_pg_index index; + MemoryContext oldcxt; - if (!cluster_is_permitted_for_relation(index->indrelid, GetUserId())) - continue; + index = (Form_pg_index) GETSTRUCT(tuple); - /* Use a permanent memory context for the result list */ - old_context = MemoryContextSwitchTo(cluster_context); + /* + * Try to obtain a light lock on the index's table, to ensure it + * doesn't go away while we collect the list. If we cannot, just + * disregard it. Be sure to release this if we ultimately decide + * not to process the table! 
+ */ + if (!ConditionalLockRelationOid(index->indrelid, AccessShareLock)) + continue; - rtc = palloc_object(RelToCluster); - rtc->tableOid = index->indrelid; - rtc->indexOid = index->indexrelid; - rtcs = lappend(rtcs, rtc); + /* Verify that the table still exists; skip if not */ + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(index->indrelid))) + { + UnlockRelationOid(index->indrelid, AccessShareLock); + continue; + } - MemoryContextSwitchTo(old_context); + /* noisily skip rels which the user can't process */ + if (!repack_is_permitted_for_relation(cmd, index->indrelid, + GetUserId())) + { + UnlockRelationOid(index->indrelid, AccessShareLock); + continue; + } + + /* Use a permanent memory context for the result list */ + oldcxt = MemoryContextSwitchTo(permcxt); + rtc = palloc_object(RelToCluster); + rtc->tableOid = index->indrelid; + rtc->indexOid = index->indexrelid; + rtcs = lappend(rtcs, rtc); + MemoryContextSwitchTo(oldcxt); + } } - table_endscan(scan); + else + { + catalog = table_open(RelationRelationId, AccessShareLock); + scan = table_beginscan_catalog(catalog, 0, NULL); + + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + RelToCluster *rtc; + Form_pg_class class; + MemoryContext oldcxt; + + class = (Form_pg_class) GETSTRUCT(tuple); + + /* + * Try to obtain a light lock on the table, to ensure it doesn't + * go away while we collect the list. If we cannot, just + * disregard the table. Be sure to release this if we ultimately + * decide not to process the table! 
+ */ + if (!ConditionalLockRelationOid(class->oid, AccessShareLock)) + continue; + + /* Verify that the table still exists */ + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(class->oid))) + { + UnlockRelationOid(class->oid, AccessShareLock); + continue; + } + + /* Can only process plain tables and matviews */ + if (class->relkind != RELKIND_RELATION && + class->relkind != RELKIND_MATVIEW) + { + UnlockRelationOid(class->oid, AccessShareLock); + continue; + } - relation_close(indRelation, AccessShareLock); + /* noisily skip rels which the user can't process */ + if (!repack_is_permitted_for_relation(cmd, class->oid, + GetUserId())) + { + UnlockRelationOid(class->oid, AccessShareLock); + continue; + } + + /* Use a permanent memory context for the result list */ + oldcxt = MemoryContextSwitchTo(permcxt); + rtc = palloc_object(RelToCluster); + rtc->tableOid = class->oid; + rtc->indexOid = InvalidOid; + rtcs = lappend(rtcs, rtc); + MemoryContextSwitchTo(oldcxt); + } + } + + table_endscan(scan); + relation_close(catalog, AccessShareLock); return rtcs; } /* - * Given an index on a partitioned table, return a list of RelToCluster for - * all the children leaves tables/indexes. + * Given a partitioned table or its index, return a list of RelToCluster for + * all the leaf child tables/indexes. * - * Like expand_vacuum_rel, but here caller must hold AccessExclusiveLock - * on the table containing the index. + * 'rel_is_index' tells whether 'relid' is that of an index (true) or of the + * owning relation. 
*/ static List * -get_tables_to_cluster_partitioned(MemoryContext cluster_context, Oid indexOid) +get_tables_to_repack_partitioned(RepackCommand cmd, Oid relid, + bool rel_is_index, MemoryContext permcxt) { List *inhoids; - ListCell *lc; List *rtcs = NIL; - MemoryContext old_context; - - /* Do not lock the children until they're processed */ - inhoids = find_all_inheritors(indexOid, NoLock, NULL); - foreach(lc, inhoids) + /* + * Do not lock the children until they're processed. Note that we do hold + * a lock on the parent partitioned table. + */ + inhoids = find_all_inheritors(relid, NoLock, NULL); + foreach_oid(child_oid, inhoids) { - Oid indexrelid = lfirst_oid(lc); - Oid relid = IndexGetRelation(indexrelid, false); + Oid table_oid, + index_oid; RelToCluster *rtc; + MemoryContext oldcxt; - /* consider only leaf indexes */ - if (get_rel_relkind(indexrelid) != RELKIND_INDEX) - continue; + if (rel_is_index) + { + /* consider only leaf indexes */ + if (get_rel_relkind(child_oid) != RELKIND_INDEX) + continue; + + table_oid = IndexGetRelation(child_oid, false); + index_oid = child_oid; + } + else + { + /* consider only leaf relations */ + if (get_rel_relkind(child_oid) != RELKIND_RELATION) + continue; + + table_oid = child_oid; + index_oid = InvalidOid; + } /* * It's possible that the user does not have privileges to CLUSTER the - * leaf partition despite having such privileges on the partitioned - * table. We skip any partitions which the user is not permitted to - * CLUSTER. + * leaf partition despite having them on the partitioned table. Skip + * if so. 
 */ - if (!cluster_is_permitted_for_relation(relid, GetUserId())) + if (!repack_is_permitted_for_relation(cmd, table_oid, GetUserId())) continue; /* Use a permanent memory context for the result list */ - old_context = MemoryContextSwitchTo(cluster_context); - + oldcxt = MemoryContextSwitchTo(permcxt); rtc = palloc_object(RelToCluster); - rtc->tableOid = relid; - rtc->indexOid = indexrelid; + rtc->tableOid = table_oid; + rtc->indexOid = index_oid; rtcs = lappend(rtcs, rtc); - - MemoryContextSwitchTo(old_context); + MemoryContextSwitchTo(oldcxt); } return rtcs; } + /* - * Return whether userid has privileges to CLUSTER relid. If not, this + * Return whether userid has privileges to REPACK relid. If not, this * function emits a WARNING. */ static bool -cluster_is_permitted_for_relation(Oid relid, Oid userid) +repack_is_permitted_for_relation(RepackCommand cmd, Oid relid, Oid userid) { + Assert(cmd == REPACK_COMMAND_CLUSTER || cmd == REPACK_COMMAND_REPACK); + if (pg_class_aclcheck(relid, userid, ACL_MAINTAIN) == ACLCHECK_OK) return true; ereport(WARNING, - (errmsg("permission denied to cluster \"%s\", skipping it", - get_rel_name(relid)))); + errmsg("permission denied to execute %s on \"%s\", skipping it", + RepackCommandAsString(cmd), + get_rel_name(relid))); + return false; } + + +/* + * Given a RepackStmt with an indicated relation name, resolve the relation + * name, obtain lock on it, then determine what to do based on the relation + * type: if it's a table and not partitioned, repack it as indicated (using an + * existing clustered index, or following the given one), and return NULL. + * + * On the other hand, if the table is partitioned, do nothing further and + * instead return the opened and locked relcache entry, so that caller can + * process the partitions using the multiple-table handling code. In this + * case, if an index name is given, it's up to the caller to resolve it. 
+ */ +static Relation +process_single_relation(RepackStmt *stmt, ClusterParams *params) +{ + Relation rel; + Oid tableOid; + + Assert(stmt->relation != NULL); + Assert(stmt->command == REPACK_COMMAND_CLUSTER || + stmt->command == REPACK_COMMAND_REPACK); + + /* + * Make sure ANALYZE is specified if a column list is present. + */ + if ((params->options & CLUOPT_ANALYZE) == 0 && stmt->relation->va_cols != NIL) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ANALYZE option must be specified when a column list is provided")); + + /* + * Find, lock, and check permissions on the table. We obtain + * AccessExclusiveLock right away to avoid lock-upgrade hazard in the + * single-transaction case. + */ + tableOid = RangeVarGetRelidExtended(stmt->relation->relation, + AccessExclusiveLock, + 0, + RangeVarCallbackMaintainsTable, + NULL); + rel = table_open(tableOid, NoLock); + + /* + * Reject clustering a remote temp table ... their local buffer manager is + * not going to cope. + */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /*- translator: first %s is name of a SQL command, eg. REPACK */ + errmsg("cannot execute %s on temporary tables of other sessions", + RepackCommandAsString(stmt->command))); + + /* + * For partitioned tables, let caller handle this. Otherwise, process it + * here and we're done. + */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + return rel; + else + { + Oid indexOid; + + indexOid = determine_clustered_index(rel, stmt->usingindex, + stmt->indexname); + if (OidIsValid(indexOid)) + check_index_is_clusterable(rel, indexOid, AccessExclusiveLock); + cluster_rel(stmt->command, rel, indexOid, params); + + /* + * Do an analyze, if requested. We close the transaction and start a + * new one, so that we don't hold the stronger lock for longer than + * needed. 
+ */ + if (params->options & CLUOPT_ANALYZE) + { + VacuumParams vac_params = {0}; + + PopActiveSnapshot(); + CommitTransactionCommand(); + + StartTransactionCommand(); + PushActiveSnapshot(GetTransactionSnapshot()); + + vac_params.options |= VACOPT_ANALYZE; + if (params->options & CLUOPT_VERBOSE) + vac_params.options |= VACOPT_VERBOSE; + analyze_rel(tableOid, NULL, vac_params, + stmt->relation->va_cols, true, NULL); + PopActiveSnapshot(); + CommandCounterIncrement(); + } + + return NULL; + } +} + +/* + * Given a relation and the usingindex/indexname options in a + * REPACK USING INDEX or CLUSTER command, return the OID of the + * index to use for clustering the table. + * + * Caller must hold lock on the relation so that the set of indexes + * doesn't change, and must call check_index_is_clusterable. + */ +static Oid +determine_clustered_index(Relation rel, bool usingindex, const char *indexname) +{ + Oid indexOid; + + if (indexname == NULL && usingindex) + { + /* + * If USING INDEX with no name is given, find a clustered index, or + * error out if none. + */ + indexOid = InvalidOid; + foreach_oid(idxoid, RelationGetIndexList(rel)) + { + if (get_index_isclustered(idxoid)) + { + indexOid = idxoid; + break; + } + } + + if (!OidIsValid(indexOid)) + ereport(ERROR, + errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("there is no previously clustered index for table \"%s\"", + RelationGetRelationName(rel))); + } + else if (indexname != NULL) + { + /* An index was specified; obtain its OID. 
*/ + indexOid = get_relname_relid(indexname, rel->rd_rel->relnamespace); + if (!OidIsValid(indexOid)) + ereport(ERROR, + errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("index \"%s\" for table \"%s\" does not exist", + indexname, RelationGetRelationName(rel))); + } + else + indexOid = InvalidOid; + + return indexOid; +} + +static const char * +RepackCommandAsString(RepackCommand cmd) +{ + switch (cmd) + { + case REPACK_COMMAND_REPACK: + return "REPACK"; + case REPACK_COMMAND_VACUUMFULL: + return "VACUUM"; + case REPACK_COMMAND_CLUSTER: + return "CLUSTER"; + } + return "???"; /* keep compiler quiet */ +} diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 62c1ebdfd9b20..bce3a2daa245d 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -352,7 +352,6 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) } } - /* * Sanity check DISABLE_PAGE_SKIPPING option. */ @@ -2294,8 +2293,9 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams params, if ((params.options & VACOPT_VERBOSE) != 0) cluster_params.options |= CLUOPT_VERBOSE; - /* VACUUM FULL is now a variant of CLUSTER; see cluster.c */ - cluster_rel(rel, InvalidOid, &cluster_params); + /* VACUUM FULL is a variant of REPACK; see cluster.c */ + cluster_rel(REPACK_COMMAND_VACUUMFULL, rel, InvalidOid, + &cluster_params); /* cluster_rel closes the relation, but keeps lock */ rel = NULL; diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 19d8a29a35ecd..f01f5734fe938 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -288,7 +288,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); AlterCompositeTypeStmt AlterUserMappingStmt AlterRoleStmt AlterRoleSetStmt AlterPolicyStmt AlterStatsStmt AlterDefaultPrivilegesStmt DefACLAction - AnalyzeStmt CallStmt ClosePortalStmt ClusterStmt CommentStmt + AnalyzeStmt CallStmt ClosePortalStmt CommentStmt ConstraintsSetStmt CopyStmt 
CreateAsStmt CreateCastStmt CreateDomainStmt CreateExtensionStmt CreateGroupStmt CreateOpClassStmt CreateOpFamilyStmt AlterOpFamilyStmt CreatePLangStmt @@ -305,7 +305,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); GrantStmt GrantRoleStmt ImportForeignSchemaStmt IndexStmt InsertStmt ListenStmt LoadStmt LockStmt MergeStmt NotifyStmt ExplainableStmt PreparableStmt CreateFunctionStmt AlterFunctionStmt ReindexStmt RemoveAggrStmt - RemoveFuncStmt RemoveOperStmt RenameStmt ReturnStmt RevokeStmt RevokeRoleStmt + RemoveFuncStmt RemoveOperStmt RenameStmt RepackStmt ReturnStmt RevokeStmt RevokeRoleStmt RuleActionStmt RuleActionStmtOrEmpty RuleStmt SecLabelStmt SelectStmt TransactionStmt TransactionStmtLegacy TruncateStmt UnlistenStmt UpdateStmt VacuumStmt @@ -324,7 +324,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type opt_single_name %type opt_qualified_name -%type opt_concurrently +%type opt_concurrently opt_usingindex %type opt_drop_behavior %type opt_utility_option_list %type opt_wait_with_clause @@ -776,7 +776,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); QUOTE QUOTES RANGE READ REAL REASSIGN RECURSIVE REF_P REFERENCES REFERENCING - REFRESH REINDEX RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA + REFRESH REINDEX RELATIVE_P RELEASE RENAME REPACK REPEATABLE REPLACE REPLICA RESET RESPECT_P RESTART RESTRICT RETURN RETURNING RETURNS REVOKE RIGHT ROLE ROLLBACK ROLLUP ROUTINE ROUTINES ROW ROWS RULE @@ -1038,7 +1038,6 @@ stmt: | CallStmt | CheckPointStmt | ClosePortalStmt - | ClusterStmt | CommentStmt | ConstraintsSetStmt | CopyStmt @@ -1112,6 +1111,7 @@ stmt: | RemoveFuncStmt | RemoveOperStmt | RenameStmt + | RepackStmt | RevokeStmt | RevokeRoleStmt | RuleStmt @@ -1149,6 +1149,11 @@ opt_concurrently: | /*EMPTY*/ { $$ = false; } ; +opt_usingindex: + USING INDEX { $$ = true; } + | /* EMPTY */ { $$ = false; } + ; + opt_drop_behavior: CASCADE { $$ = 
DROP_CASCADE; } | RESTRICT { $$ = DROP_RESTRICT; } @@ -12085,38 +12090,82 @@ CreateConversionStmt: /***************************************************************************** * * QUERY: + * REPACK [ (options) ] [ [ ] [ USING INDEX ] ] + * + * obsolete variants: * CLUSTER (options) [ [ USING ] ] * CLUSTER [VERBOSE] [ [ USING ] ] * CLUSTER [VERBOSE] ON (for pre-8.3) * *****************************************************************************/ -ClusterStmt: - CLUSTER '(' utility_option_list ')' qualified_name cluster_index_specification +RepackStmt: + REPACK opt_utility_option_list vacuum_relation USING INDEX name { - ClusterStmt *n = makeNode(ClusterStmt); + RepackStmt *n = makeNode(RepackStmt); - n->relation = $5; + n->command = REPACK_COMMAND_REPACK; + n->relation = (VacuumRelation *) $3; n->indexname = $6; + n->usingindex = true; + n->params = $2; + $$ = (Node *) n; + } + | REPACK opt_utility_option_list vacuum_relation opt_usingindex + { + RepackStmt *n = makeNode(RepackStmt); + + n->command = REPACK_COMMAND_REPACK; + n->relation = (VacuumRelation *) $3; + n->indexname = NULL; + n->usingindex = $4; + n->params = $2; + $$ = (Node *) n; + } + | REPACK opt_utility_option_list opt_usingindex + { + RepackStmt *n = makeNode(RepackStmt); + + n->command = REPACK_COMMAND_REPACK; + n->relation = NULL; + n->indexname = NULL; + n->usingindex = $3; + n->params = $2; + $$ = (Node *) n; + } + | CLUSTER '(' utility_option_list ')' qualified_name cluster_index_specification + { + RepackStmt *n = makeNode(RepackStmt); + + n->command = REPACK_COMMAND_CLUSTER; + n->relation = makeNode(VacuumRelation); + n->relation->relation = $5; + n->indexname = $6; + n->usingindex = true; n->params = $3; $$ = (Node *) n; } | CLUSTER opt_utility_option_list { - ClusterStmt *n = makeNode(ClusterStmt); + RepackStmt *n = makeNode(RepackStmt); + n->command = REPACK_COMMAND_CLUSTER; n->relation = NULL; n->indexname = NULL; + n->usingindex = true; n->params = $2; $$ = (Node *) n; } /* 
unparenthesized VERBOSE kept for pre-14 compatibility */ | CLUSTER opt_verbose qualified_name cluster_index_specification { - ClusterStmt *n = makeNode(ClusterStmt); + RepackStmt *n = makeNode(RepackStmt); - n->relation = $3; + n->command = REPACK_COMMAND_CLUSTER; + n->relation = makeNode(VacuumRelation); + n->relation->relation = $3; n->indexname = $4; + n->usingindex = true; if ($2) n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; @@ -12124,20 +12173,25 @@ ClusterStmt: /* unparenthesized VERBOSE kept for pre-17 compatibility */ | CLUSTER VERBOSE { - ClusterStmt *n = makeNode(ClusterStmt); + RepackStmt *n = makeNode(RepackStmt); + n->command = REPACK_COMMAND_CLUSTER; n->relation = NULL; n->indexname = NULL; + n->usingindex = true; n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; } /* kept for pre-8.3 compatibility */ | CLUSTER opt_verbose name ON qualified_name { - ClusterStmt *n = makeNode(ClusterStmt); + RepackStmt *n = makeNode(RepackStmt); - n->relation = $5; + n->command = REPACK_COMMAND_CLUSTER; + n->relation = makeNode(VacuumRelation); + n->relation->relation = $5; n->indexname = $3; + n->usingindex = true; if ($2) n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; @@ -18194,6 +18248,7 @@ unreserved_keyword: | RELATIVE_P | RELEASE | RENAME + | REPACK | REPEATABLE | REPLACE | REPLICA @@ -18831,6 +18886,7 @@ bare_label_keyword: | RELATIVE_P | RELEASE | RENAME + | REPACK | REPEATABLE | REPLACE | REPLICA diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index bf707f2d57ffb..b4651a641318c 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -279,9 +279,9 @@ ClassifyUtilityCommandAsReadOnly(Node *parsetree) return COMMAND_OK_IN_RECOVERY | COMMAND_OK_IN_READ_ONLY_TXN; } - case T_ClusterStmt: case T_ReindexStmt: case T_VacuumStmt: + case T_RepackStmt: { /* * These commands write WAL, so they're not strictly @@ -290,9 +290,9 @@ 
ClassifyUtilityCommandAsReadOnly(Node *parsetree) * * However, they don't change the database state in a way that * would affect pg_dump output, so it's fine to run them in a - * read-only transaction. (CLUSTER might change the order of - * rows on disk, which could affect the ordering of pg_dump - * output, but that's not semantically significant.) + * read-only transaction. (REPACK/CLUSTER might change the + * order of rows on disk, which could affect the ordering of + * pg_dump output, but that's not semantically significant.) */ return COMMAND_OK_IN_READ_ONLY_TXN; } @@ -856,14 +856,14 @@ standard_ProcessUtility(PlannedStmt *pstmt, ExecuteCallStmt(castNode(CallStmt, parsetree), params, isAtomicContext, dest); break; - case T_ClusterStmt: - cluster(pstate, (ClusterStmt *) parsetree, isTopLevel); - break; - case T_VacuumStmt: ExecVacuum(pstate, (VacuumStmt *) parsetree, isTopLevel); break; + case T_RepackStmt: + ExecRepack(pstate, (RepackStmt *) parsetree, isTopLevel); + break; + case T_ExplainStmt: ExplainQuery(pstate, (ExplainStmt *) parsetree, params, dest); break; @@ -2865,10 +2865,6 @@ CreateCommandTag(Node *parsetree) tag = CMDTAG_CALL; break; - case T_ClusterStmt: - tag = CMDTAG_CLUSTER; - break; - case T_VacuumStmt: if (((VacuumStmt *) parsetree)->is_vacuumcmd) tag = CMDTAG_VACUUM; @@ -2876,6 +2872,13 @@ CreateCommandTag(Node *parsetree) tag = CMDTAG_ANALYZE; break; + case T_RepackStmt: + if (((RepackStmt *) parsetree)->command == REPACK_COMMAND_CLUSTER) + tag = CMDTAG_CLUSTER; + else + tag = CMDTAG_REPACK; + break; + case T_ExplainStmt: tag = CMDTAG_EXPLAIN; break; @@ -3517,7 +3520,7 @@ GetCommandLogLevel(Node *parsetree) lev = LOGSTMT_ALL; break; - case T_ClusterStmt: + case T_RepackStmt: lev = LOGSTMT_DDL; break; diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 50ea9e8fb83a6..5ac022274a738 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -288,8 +288,8 @@ 
pg_stat_get_progress_info(PG_FUNCTION_ARGS) cmdtype = PROGRESS_COMMAND_VACUUM; else if (pg_strcasecmp(cmd, "ANALYZE") == 0) cmdtype = PROGRESS_COMMAND_ANALYZE; - else if (pg_strcasecmp(cmd, "CLUSTER") == 0) - cmdtype = PROGRESS_COMMAND_CLUSTER; + else if (pg_strcasecmp(cmd, "REPACK") == 0) + cmdtype = PROGRESS_COMMAND_REPACK; else if (pg_strcasecmp(cmd, "CREATE INDEX") == 0) cmdtype = PROGRESS_COMMAND_CREATE_INDEX; else if (pg_strcasecmp(cmd, "BASEBACKUP") == 0) diff --git a/src/bin/psql/tab-complete.in.c b/src/bin/psql/tab-complete.in.c index 6484c6a3dd4e2..199fc64ddf5f3 100644 --- a/src/bin/psql/tab-complete.in.c +++ b/src/bin/psql/tab-complete.in.c @@ -1267,7 +1267,7 @@ static const char *const sql_commands[] = { "DELETE FROM", "DISCARD", "DO", "DROP", "END", "EXECUTE", "EXPLAIN", "FETCH", "GRANT", "IMPORT FOREIGN SCHEMA", "INSERT INTO", "LISTEN", "LOAD", "LOCK", "MERGE INTO", "MOVE", "NOTIFY", "PREPARE", - "REASSIGN", "REFRESH MATERIALIZED VIEW", "REINDEX", "RELEASE", + "REASSIGN", "REFRESH MATERIALIZED VIEW", "REINDEX", "RELEASE", "REPACK", "RESET", "REVOKE", "ROLLBACK", "SAVEPOINT", "SECURITY LABEL", "SELECT", "SET", "SHOW", "START", "TABLE", "TRUNCATE", "UNLISTEN", "UPDATE", "VACUUM", "VALUES", @@ -5117,6 +5117,47 @@ match_previous_words(int pattern_id, COMPLETE_WITH_QUERY(Query_for_list_of_tablespaces); } +/* REPACK */ + else if (Matches("REPACK")) + COMPLETE_WITH_SCHEMA_QUERY_PLUS(Query_for_list_of_clusterables, + "(", "USING INDEX"); + else if (Matches("REPACK", "(*)")) + COMPLETE_WITH_SCHEMA_QUERY_PLUS(Query_for_list_of_clusterables, + "USING INDEX"); + else if (Matches("REPACK", MatchAnyExcept("("))) + COMPLETE_WITH("USING INDEX"); + else if (Matches("REPACK", "(*)", MatchAnyExcept("("))) + COMPLETE_WITH("USING INDEX"); + else if (Matches("REPACK", MatchAny, "USING", "INDEX") || + Matches("REPACK", "(*)", MatchAny, "USING", "INDEX")) + { + set_completion_reference(prev3_wd); + COMPLETE_WITH_SCHEMA_QUERY(Query_for_index_of_table); + } + + /* + * Complete 
... [ (*) ] USING INDEX, with a list of indexes for + * . + */ + else if (TailMatches(MatchAny, "USING", "INDEX")) + { + set_completion_reference(prev3_wd); + COMPLETE_WITH_SCHEMA_QUERY(Query_for_index_of_table); + } + else if (HeadMatches("REPACK", "(*") && + !HeadMatches("REPACK", "(*)")) + { + /* + * This fires if we're in an unfinished parenthesized option list. + * get_previous_words treats a completed parenthesized option list as + * one word, so the above test is correct. + */ + if (ends_with(prev_wd, '(') || ends_with(prev_wd, ',')) + COMPLETE_WITH("ANALYZE", "VERBOSE"); + else if (TailMatches("ANALYZE", "VERBOSE")) + COMPLETE_WITH("ON", "OFF"); + } + /* SECURITY LABEL */ else if (Matches("SECURITY")) COMPLETE_WITH("LABEL"); diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index b6508b60a843d..90f46b0350237 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202603062 +#define CATALOG_VERSION_NO 202603101 #endif diff --git a/src/include/commands/cluster.h b/src/include/commands/cluster.h index 8ea81622f9db1..28741988478a9 100644 --- a/src/include/commands/cluster.h +++ b/src/include/commands/cluster.h @@ -24,6 +24,7 @@ #define CLUOPT_RECHECK 0x02 /* recheck relation state */ #define CLUOPT_RECHECK_ISCLUSTERED 0x04 /* recheck relation state for * indisclustered */ +#define CLUOPT_ANALYZE 0x08 /* do an ANALYZE */ /* options for CLUSTER */ typedef struct ClusterParams @@ -31,8 +32,11 @@ typedef struct ClusterParams bits32 options; /* bitmask of CLUOPT_* */ } ClusterParams; -extern void cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel); -extern void cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params); + +extern void ExecRepack(ParseState *pstate, RepackStmt *stmt, bool isTopLevel); + +extern void cluster_rel(RepackCommand command, Relation OldHeap, Oid indexOid, + ClusterParams *params); extern void 
check_index_is_clusterable(Relation OldHeap, Oid indexOid, LOCKMODE lockmode); extern void mark_index_clustered(Relation rel, Oid indexOid, bool is_internal); diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h index 359221dc29664..9c40772706c3c 100644 --- a/src/include/commands/progress.h +++ b/src/include/commands/progress.h @@ -73,28 +73,34 @@ #define PROGRESS_ANALYZE_STARTED_BY_MANUAL 1 #define PROGRESS_ANALYZE_STARTED_BY_AUTOVACUUM 2 -/* Progress parameters for cluster */ -#define PROGRESS_CLUSTER_COMMAND 0 -#define PROGRESS_CLUSTER_PHASE 1 -#define PROGRESS_CLUSTER_INDEX_RELID 2 -#define PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED 3 -#define PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN 4 -#define PROGRESS_CLUSTER_TOTAL_HEAP_BLKS 5 -#define PROGRESS_CLUSTER_HEAP_BLKS_SCANNED 6 -#define PROGRESS_CLUSTER_INDEX_REBUILD_COUNT 7 - -/* Phases of cluster (as advertised via PROGRESS_CLUSTER_PHASE) */ -#define PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP 1 -#define PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP 2 -#define PROGRESS_CLUSTER_PHASE_SORT_TUPLES 3 -#define PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP 4 -#define PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES 5 -#define PROGRESS_CLUSTER_PHASE_REBUILD_INDEX 6 -#define PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP 7 - -/* Commands of PROGRESS_CLUSTER */ -#define PROGRESS_CLUSTER_COMMAND_CLUSTER 1 -#define PROGRESS_CLUSTER_COMMAND_VACUUM_FULL 2 +/* + * Progress parameters for REPACK. + * + * Values for PROGRESS_REPACK_COMMAND are as in RepackCommand. + * + * Note: Since REPACK shares code with CLUSTER, these values are also + * used by CLUSTER. (CLUSTER being now deprecated, it makes little sense to + * maintain a separate set of constants.) 
+ */ +#define PROGRESS_REPACK_COMMAND 0 +#define PROGRESS_REPACK_PHASE 1 +#define PROGRESS_REPACK_INDEX_RELID 2 +#define PROGRESS_REPACK_HEAP_TUPLES_SCANNED 3 +#define PROGRESS_REPACK_HEAP_TUPLES_WRITTEN 4 +#define PROGRESS_REPACK_TOTAL_HEAP_BLKS 5 +#define PROGRESS_REPACK_HEAP_BLKS_SCANNED 6 +#define PROGRESS_REPACK_INDEX_REBUILD_COUNT 7 + +/* + * Phases of repack (as advertised via PROGRESS_REPACK_PHASE). + */ +#define PROGRESS_REPACK_PHASE_SEQ_SCAN_HEAP 1 +#define PROGRESS_REPACK_PHASE_INDEX_SCAN_HEAP 2 +#define PROGRESS_REPACK_PHASE_SORT_TUPLES 3 +#define PROGRESS_REPACK_PHASE_WRITE_NEW_HEAP 4 +#define PROGRESS_REPACK_PHASE_SWAP_REL_FILES 5 +#define PROGRESS_REPACK_PHASE_REBUILD_INDEX 6 +#define PROGRESS_REPACK_PHASE_FINAL_CLEANUP 7 /* Progress parameters for CREATE INDEX */ /* 3, 4 and 5 reserved for "waitfor" metrics */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 4ee092206b0dc..f3d32ef0188df 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -3982,18 +3982,6 @@ typedef struct AlterSystemStmt VariableSetStmt *setstmt; /* SET subcommand */ } AlterSystemStmt; -/* ---------------------- - * Cluster Statement (support pbrown's cluster index implementation) - * ---------------------- - */ -typedef struct ClusterStmt -{ - NodeTag type; - RangeVar *relation; /* relation being indexed, or NULL if all */ - char *indexname; /* original index defined */ - List *params; /* list of DefElem nodes */ -} ClusterStmt; - /* ---------------------- * Vacuum and Analyze Statements * @@ -4006,7 +3994,7 @@ typedef struct VacuumStmt NodeTag type; List *options; /* list of DefElem nodes */ List *rels; /* list of VacuumRelation, or NIL for all */ - bool is_vacuumcmd; /* true for VACUUM, false for ANALYZE */ + bool is_vacuumcmd; /* true for VACUUM, false otherwise */ } VacuumStmt; /* @@ -4024,6 +4012,27 @@ typedef struct VacuumRelation List *va_cols; /* list of column names, or NIL for all */ } VacuumRelation; +/* 
---------------------- + * Repack Statement + * ---------------------- + */ +typedef enum RepackCommand +{ + REPACK_COMMAND_CLUSTER = 1, + REPACK_COMMAND_REPACK, + REPACK_COMMAND_VACUUMFULL, +} RepackCommand; + +typedef struct RepackStmt +{ + NodeTag type; + RepackCommand command; /* type of command being run */ + VacuumRelation *relation; /* relation being repacked */ + char *indexname; /* order tuples by this index */ + bool usingindex; /* whether USING INDEX is specified */ + List *params; /* list of DefElem nodes */ +} RepackStmt; + /* ---------------------- * Explain Statement * diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index f7753c5c8a87d..6f74a8c05c731 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -377,6 +377,7 @@ PG_KEYWORD("reindex", REINDEX, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("relative", RELATIVE_P, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("release", RELEASE, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("rename", RENAME, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("repack", REPACK, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("repeatable", REPEATABLE, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("replace", REPLACE, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("replica", REPLICA, UNRESERVED_KEYWORD, BARE_LABEL) diff --git a/src/include/tcop/cmdtaglist.h b/src/include/tcop/cmdtaglist.h index 1290c9bab6842..652dc61b834c8 100644 --- a/src/include/tcop/cmdtaglist.h +++ b/src/include/tcop/cmdtaglist.h @@ -196,6 +196,7 @@ PG_CMDTAG(CMDTAG_REASSIGN_OWNED, "REASSIGN OWNED", false, false, false) PG_CMDTAG(CMDTAG_REFRESH_MATERIALIZED_VIEW, "REFRESH MATERIALIZED VIEW", true, false, false) PG_CMDTAG(CMDTAG_REINDEX, "REINDEX", true, false, false) PG_CMDTAG(CMDTAG_RELEASE, "RELEASE", false, false, false) +PG_CMDTAG(CMDTAG_REPACK, "REPACK", false, false, false) PG_CMDTAG(CMDTAG_RESET, "RESET", false, false, false) PG_CMDTAG(CMDTAG_REVOKE, "REVOKE", true, false, false) PG_CMDTAG(CMDTAG_REVOKE_ROLE, "REVOKE 
ROLE", false, false, false) diff --git a/src/include/utils/backend_progress.h b/src/include/utils/backend_progress.h index 19f63b414310f..6300dbd15d5be 100644 --- a/src/include/utils/backend_progress.h +++ b/src/include/utils/backend_progress.h @@ -24,10 +24,10 @@ typedef enum ProgressCommandType PROGRESS_COMMAND_INVALID, PROGRESS_COMMAND_VACUUM, PROGRESS_COMMAND_ANALYZE, - PROGRESS_COMMAND_CLUSTER, PROGRESS_COMMAND_CREATE_INDEX, PROGRESS_COMMAND_BASEBACKUP, PROGRESS_COMMAND_COPY, + PROGRESS_COMMAND_REPACK, } ProgressCommandType; #define PGSTAT_NUM_PROGRESS_PARAM 20 diff --git a/src/test/regress/expected/cluster.out b/src/test/regress/expected/cluster.out index 4d40a6809ab46..24b0b1a8fce86 100644 --- a/src/test/regress/expected/cluster.out +++ b/src/test/regress/expected/cluster.out @@ -495,6 +495,46 @@ ALTER TABLE clstrpart SET WITHOUT CLUSTER; ERROR: cannot mark index clustered in partitioned table ALTER TABLE clstrpart CLUSTER ON clstrpart_idx; ERROR: cannot mark index clustered in partitioned table +-- and they cannot get an index-ordered REPACK without an explicit index name +REPACK clstrpart USING INDEX; +ERROR: cannot execute REPACK on partitioned table "clstrpart" USING INDEX with no index name +-- Check that REPACK sets new relfilenodes: it should process exactly the same +-- tables as CLUSTER did. 
+DROP TABLE old_cluster_info; +DROP TABLE new_cluster_info; +CREATE TEMP TABLE old_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +REPACK clstrpart USING INDEX clstrpart_idx; +CREATE TEMP TABLE new_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM old_cluster_info AS old JOIN new_cluster_info AS new USING (relname) ORDER BY relname COLLATE "C"; + relname | level | relkind | ?column? +-------------+-------+---------+---------- + clstrpart | 0 | p | t + clstrpart1 | 1 | p | t + clstrpart11 | 2 | r | f + clstrpart12 | 2 | p | t + clstrpart2 | 1 | r | f + clstrpart3 | 1 | p | t + clstrpart33 | 2 | r | f +(7 rows) + +-- And finally the same for REPACK w/o index. +DROP TABLE old_cluster_info; +DROP TABLE new_cluster_info; +CREATE TEMP TABLE old_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +REPACK clstrpart; +CREATE TEMP TABLE new_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM old_cluster_info AS old JOIN new_cluster_info AS new USING (relname) ORDER BY relname COLLATE "C"; + relname | level | relkind | ?column? 
+-------------+-------+---------+---------- + clstrpart | 0 | p | t + clstrpart1 | 1 | p | t + clstrpart11 | 2 | r | f + clstrpart12 | 2 | p | t + clstrpart2 | 1 | r | f + clstrpart3 | 1 | p | t + clstrpart33 | 2 | r | f +(7 rows) + DROP TABLE clstrpart; -- Ownership of partitions is checked CREATE TABLE ptnowner(i int unique) PARTITION BY LIST (i); @@ -513,7 +553,7 @@ CREATE TEMP TABLE ptnowner_oldnodes AS JOIN pg_class AS c ON c.oid=tree.relid; SET SESSION AUTHORIZATION regress_ptnowner; CLUSTER ptnowner USING ptnowner_i_idx; -WARNING: permission denied to cluster "ptnowner2", skipping it +WARNING: permission denied to execute CLUSTER on "ptnowner2", skipping it RESET SESSION AUTHORIZATION; SELECT a.relname, a.relfilenode=b.relfilenode FROM pg_class a JOIN ptnowner_oldnodes b USING (oid) ORDER BY a.relname COLLATE "C"; @@ -665,6 +705,101 @@ SELECT * FROM clstr_expression WHERE -a = -3 ORDER BY -a, b; (4 rows) COMMIT; +---------------------------------------------------------------------- +-- +-- REPACK +-- +---------------------------------------------------------------------- +-- REPACK handles individual tables identically to CLUSTER, but it's worth +-- checking if it handles table hierarchies identically as well. 
+REPACK clstr_tst USING INDEX clstr_tst_c; +-- Verify that inheritance link still works +INSERT INTO clstr_tst_inh VALUES (0, 100, 'in child table 2'); +SELECT a,b,c,substring(d for 30), length(d) from clstr_tst; + a | b | c | substring | length +----+-----+------------------+--------------------------------+-------- + 10 | 14 | catorce | | + 18 | 5 | cinco | | + 9 | 4 | cuatro | | + 26 | 19 | diecinueve | | + 12 | 18 | dieciocho | | + 30 | 16 | dieciseis | | + 24 | 17 | diecisiete | | + 2 | 10 | diez | | + 23 | 12 | doce | | + 11 | 2 | dos | | + 25 | 9 | nueve | | + 31 | 8 | ocho | | + 1 | 11 | once | | + 28 | 15 | quince | | + 32 | 6 | seis | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000 + 29 | 7 | siete | | + 15 | 13 | trece | | + 22 | 30 | treinta | | + 17 | 32 | treinta y dos | | + 3 | 31 | treinta y uno | | + 5 | 3 | tres | | + 20 | 1 | uno | | + 6 | 20 | veinte | | + 14 | 25 | veinticinco | | + 21 | 24 | veinticuatro | | + 4 | 22 | veintidos | | + 19 | 29 | veintinueve | | + 16 | 28 | veintiocho | | + 27 | 26 | veintiseis | | + 13 | 27 | veintisiete | | + 7 | 23 | veintitres | | + 8 | 21 | veintiuno | | + 0 | 100 | in child table | | + 0 | 100 | in child table 2 | | +(34 rows) + +-- Verify that foreign key link still works +INSERT INTO clstr_tst (b, c) VALUES (1111, 'this should fail'); +ERROR: insert or update on table "clstr_tst" violates foreign key constraint "clstr_tst_con" +DETAIL: Key (b)=(1111) is not present in table "clstr_tst_s". +SELECT conname FROM pg_constraint WHERE conrelid = 'clstr_tst'::regclass +ORDER BY 1; + conname +---------------------- + clstr_tst_a_not_null + clstr_tst_con + clstr_tst_pkey +(3 rows) + +-- Verify partial analyze works +REPACK (ANALYZE) clstr_tst (a); +REPACK (ANALYZE) clstr_tst; +REPACK (VERBOSE) clstr_tst (a); +ERROR: ANALYZE option must be specified when a column list is provided +-- REPACK w/o argument performs no ordering, so we can only check which tables +-- have the relfilenode changed. 
+RESET SESSION AUTHORIZATION; +CREATE TEMP TABLE relnodes_old AS +(SELECT relname, relfilenode +FROM pg_class +WHERE relname IN ('clstr_1', 'clstr_2', 'clstr_3')); +SET SESSION AUTHORIZATION regress_clstr_user; +SET client_min_messages = ERROR; -- order of "skipping" warnings may vary +REPACK; +RESET client_min_messages; +RESET SESSION AUTHORIZATION; +CREATE TEMP TABLE relnodes_new AS +(SELECT relname, relfilenode +FROM pg_class +WHERE relname IN ('clstr_1', 'clstr_2', 'clstr_3')); +-- Do the actual comparison. Unlike CLUSTER, clstr_3 should have been +-- processed because there is nothing like clustering index here. +SELECT o.relname FROM relnodes_old o +JOIN relnodes_new n ON o.relname = n.relname +WHERE o.relfilenode <> n.relfilenode +ORDER BY o.relname; + relname +--------- + clstr_1 + clstr_3 +(2 rows) + -- clean up DROP TABLE clustertest; DROP TABLE clstr_1; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index deb6e2ad6a94b..f373ad704b690 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2002,34 +2002,23 @@ pg_stat_progress_basebackup| SELECT pid, ELSE NULL::text END AS backup_type FROM pg_stat_get_progress_info('BASEBACKUP'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20); -pg_stat_progress_cluster| SELECT s.pid, - s.datid, - d.datname, - s.relid, - CASE s.param1 - WHEN 1 THEN 'CLUSTER'::text - WHEN 2 THEN 'VACUUM FULL'::text - ELSE NULL::text +pg_stat_progress_cluster| SELECT pid, + datid, + datname, + relid, + CASE + WHEN (command = ANY (ARRAY['CLUSTER'::text, 'VACUUM FULL'::text])) THEN command + WHEN (repack_index_relid = (0)::oid) THEN 'VACUUM FULL'::text + ELSE 'CLUSTER'::text END AS command, - CASE s.param2 - WHEN 0 THEN 'initializing'::text - WHEN 1 THEN 'seq scanning heap'::text - WHEN 2 THEN 'index scanning 
heap'::text - WHEN 3 THEN 'sorting tuples'::text - WHEN 4 THEN 'writing new heap'::text - WHEN 5 THEN 'swapping relation files'::text - WHEN 6 THEN 'rebuilding index'::text - WHEN 7 THEN 'performing final cleanup'::text - ELSE NULL::text - END AS phase, - (s.param3)::oid AS cluster_index_relid, - s.param4 AS heap_tuples_scanned, - s.param5 AS heap_tuples_written, - s.param6 AS heap_blks_total, - s.param7 AS heap_blks_scanned, - s.param8 AS index_rebuild_count - FROM (pg_stat_get_progress_info('CLUSTER'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) - LEFT JOIN pg_database d ON ((s.datid = d.oid))); + phase, + repack_index_relid AS cluster_index_relid, + heap_tuples_scanned, + heap_tuples_written, + heap_blks_total, + heap_blks_scanned, + index_rebuild_count + FROM pg_stat_progress_repack; pg_stat_progress_copy| SELECT s.pid, s.datid, d.datname, @@ -2089,6 +2078,35 @@ pg_stat_progress_create_index| SELECT s.pid, s.param15 AS partitions_done FROM (pg_stat_get_progress_info('CREATE INDEX'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) LEFT JOIN pg_database d ON ((s.datid = d.oid))); +pg_stat_progress_repack| SELECT s.pid, + s.datid, + d.datname, + s.relid, + CASE s.param1 + WHEN 1 THEN 'CLUSTER'::text + WHEN 2 THEN 'REPACK'::text + WHEN 3 THEN 'VACUUM FULL'::text + ELSE NULL::text + END AS command, + CASE s.param2 + WHEN 0 THEN 'initializing'::text + WHEN 1 THEN 'seq scanning heap'::text + WHEN 2 THEN 'index scanning heap'::text + WHEN 3 THEN 'sorting tuples'::text + WHEN 4 THEN 'writing new heap'::text + WHEN 5 THEN 'swapping relation files'::text + WHEN 6 THEN 'rebuilding index'::text + WHEN 7 THEN 'performing final cleanup'::text + ELSE NULL::text + END AS phase, + 
(s.param3)::oid AS repack_index_relid, + s.param4 AS heap_tuples_scanned, + s.param5 AS heap_tuples_written, + s.param6 AS heap_blks_total, + s.param7 AS heap_blks_scanned, + s.param8 AS index_rebuild_count + FROM (pg_stat_get_progress_info('REPACK'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) + LEFT JOIN pg_database d ON ((s.datid = d.oid))); pg_stat_progress_vacuum| SELECT s.pid, s.datid, d.datname, diff --git a/src/test/regress/sql/cluster.sql b/src/test/regress/sql/cluster.sql index b7115f861044d..f90c6ec200b4a 100644 --- a/src/test/regress/sql/cluster.sql +++ b/src/test/regress/sql/cluster.sql @@ -76,7 +76,6 @@ INSERT INTO clstr_tst (b, c) VALUES (1111, 'this should fail'); SELECT conname FROM pg_constraint WHERE conrelid = 'clstr_tst'::regclass ORDER BY 1; - SELECT relname, relkind, EXISTS(SELECT 1 FROM pg_class WHERE oid = c.reltoastrelid) AS hastoast FROM pg_class c WHERE relname LIKE 'clstr_tst%' ORDER BY relname; @@ -229,6 +228,26 @@ SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM o CLUSTER clstrpart; ALTER TABLE clstrpart SET WITHOUT CLUSTER; ALTER TABLE clstrpart CLUSTER ON clstrpart_idx; +-- and they cannot get an index-ordered REPACK without an explicit index name +REPACK clstrpart USING INDEX; + +-- Check that REPACK sets new relfilenodes: it should process exactly the same +-- tables as CLUSTER did. 
+DROP TABLE old_cluster_info; +DROP TABLE new_cluster_info; +CREATE TEMP TABLE old_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +REPACK clstrpart USING INDEX clstrpart_idx; +CREATE TEMP TABLE new_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM old_cluster_info AS old JOIN new_cluster_info AS new USING (relname) ORDER BY relname COLLATE "C"; + +-- And finally the same for REPACK w/o index. +DROP TABLE old_cluster_info; +DROP TABLE new_cluster_info; +CREATE TEMP TABLE old_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +REPACK clstrpart; +CREATE TEMP TABLE new_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM old_cluster_info AS old JOIN new_cluster_info AS new USING (relname) ORDER BY relname COLLATE "C"; + DROP TABLE clstrpart; -- Ownership of partitions is checked @@ -313,6 +332,57 @@ EXPLAIN (COSTS OFF) SELECT * FROM clstr_expression WHERE -a = -3 ORDER BY -a, b; SELECT * FROM clstr_expression WHERE -a = -3 ORDER BY -a, b; COMMIT; +---------------------------------------------------------------------- +-- +-- REPACK +-- +---------------------------------------------------------------------- + +-- REPACK handles individual tables identically to CLUSTER, but it's worth +-- checking if it handles table hierarchies identically as well. 
+REPACK clstr_tst USING INDEX clstr_tst_c; + +-- Verify that inheritance link still works +INSERT INTO clstr_tst_inh VALUES (0, 100, 'in child table 2'); +SELECT a,b,c,substring(d for 30), length(d) from clstr_tst; + +-- Verify that foreign key link still works +INSERT INTO clstr_tst (b, c) VALUES (1111, 'this should fail'); + +SELECT conname FROM pg_constraint WHERE conrelid = 'clstr_tst'::regclass +ORDER BY 1; + +-- Verify partial analyze works +REPACK (ANALYZE) clstr_tst (a); +REPACK (ANALYZE) clstr_tst; +REPACK (VERBOSE) clstr_tst (a); + +-- REPACK w/o argument performs no ordering, so we can only check which tables +-- have the relfilenode changed. +RESET SESSION AUTHORIZATION; +CREATE TEMP TABLE relnodes_old AS +(SELECT relname, relfilenode +FROM pg_class +WHERE relname IN ('clstr_1', 'clstr_2', 'clstr_3')); + +SET SESSION AUTHORIZATION regress_clstr_user; +SET client_min_messages = ERROR; -- order of "skipping" warnings may vary +REPACK; +RESET client_min_messages; + +RESET SESSION AUTHORIZATION; +CREATE TEMP TABLE relnodes_new AS +(SELECT relname, relfilenode +FROM pg_class +WHERE relname IN ('clstr_1', 'clstr_2', 'clstr_3')); + +-- Do the actual comparison. Unlike CLUSTER, clstr_3 should have been +-- processed because there is nothing like clustering index here. 
+SELECT o.relname FROM relnodes_old o +JOIN relnodes_new n ON o.relname = n.relname +WHERE o.relfilenode <> n.relfilenode +ORDER BY o.relname; + -- clean up DROP TABLE clustertest; DROP TABLE clstr_1; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 3250564d4ff67..744ef29d44ce0 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2581,6 +2581,8 @@ ReorderBufferTupleCidEnt ReorderBufferTupleCidKey ReorderBufferUpdateProgressTxnCB ReorderTuple +RepackCommand +RepackStmt ReparameterizeForeignPathByChild_function ReplOriginId ReplOriginXactState From c2a23dcf9e3af1c80a99b4ee43f0885eb6894e3a Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Tue, 10 Mar 2026 15:24:39 -0400 Subject: [PATCH 20/32] Use the newest to-be-frozen xid as the conflict horizon for freezing Previously WAL records that froze tuples used OldestXmin as the snapshot conflict horizon, or the visibility cutoff if the page would become all-frozen. Both are newer than (or equal to) the newest XID actually frozen on the page. Track the newest XID that will be frozen and use that as the snapshot conflict horizon instead. This yields an older horizon resulting in fewer query cancellations on standbys. Author: Melanie Plageman Reviewed-by: Peter Geoghegan Discussion: https://postgr.es/m/CAAKRu_bbaUV8OUjAfVa_iALgKnTSfB4gO3jnkfpcFgrxEpSGJQ%40mail.gmail.com --- src/backend/access/heap/heapam.c | 14 +++++++++++ src/backend/access/heap/pruneheap.c | 36 +++++++++-------------------- src/include/access/heapam.h | 12 ++++++++++ 3 files changed, 37 insertions(+), 25 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 1ecc83308510c..8f1c11a93500d 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -7089,6 +7089,12 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, * process this tuple as part of freezing its page, and return true.
Return * false if nothing can be changed about the tuple right now. * + * FreezePageConflictXid is advanced only for xmin/xvac freezing, not for xmax + * changes. We only remove xmax state here when it is lock-only, or when the + * updater XID (including an updater member of a MultiXact) must be aborted; + * otherwise, the tuple would already be removable. Neither case affects + * visibility on a standby. + * * Also sets *totally_frozen to true if the tuple will be totally frozen once * caller executes returned freeze plan (or if the tuple was already totally * frozen by an earlier VACUUM). This indicates that there are no remaining @@ -7164,7 +7170,11 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple, /* Verify that xmin committed if and when freeze plan is executed */ if (freeze_xmin) + { frz->checkflags |= HEAP_FREEZE_CHECK_XMIN_COMMITTED; + if (TransactionIdFollows(xid, pagefrz->FreezePageConflictXid)) + pagefrz->FreezePageConflictXid = xid; + } } /* @@ -7183,6 +7193,9 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple, */ replace_xvac = pagefrz->freeze_required = true; + if (TransactionIdFollows(xid, pagefrz->FreezePageConflictXid)) + pagefrz->FreezePageConflictXid = xid; + /* Will set replace_xvac flags in freeze plan below */ } @@ -7492,6 +7505,7 @@ heap_freeze_tuple(HeapTupleHeader tuple, pagefrz.freeze_required = true; pagefrz.FreezePageRelfrozenXid = FreezeLimit; pagefrz.FreezePageRelminMxid = MultiXactCutoff; + pagefrz.FreezePageConflictXid = InvalidTransactionId; pagefrz.NoFreezePageRelfrozenXid = FreezeLimit; pagefrz.NoFreezePageRelminMxid = MultiXactCutoff; diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 65c9f393f41a9..8748fa882e982 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -377,6 +377,7 @@ prune_freeze_setup(PruneFreezeParams *params, /* initialize page freezing working state */ prstate->pagefrz.freeze_required = false; + 
prstate->pagefrz.FreezePageConflictXid = InvalidTransactionId; if (prstate->attempt_freeze) { Assert(new_relfrozen_xid && new_relmin_mxid); @@ -407,7 +408,6 @@ prune_freeze_setup(PruneFreezeParams *params, * PruneState. */ prstate->deadoffsets = presult->deadoffsets; - prstate->frz_conflict_horizon = InvalidTransactionId; /* * Vacuum may update the VM after we're done. We can keep track of @@ -746,22 +746,8 @@ heap_page_will_freeze(bool did_tuple_hint_fpi, * critical section. */ heap_pre_freeze_checks(prstate->buffer, prstate->frozen, prstate->nfrozen); - - /* - * Calculate what the snapshot conflict horizon should be for a record - * freezing tuples. We can use the visibility_cutoff_xid as our cutoff - * for conflicts when the whole page is eligible to become all-frozen - * in the VM once we're done with it. Otherwise, we generate a - * conservative cutoff by stepping back from OldestXmin. - */ - if (prstate->set_all_frozen) - prstate->frz_conflict_horizon = prstate->visibility_cutoff_xid; - else - { - /* Avoids false conflicts when hot_standby_feedback in use */ - prstate->frz_conflict_horizon = prstate->cutoffs->OldestXmin; - TransactionIdRetreat(prstate->frz_conflict_horizon); - } + Assert(TransactionIdPrecedes(prstate->pagefrz.FreezePageConflictXid, + prstate->cutoffs->OldestXmin)); } else if (prstate->nfrozen > 0) { @@ -952,18 +938,18 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, /* * The snapshotConflictHorizon for the whole record should be the * most conservative of all the horizons calculated for any of the - * possible modifications. If this record will prune tuples, any - * transactions on the standby older than the youngest xmax of the - * most recently removed tuple this record will prune will - * conflict. If this record will freeze tuples, any transactions - * on the standby with xids older than the youngest tuple this - * record will freeze will conflict. + * possible modifications. 
If this record will prune tuples, any + * queries on the standby older than the newest xid of the most + * recently removed tuple this record will prune will conflict. If + * this record will freeze tuples, any queries on the standby with + * xids older than the newest tuple this record will freeze will + * conflict. */ TransactionId conflict_xid; - if (TransactionIdFollows(prstate.frz_conflict_horizon, + if (TransactionIdFollows(prstate.pagefrz.FreezePageConflictXid, prstate.latest_xid_removed)) - conflict_xid = prstate.frz_conflict_horizon; + conflict_xid = prstate.pagefrz.FreezePageConflictXid; else conflict_xid = prstate.latest_xid_removed; diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 24a27cc043afa..ad993c07311c8 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -208,6 +208,18 @@ typedef struct HeapPageFreeze TransactionId FreezePageRelfrozenXid; MultiXactId FreezePageRelminMxid; + /* + * Newest XID that this page's freeze actions will remove from tuple + * visibility metadata (currently xmin and/or xvac). It is used to derive + * the snapshot conflict horizon for a WAL record that freezes tuples. On + * a standby, we must not replay that change while any snapshot could + * still treat that XID as running. + * + * It's only used if we execute freeze plans for this page, so there is no + * corresponding "no freeze" tracker. + */ + TransactionId FreezePageConflictXid; + /* * "No freeze" NewRelfrozenXid/NewRelminMxid trackers. * From 138592d1b06634b85d4b0275ba6501676bb8113a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 11 Mar 2026 00:06:09 +0200 Subject: [PATCH 21/32] Don't clear pendingRecoveryConflicts at end of transaction Commit 17f51ea818 introduced a new pendingRecoveryConflicts field in PGPROC to replace the various ProcSignals. The new field was cleared in ProcArrayEndTransaction(), which makes sense for conflicts with e.g. locks or buffer pins which are gone at end of transaction. 
But it is not appropriate for conflicts on a database, or a logical slot. Because of this, the 035_standby_logical_decoding.pl test was occasionally getting stuck in the buildfarm. It happens if the startup process signals recovery conflict with the logical slot just when the walsender process using the slot calls ProcArrayEndTransaction(). To fix, don't clear pendingRecoveryConflicts in ProcArrayEndTransaction(). We could still clear certain conflict flags, like conflicts on locks, but we didn't try to do that before commit 17f51ea818 either. In passing, fix a misspelled comment, and make InitAuxiliaryProcess() also clear pendingRecoveryConflicts. I don't think aux processes can have recovery conflicts, but it seems best to initialize the field and keep InitAuxiliaryProcess() as close to InitProcess() as possible. Analyzed-by: Alexander Lakhin Discussion: https://www.postgresql.org/message-id/3e07149d-060b-48a0-8f94-3d5e4946ae45@gmail.com --- src/backend/storage/ipc/procarray.c | 7 +------ src/backend/storage/lmgr/proc.c | 1 + 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 406b8253f8bbf..0f913897acc92 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -710,8 +710,6 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) /* be sure this is cleared in abort */ proc->delayChkptFlags = 0; - pg_atomic_write_u32(&proc->pendingRecoveryConflicts, 0); - /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ if (proc->statusFlags & PROC_VACUUM_STATE_MASK) @@ -752,8 +750,6 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) /* be sure this is cleared in abort */ proc->delayChkptFlags = 0; - pg_atomic_write_u32(&proc->pendingRecoveryConflicts, 0); - /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ if (proc->statusFlags &
PROC_VACUUM_STATE_MASK) @@ -935,7 +931,6 @@ ProcArrayClearTransaction(PGPROC *proc) proc->vxid.lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; - pg_atomic_write_u32(&proc->pendingRecoveryConflicts, 0); Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK)); Assert(!proc->delayChkptFlags); @@ -3526,7 +3521,7 @@ SignalRecoveryConflictWithVirtualXID(VirtualTransactionId vxid, RecoveryConflict } /* - * SignalRecoveryConflictWithDatabase --- signal all backends specified database + * SignalRecoveryConflictWithDatabase -- signal backends using specified database * * Like SignalRecoveryConflict, but signals all backends using the database. */ diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index daf70d9ce2a8f..d407725e6027e 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -689,6 +689,7 @@ InitAuxiliaryProcess(void) Assert(dlist_is_empty(&(MyProc->myProcLocks[i]))); } #endif + pg_atomic_write_u32(&MyProc->pendingRecoveryConflicts, 0); /* * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch From 4c7362c553663d24b479e6f286720e5175c93d42 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Tue, 10 Mar 2026 18:28:18 -0400 Subject: [PATCH 22/32] Remove unused PruneState member frz_conflict_horizon c2a23dcf9e3af1c removed use of PruneState.frz_conflict_horizon but neglected to actually remove the member. Do that now. --- src/backend/access/heap/pruneheap.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 8748fa882e982..6beeb6956e3e2 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -114,13 +114,6 @@ typedef struct */ HeapPageFreeze pagefrz; - /* - * The snapshot conflict horizon used when freezing tuples. The final - * snapshot conflict horizon for the record may be newer if pruning - * removes newer transaction IDs. 
- */ - TransactionId frz_conflict_horizon; - /*------------------------------------------------------- * Information about what was done * From 4c910f3bbe92aa4e84ff15fa27b4de2da0d7ae50 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 11 Mar 2026 07:36:10 +0900 Subject: [PATCH 23/32] bloom: Optimize bitmap scan path with streaming read This commit replaces the per-page buffer read loop in blgetbitmap() with a reading stream, to improve scan efficiency, particularly useful for large bloom indexes. Some benchmarking with a large number of rows has shown a very nice improvement in terms of runtime and IO read reduction with test cases up to 10M rows for a bloom index scan. For the io_uring method, the author has reported a 3x improvement in runtime with io_uring while I was at close to a 7x. For the worker method with 3 workers, the author has reported better numbers than myself in runtime, with the reduction in IO stats being appealing for all the cases measured. Author: Xuneng Zhou Reviewed-by: Michael Paquier Reviewed-by: Nazir Bilal Yavuz Discussion: https://postgr.es/m/CABPTF7VrqfbcDXqGrdLQ2xaQ=K0RzExNuw6U_GGqzSJu32wfdQ@mail.gmail.com --- contrib/bloom/blscan.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/contrib/bloom/blscan.c b/contrib/bloom/blscan.c index 0535d45f2d825..1a0e42021ec1e 100644 --- a/contrib/bloom/blscan.c +++ b/contrib/bloom/blscan.c @@ -18,6 +18,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" +#include "storage/read_stream.h" /* * Begin scan of bloom index.
@@ -76,11 +77,13 @@ int64 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) { int64 ntids = 0; - BlockNumber blkno = BLOOM_HEAD_BLKNO, + BlockNumber blkno, npages; int i; BufferAccessStrategy bas; BloomScanOpaque so = (BloomScanOpaque) scan->opaque; + BlockRangeReadStreamPrivate p; + ReadStream *stream; if (so->sign == NULL) { @@ -120,14 +123,29 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) if (scan->instrument) scan->instrument->nsearches++; + /* Scan all blocks except the metapage using streaming reads */ + p.current_blocknum = BLOOM_HEAD_BLKNO; + p.last_exclusive = npages; + + /* + * It is safe to use batchmode as block_range_read_stream_cb takes no + * locks. + */ + stream = read_stream_begin_relation(READ_STREAM_FULL | + READ_STREAM_USE_BATCHING, + bas, + scan->indexRelation, + MAIN_FORKNUM, + block_range_read_stream_cb, + &p, + 0); + for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) { Buffer buffer; Page page; - buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM, - blkno, RBM_NORMAL, bas); - + buffer = read_stream_next_buffer(stream, NULL); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); @@ -163,6 +181,9 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) UnlockReleaseBuffer(buffer); CHECK_FOR_INTERRUPTS(); } + + Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer); + read_stream_end(stream); FreeAccessStrategy(bas); return ntids; From 82467f627bd478569de04f4a3f1993098e80c812 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 10 Mar 2026 19:32:13 -0400 Subject: [PATCH 24/32] Require share-exclusive lock to set hint bits and to flush At the moment hint bits can be set with just a share lock on a page (and, until 45f658dacb9, in one case even without any lock). Because of this we need to copy pages while writing them out, as otherwise the checksum could be corrupted. 
The need to copy the page is problematic to implement AIO writes: 1) Instead of just needing a single buffer for a copied page we need one for each page that's potentially undergoing I/O 2) To be able to use the "worker" AIO implementation the copied page needs to reside in shared memory It also causes problems for using unbuffered/direct-IO, independent of AIO: Some filesystems, raid implementations, ... do not tolerate the data being written out to change during the write. E.g. they may compute internal checksums that can be invalidated by concurrent modifications, leading e.g. to filesystem errors (as is the case with btrfs). It also just is plain odd to allow modifications of buffers that are just share locked. To address these issues, this commit changes the rules so that modifications to pages are not allowed anymore while holding a share lock. Instead the new share-exclusive lock (introduced in fcb9c977aa5) allows at most one backend to modify a buffer while other backends have the same page share locked. An existing share-lock can be upgraded to a share-exclusive lock, if there are no conflicting locks. For that BufferBeginSetHintBits()/BufferFinishSetHintBits() and BufferSetHintBits16() have been introduced. To prevent hint bits from being set while the buffer is being written out, writing out buffers now requires a share-exclusive lock. The use of share-exclusive to gate setting hint bits means that from now on only one backend can set hint bits at a time. To allow multiple backends to set hint bits would require more complicated locking: For setting hint bits we'd need to store the count of backends currently setting hint bits and we would need another lock-level for I/O conflicting with the lock-level to set hint bits.
Given that the share-exclusive lock for setting hint bits is only held for a short time, that backends would often just set the same hint bits and that the cost of occasionally not setting hint bits in hotly accessed pages is fairly low, this seems like an acceptable tradeoff. The biggest change to adapt to this is in heapam. To avoid performance regressions for sequential scans that need to set a lot of hint bits, we need to amortize the cost of BufferBeginSetHintBits() for cases where hint bits are set at a high frequency. To that end HeapTupleSatisfiesMVCCBatch() uses the new SetHintBitsExt(), which defers BufferFinishSetHintBits() until all hint bits on a page have been set. Conversely, to avoid regressions in cases where we can't set hint bits in bulk (because we're looking only at individual tuples), use BufferSetHintBits16() when setting hint bits without batching. Several other places also need to be adapted, but those changes are comparatively simpler. After this we do not need to copy buffers to write them out anymore. That change is done separately however. 
Reviewed-by: Melanie Plageman Reviewed-by: Heikki Linnakangas Reviewed-by: Chao Li Discussion: https://postgr.es/m/fvfmkr5kk4nyex56ejgxj3uzi63isfxovp2biecb4bspbjrze7@az2pljabhnff Discussion: https://postgr.es/m/stj36ea6yyhoxtqkhpieia2z4krnam7qyetc57rfezgk4zgapf%40gcnactj4z56m --- src/backend/access/gist/gistget.c | 20 +- src/backend/access/hash/hashutil.c | 14 +- src/backend/access/heap/heapam_visibility.c | 130 +++++-- src/backend/access/nbtree/nbtinsert.c | 31 +- src/backend/access/nbtree/nbtutils.c | 14 +- src/backend/access/transam/xloginsert.c | 11 +- src/backend/storage/buffer/README | 66 ++-- src/backend/storage/buffer/bufmgr.c | 396 +++++++++++++++----- src/backend/storage/freespace/freespace.c | 14 +- src/backend/storage/freespace/fsmpage.c | 11 +- src/include/storage/bufmgr.h | 4 + src/tools/pgindent/typedefs.list | 1 + 12 files changed, 528 insertions(+), 184 deletions(-) diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index b64ccf5e912cd..4d7c100d73781 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -64,11 +64,7 @@ gistkillitems(IndexScanDesc scan) * safe. */ if (BufferGetLSNAtomic(buffer) != so->curPageLSN) - { - UnlockReleaseBuffer(buffer); - so->numKilled = 0; /* reset counter */ - return; - } + goto unlock; Assert(GistPageIsLeaf(page)); @@ -78,6 +74,17 @@ gistkillitems(IndexScanDesc scan) */ for (i = 0; i < so->numKilled; i++) { + if (!killedsomething) + { + /* + * Use the hint bit infrastructure to check if we can update the + * page while just holding a share lock. If we are not allowed, + * there's no point continuing. 
+ */ + if (!BufferBeginSetHintBits(buffer)) + goto unlock; + } + offnum = so->killedItems[i]; iid = PageGetItemId(page, offnum); ItemIdMarkDead(iid); @@ -87,9 +94,10 @@ gistkillitems(IndexScanDesc scan) if (killedsomething) { GistMarkPageHasGarbage(page); - MarkBufferDirtyHint(buffer, true); + BufferFinishSetHintBits(buffer, true, true); } +unlock: UnlockReleaseBuffer(buffer); /* diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index cf7f0b9017631..3e16119d0276c 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -593,6 +593,17 @@ _hash_kill_items(IndexScanDesc scan) if (ItemPointerEquals(&ituple->t_tid, &currItem->heapTid)) { + if (!killedsomething) + { + /* + * Use the hint bit infrastructure to check if we can + * update the page while just holding a share lock. If we + * are not allowed, there's no point continuing. + */ + if (!BufferBeginSetHintBits(so->currPos.buf)) + goto unlock_page; + } + /* found the item */ ItemIdMarkDead(iid); killedsomething = true; @@ -610,9 +621,10 @@ _hash_kill_items(IndexScanDesc scan) if (killedsomething) { opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES; - MarkBufferDirtyHint(buf, true); + BufferFinishSetHintBits(so->currPos.buf, true, true); } +unlock_page: if (so->hashso_bucket_buf == so->currPos.buf || havePin) LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index 75ae268d753c2..fc64f4343ce02 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -80,10 +80,38 @@ /* - * SetHintBits() + * To be allowed to set hint bits, SetHintBits() needs to call + * BufferBeginSetHintBits(). However, that's not free, and some callsites call + * SetHintBits() on many tuples in a row. For those it makes sense to amortize + * the cost of BufferBeginSetHintBits(). 
Additionally it's desirable to defer + * the cost of BufferBeginSetHintBits() until a hint bit needs to actually be + * set. This enum serves as the necessary state space passed to + * SetHintBitsExt(). + */ +typedef enum SetHintBitsState +{ + /* not yet checked if hint bits may be set */ + SHB_INITIAL, + /* failed to get permission to set hint bits, don't check again */ + SHB_DISABLED, + /* allowed to set hint bits */ + SHB_ENABLED, +} SetHintBitsState; + +/* + * SetHintBitsExt() * * Set commit/abort hint bits on a tuple, if appropriate at this time. * + * To be allowed to set a hint bit on a tuple, the page must not be undergoing + * IO at this time (otherwise we e.g. could corrupt PG's page checksum or even + * the filesystem's, as is known to happen with btrfs). + * + * The right to set a hint bit can be acquired on a page level with + * BufferBeginSetHintBits(). Only a single backend gets the right to set hint + * bits at a time. Alternatively, if called with a NULL SetHintBitsState*, + * hint bits are set with BufferSetHintBits16(). + * * It is only safe to set a transaction-committed hint bit if we know the * transaction's commit record is guaranteed to be flushed to disk before the * buffer, or if the table is temporary or unlogged and will be obliterated by @@ -111,24 +139,67 @@ * InvalidTransactionId if no check is needed. */ static inline void -SetHintBits(HeapTupleHeader tuple, Buffer buffer, - uint16 infomask, TransactionId xid) +SetHintBitsExt(HeapTupleHeader tuple, Buffer buffer, + uint16 infomask, TransactionId xid, SetHintBitsState *state) { + /* + * In batched mode, if we previously did not get permission to set hint + * bits, don't try again - in all likelihood IO is still going on. + */ + if (state && *state == SHB_DISABLED) + return; + if (TransactionIdIsValid(xid)) { - /* NB: xid must be known committed here! 
*/ - XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid); + if (BufferIsPermanent(buffer)) + { + /* NB: xid must be known committed here! */ + XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid); + + if (XLogNeedsFlush(commitLSN) && + BufferGetLSNAtomic(buffer) < commitLSN) + { + /* not flushed and no LSN interlock, so don't set hint */ + return; + } + } + } + + /* + * If we're not operating in batch mode, use BufferSetHintBits16() to mark + * the page dirty, that's cheaper than + * BufferBeginSetHintBits()/BufferFinishSetHintBits(). That's important + * for cases where we set a lot of hint bits on a page individually. + */ + if (!state) + { + BufferSetHintBits16(&tuple->t_infomask, + tuple->t_infomask | infomask, buffer); + return; + } - if (BufferIsPermanent(buffer) && XLogNeedsFlush(commitLSN) && - BufferGetLSNAtomic(buffer) < commitLSN) + if (*state == SHB_INITIAL) + { + if (!BufferBeginSetHintBits(buffer)) { - /* not flushed and no LSN interlock, so don't set hint */ + *state = SHB_DISABLED; return; } - } + *state = SHB_ENABLED; + } tuple->t_infomask |= infomask; - MarkBufferDirtyHint(buffer, true); +} + +/* + * Simple wrapper around SetHintBitExt(), use when operating on a single + * tuple. + */ +static inline void +SetHintBits(HeapTupleHeader tuple, Buffer buffer, + uint16 infomask, TransactionId xid) +{ + SetHintBitsExt(tuple, buffer, infomask, xid, NULL); } /* @@ -864,9 +935,9 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, * inserting/deleting transaction was still running --- which was more cycles * and more contention on ProcArrayLock. 
*/ -static bool +static inline bool HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, - Buffer buffer) + Buffer buffer, SetHintBitsState *state) { HeapTupleHeader tuple = htup->t_data; @@ -921,8 +992,8 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) { /* deleting subtransaction must have aborted */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); + SetHintBitsExt(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId, state); return true; } @@ -934,13 +1005,13 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, else if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot)) return false; else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); + SetHintBitsExt(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple), state); else { /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); + SetHintBitsExt(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId, state); return false; } } @@ -1003,14 +1074,14 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) { /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); + SetHintBitsExt(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId, state); return true; } /* xmax transaction committed */ - SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); + SetHintBitsExt(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple), state); } else { @@ -1607,9 +1678,10 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, * ->vistuples_dense is set to contain the offsets of visible tuples. 
* * The reason this is more efficient than HeapTupleSatisfiesMVCC() is that it - * avoids a cross-translation-unit function call for each tuple and allows the - * compiler to optimize across calls to HeapTupleSatisfiesMVCC. In the future - * it will also allow more efficient setting of hint bits. + * avoids a cross-translation-unit function call for each tuple, allows the + * compiler to optimize across calls to HeapTupleSatisfiesMVCC and allows + * setting hint bits more efficiently (see the one BufferFinishSetHintBits() + * call below). * * Returns the number of visible tuples. */ @@ -1620,6 +1692,7 @@ HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer, OffsetNumber *vistuples_dense) { int nvis = 0; + SetHintBitsState state = SHB_INITIAL; Assert(IsMVCCSnapshot(snapshot)); @@ -1628,7 +1701,7 @@ HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer, bool valid; HeapTuple tup = &batchmvcc->tuples[i]; - valid = HeapTupleSatisfiesMVCC(tup, snapshot, buffer); + valid = HeapTupleSatisfiesMVCC(tup, snapshot, buffer, &state); batchmvcc->visible[i] = valid; if (likely(valid)) @@ -1638,6 +1711,9 @@ HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer, } } + if (state == SHB_ENABLED) + BufferFinishSetHintBits(buffer, true, true); + return nvis; } @@ -1657,7 +1733,7 @@ HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer) switch (snapshot->snapshot_type) { case SNAPSHOT_MVCC: - return HeapTupleSatisfiesMVCC(htup, snapshot, buffer); + return HeapTupleSatisfiesMVCC(htup, snapshot, buffer, NULL); case SNAPSHOT_SELF: return HeapTupleSatisfiesSelf(htup, snapshot, buffer); case SNAPSHOT_ANY: diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index d17aaa5aa0fb8..796e1513ddf96 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -681,20 +681,31 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, { /* * The conflicting tuple 
(or all HOT chains pointed to by - * all posting list TIDs) is dead to everyone, so mark the - * index entry killed. + * all posting list TIDs) is dead to everyone, so try to + * mark the index entry killed. It's ok if we're not + * allowed to, this isn't required for correctness. */ - ItemIdMarkDead(curitemid); - opaque->btpo_flags |= BTP_HAS_GARBAGE; + Buffer buf; - /* - * Mark buffer with a dirty hint, since state is not - * crucial. Be sure to mark the proper buffer dirty. - */ + /* Be sure to operate on the proper buffer */ if (nbuf != InvalidBuffer) - MarkBufferDirtyHint(nbuf, true); + buf = nbuf; else - MarkBufferDirtyHint(insertstate->buf, true); + buf = insertstate->buf; + + /* + * Use the hint bit infrastructure to check if we can + * update the page while just holding a share lock. + * + * Can't use BufferSetHintBits16() here as we update two + * different locations. + */ + if (BufferBeginSetHintBits(buf)) + { + ItemIdMarkDead(curitemid); + opaque->btpo_flags |= BTP_HAS_GARBAGE; + BufferFinishSetHintBits(buf, true, true); + } } /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 5c50f0dd1bd91..f14ff95cb2b76 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -361,6 +361,17 @@ _bt_killitems(IndexScanDesc scan) */ if (killtuple && !ItemIdIsDead(iid)) { + if (!killedsomething) + { + /* + * Use the hint bit infrastructure to check if we can + * update the page while just holding a share lock. If we + * are not allowed, there's no point continuing. 
+ */ + if (!BufferBeginSetHintBits(buf)) + goto unlock_page; + } + /* found the item/all posting list items */ ItemIdMarkDead(iid); killedsomething = true; @@ -380,9 +391,10 @@ _bt_killitems(IndexScanDesc scan) if (killedsomething) { opaque->btpo_flags |= BTP_HAS_GARBAGE; - MarkBufferDirtyHint(buf, true); + BufferFinishSetHintBits(buf, true, true); } +unlock_page: if (!so->dropPin) _bt_unlockbuf(rel, buf); else diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index a9a1678acc97a..03c85dada710b 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -1077,11 +1077,6 @@ XLogCheckBufferNeedsBackup(Buffer buffer) * We only need to do something if page has not yet been full page written in * this checkpoint round. The LSN of the inserted wal record is returned if we * had to write, InvalidXLogRecPtr otherwise. - * - * It is possible that multiple concurrent backends could attempt to write WAL - * records. In that case, multiple copies of the same block would be recorded - * in separate WAL records by different backends, though that is still OK from - * a correctness perspective. */ XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std) @@ -1102,11 +1097,9 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) /* * We assume page LSN is first data on *every* page that can be passed to - * XLogInsert, whether it has the standard page layout or not. Since we're - * only holding a share-lock on the page, we must take the buffer header - * lock when we look at the LSN. + * XLogInsert, whether it has the standard page layout or not. 
*/ - lsn = BufferGetLSNAtomic(buffer); + lsn = PageGetLSN(BufferGetPage(buffer)); if (lsn <= RedoRecPtr) { diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index 119f31b5d6584..b332e002ba13b 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -25,21 +25,26 @@ that might need to do such a wait is instead handled by waiting to obtain the relation-level lock, which is why you'd better hold one first.) Pins may not be held across transaction boundaries, however. -Buffer content locks: there are two kinds of buffer lock, shared and exclusive, -which act just as you'd expect: multiple backends can hold shared locks on -the same buffer, but an exclusive lock prevents anyone else from holding -either shared or exclusive lock. (These can alternatively be called READ -and WRITE locks.) These locks are intended to be short-term: they should not -be held for long. Buffer locks are acquired and released by LockBuffer(). -It will *not* work for a single backend to try to acquire multiple locks on -the same buffer. One must pin a buffer before trying to lock it. +Buffer content locks: there are three kinds of buffer lock, shared, +share-exclusive and exclusive: +a) multiple backends can hold shared locks on the same buffer + (alternatively called a READ lock) +b) one backend can hold a share-exclusive lock on a buffer while multiple + backends can hold a share lock +c) an exclusive lock prevents anyone else from holding a shared, + share-exclusive or exclusive lock. + (alternatively called a WRITE lock) + +These locks are intended to be short-term: they should not be held for long. +Buffer locks are acquired and released by LockBuffer(). It will *not* work +for a single backend to try to acquire multiple locks on the same buffer. One +must pin a buffer before trying to lock it. Buffer access rules: -1. To scan a page for tuples, one must hold a pin and either shared or -exclusive content lock. 
To examine the commit status (XIDs and status bits) -of a tuple in a shared buffer, one must likewise hold a pin and either shared -or exclusive lock. +1. To scan a page for tuples, one must hold a pin and at least a share lock. +To examine the commit status (XIDs and status bits) of a tuple in a shared +buffer, one must likewise hold a pin and at least a share lock. 2. Once one has determined that a tuple is interesting (visible to the current transaction) one may drop the content lock, yet continue to access @@ -55,19 +60,25 @@ one must hold a pin and an exclusive content lock on the containing buffer. This ensures that no one else might see a partially-updated state of the tuple while they are doing visibility checks. -4. It is considered OK to update tuple commit status bits (ie, OR the -values HEAP_XMIN_COMMITTED, HEAP_XMIN_INVALID, HEAP_XMAX_COMMITTED, or -HEAP_XMAX_INVALID into t_infomask) while holding only a shared lock and -pin on a buffer. This is OK because another backend looking at the tuple -at about the same time would OR the same bits into the field, so there -is little or no risk of conflicting update; what's more, if there did -manage to be a conflict it would merely mean that one bit-update would -be lost and need to be done again later. These four bits are only hints -(they cache the results of transaction status lookups in pg_xact), so no -great harm is done if they get reset to zero by conflicting updates. -Note, however, that a tuple is frozen by setting both HEAP_XMIN_INVALID -and HEAP_XMIN_COMMITTED; this is a critical update and accordingly requires -an exclusive buffer lock (and it must also be WAL-logged). +4. Non-critical information on a page ("hint bits") may be modified while +holding only a share-exclusive lock and pin on the page. 
To do so in cases +where only a share lock is already held, use BufferBeginSetHintBits() & +BufferFinishSetHintBits() (if multiple hint bits are to be set) or +BufferSetHintBits16() (if a single hint bit is set). + +E.g. for heapam, a share-exclusive lock allows to update tuple commit status +bits (ie, OR the values HEAP_XMIN_COMMITTED, HEAP_XMIN_INVALID, +HEAP_XMAX_COMMITTED, or HEAP_XMAX_INVALID into t_infomask) while holding only +a share-exclusive lock and pin on a buffer. This is OK because another +backend looking at the tuple at about the same time would OR the same bits +into the field, so there is little or no risk of conflicting update; what's +more, if there did manage to be a conflict it would merely mean that one +bit-update would be lost and need to be done again later. These four bits are +only hints (they cache the results of transaction status lookups in pg_xact), +so no great harm is done if they get reset to zero by conflicting updates. +Note, however, that a tuple is frozen by setting both HEAP_XMIN_INVALID and +HEAP_XMIN_COMMITTED; this is a critical update and accordingly requires an +exclusive buffer lock (and it must also be WAL-logged). 5. To physically remove a tuple or compact free space on a page, one must hold a pin and an exclusive lock, *and* observe while holding the @@ -80,7 +91,6 @@ buffer (increment the refcount) while one is performing the cleanup, but it won't be able to actually examine the page until it acquires shared or exclusive content lock. - Obtaining the lock needed under rule #5 is done by the bufmgr routines LockBufferForCleanup() or ConditionalLockBufferForCleanup(). They first get an exclusive lock and then check to see if the shared pin count is currently @@ -96,6 +106,10 @@ VACUUM's use, since we don't allow multiple VACUUMs concurrently on a single relation anyway. Anyone wishing to obtain a cleanup lock outside of recovery or a VACUUM must use the conditional variant of the function. +6. 
To write out a buffer, a share-exclusive lock needs to be held. This +prevents the buffer from being modified while written out, which could corrupt +checksums and cause issues on the OS or device level when direct-IO is used. + Buffer Manager's Internal Locking --------------------------------- diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 5f3d083e93886..0546ee0193ce8 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2481,10 +2481,10 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) /* * If the buffer was dirty, try to write it out. There is a race - * condition here, in that someone might dirty it after we released the - * buffer header lock above, or even while we are writing it out (since - * our share-lock won't prevent hint-bit updates). We will recheck the - * dirty bit after re-locking the buffer header. + * condition here, another backend could dirty the buffer between + * StrategyGetBuffer() checking that it is not in use and invalidating the + * buffer below. That's addressed by InvalidateVictimBuffer() verifying + * that the buffer is not dirty. */ if (buf_state & BM_DIRTY) { @@ -2492,20 +2492,20 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) Assert(buf_state & BM_VALID); /* - * We need a share-lock on the buffer contents to write it out (else - * we might write invalid data, eg because someone else is compacting - * the page contents while we write). We must use a conditional lock - * acquisition here to avoid deadlock. Even though the buffer was not - * pinned (and therefore surely not locked) when StrategyGetBuffer - * returned it, someone else could have pinned and exclusive-locked it - * by the time we get here. If we try to get the lock unconditionally, - * we'd block waiting for them; if they later block waiting for us, - * deadlock ensues. 
(This has been observed to happen when two - * backends are both trying to split btree index pages, and the second - * one just happens to be trying to split the page the first one got - * from StrategyGetBuffer.) + * We need a share-exclusive lock on the buffer contents to write it + * out (else we might write invalid data, eg because someone else is + * compacting the page contents while we write). We must use a + * conditional lock acquisition here to avoid deadlock. Even though + * the buffer was not pinned (and therefore surely not locked) when + * StrategyGetBuffer returned it, someone else could have pinned and + * (share-)exclusive-locked it by the time we get here. If we try to + * get the lock unconditionally, we'd block waiting for them; if they + * later block waiting for us, deadlock ensues. (This has been + * observed to happen when two backends are both trying to split btree + * index pages, and the second one just happens to be trying to split + * the page the first one got from StrategyGetBuffer.) */ - if (!BufferLockConditional(buf, buf_hdr, BUFFER_LOCK_SHARE)) + if (!BufferLockConditional(buf, buf_hdr, BUFFER_LOCK_SHARE_EXCLUSIVE)) { /* * Someone else has locked the buffer, so give it up and loop back @@ -2518,18 +2518,14 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) /* * If using a nondefault strategy, and writing the buffer would * require a WAL flush, let the strategy decide whether to go ahead - * and write/reuse the buffer or to choose another victim. We need a - * lock to inspect the page LSN, so this can't be done inside + * and write/reuse the buffer or to choose another victim. We need to + * hold the content lock in at least share-exclusive mode to safely + * inspect the page LSN, so this couldn't have been done inside * StrategyGetBuffer. 
*/ if (strategy != NULL) { - XLogRecPtr lsn; - - /* Read the LSN while holding buffer header lock */ - buf_state = LockBufHdr(buf_hdr); - lsn = BufferGetLSN(buf_hdr); - UnlockBufHdr(buf_hdr); + XLogRecPtr lsn = BufferGetLSN(buf_hdr); if (XLogNeedsFlush(lsn) && StrategyRejectBuffer(strategy, buf_hdr, from_ring)) @@ -3019,7 +3015,7 @@ BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode) * * Checks if buffer is already dirty. * - * Buffer must be pinned and exclusive-locked. (Without an exclusive lock, + * Buffer must be pinned and [share-]exclusive-locked. (Without such a lock, * the result may be stale before it's returned.) */ bool @@ -3039,7 +3035,8 @@ BufferIsDirty(Buffer buffer) else { bufHdr = GetBufferDescriptor(buffer - 1); - Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE)); + Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_SHARE_EXCLUSIVE) || + BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE)); } return pg_atomic_read_u64(&bufHdr->state) & BM_DIRTY; @@ -4074,8 +4071,8 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) } /* - * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the - * buffer is clean by the time we've locked it.) + * Pin it, share-exclusive-lock it, write it. (FlushBuffer will do + * nothing if the buffer is clean by the time we've locked it.) */ PinBuffer_Locked(bufHdr); @@ -4405,11 +4402,8 @@ BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, * However, we will need to force the changes to disk via fsync before * we can checkpoint WAL. * - * The caller must hold a pin on the buffer and have share-locked the - * buffer contents. (Note: a share-lock does not prevent updates of - * hint bits in the buffer, so the page could change while the write - * is in progress, but we assume that that will not invalidate the data - * written.) + * The caller must hold a pin on the buffer and have + * (share-)exclusively-locked the buffer contents. 
* * If the caller has an smgr reference for the buffer's relation, pass it * as the second parameter. If not, pass NULL. @@ -4425,6 +4419,9 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, char *bufToWrite; uint64 buf_state; + Assert(BufferLockHeldByMeInMode(buf, BUFFER_LOCK_EXCLUSIVE) || + BufferLockHeldByMeInMode(buf, BUFFER_LOCK_SHARE_EXCLUSIVE)); + /* * Try to start an I/O operation. If StartBufferIO returns false, then * someone else flushed the buffer before we could, so we need not do @@ -4452,8 +4449,8 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, buf_state = LockBufHdr(buf); /* - * Run PageGetLSN while holding header lock, since we don't have the - * buffer locked exclusively in all cases. + * As we hold at least a share-exclusive lock on the buffer, the LSN + * cannot change during the flush (and thus can't be torn). */ recptr = BufferGetLSN(buf); @@ -4557,7 +4554,7 @@ FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln, { Buffer buffer = BufferDescriptorGetBuffer(buf); - BufferLockAcquire(buffer, buf, BUFFER_LOCK_SHARE); + BufferLockAcquire(buffer, buf, BUFFER_LOCK_SHARE_EXCLUSIVE); FlushBuffer(buf, reln, IOOBJECT_RELATION, IOCONTEXT_NORMAL); BufferLockUnlock(buffer, buf); } @@ -4629,8 +4626,9 @@ BufferIsPermanent(Buffer buffer) /* * BufferGetLSNAtomic * Retrieves the LSN of the buffer atomically using a buffer header lock. - * This is necessary for some callers who may not have an exclusive lock - * on the buffer. + * This is necessary for some callers who may only hold a share lock on + * the buffer. A share lock allows a concurrent backend to set hint bits + * on the page, which in turn may require a WAL record to be emitted. */ XLogRecPtr BufferGetLSNAtomic(Buffer buffer) @@ -5476,8 +5474,8 @@ FlushDatabaseBuffers(Oid dbid) } /* - * Flush a previously, shared or exclusively, locked and pinned buffer to the - * OS. 
+ * Flush a previously, share-exclusively or exclusively, locked and pinned + * buffer to the OS. */ void FlushOneBuffer(Buffer buffer) @@ -5550,56 +5548,38 @@ IncrBufferRefCount(Buffer buffer) } /* - * MarkBufferDirtyHint - * - * Mark a buffer dirty for non-critical changes. + * Shared-buffer only helper for MarkBufferDirtyHint() and + * BufferSetHintBits16(). * - * This is essentially the same as MarkBufferDirty, except: - * - * 1. The caller does not write WAL; so if checksums are enabled, we may need - * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages. - * 2. The caller might have only share-lock instead of exclusive-lock on the - * buffer's content lock. - * 3. This function does not guarantee that the buffer is always marked dirty - * (due to a race condition), so it cannot be used for important changes. + * This is separated out because it turns out that the repeated checks for + * local buffers, repeated GetBufferDescriptor() and repeated reading of the + * buffer's state sufficiently hurts the performance of BufferSetHintBits16(). 
*/ -void -MarkBufferDirtyHint(Buffer buffer, bool buffer_std) +static inline void +MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, + bool buffer_std) { - BufferDesc *bufHdr; Page page = BufferGetPage(buffer); - if (!BufferIsValid(buffer)) - elog(ERROR, "bad buffer ID: %d", buffer); - - if (BufferIsLocal(buffer)) - { - MarkLocalBufferDirty(buffer); - return; - } - - bufHdr = GetBufferDescriptor(buffer - 1); - Assert(GetPrivateRefCount(buffer) > 0); - /* here, either share or exclusive lock is OK */ - Assert(BufferIsLockedByMe(buffer)); + + /* here, either share-exclusive or exclusive lock is OK */ + Assert(BufferLockHeldByMeInMode(bufHdr, BUFFER_LOCK_EXCLUSIVE) || + BufferLockHeldByMeInMode(bufHdr, BUFFER_LOCK_SHARE_EXCLUSIVE)); /* * This routine might get called many times on the same page, if we are * making the first scan after commit of an xact that added/deleted many - * tuples. So, be as quick as we can if the buffer is already dirty. We - * do this by not acquiring spinlock if it looks like the status bits are - * already set. Since we make this test unlocked, there's a chance we - * might fail to notice that the flags have just been cleared, and failed - * to reset them, due to memory-ordering issues. But since this function - * is only intended to be used in cases where failing to write out the - * data would be harmless anyway, it doesn't really matter. + * tuples. So, be as quick as we can if the buffer is already dirty. + * + * As we are holding (at least) a share-exclusive lock, nobody could have + * cleaned or dirtied the page concurrently, so we can just rely on the + * previously fetched value here without any danger of races. 
*/ - if ((pg_atomic_read_u64(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) != - (BM_DIRTY | BM_JUST_DIRTIED)) + if (unlikely((lockstate & (BM_DIRTY | BM_JUST_DIRTIED)) != + (BM_DIRTY | BM_JUST_DIRTIED))) { XLogRecPtr lsn = InvalidXLogRecPtr; - bool dirtied = false; bool delayChkptFlags = false; uint64 buf_state; @@ -5612,8 +5592,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) * We don't check full_page_writes here because that logic is included * when we call XLogInsert() since the value changes dynamically. */ - if (XLogHintBitIsNeeded() && - (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT)) + if (XLogHintBitIsNeeded() && (lockstate & BM_PERMANENT)) { /* * If we must not write WAL, due to a relfilelocator-specific @@ -5658,27 +5637,29 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) buf_state = LockBufHdr(bufHdr); + /* + * It should not be possible for the buffer to already be dirty, see + * comment above. + */ + Assert(!(buf_state & BM_DIRTY)); Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); - if (!(buf_state & BM_DIRTY)) + if (XLogRecPtrIsValid(lsn)) { - dirtied = true; /* Means "will be dirtied by this action" */ - /* - * Set the page LSN if we wrote a backup block. We aren't supposed - * to set this when only holding a share lock but as long as we - * serialise it somehow we're OK. We choose to set LSN while - * holding the buffer header lock, which causes any reader of an - * LSN who holds only a share lock to also obtain a buffer header - * lock before using PageGetLSN(), which is enforced in - * BufferGetLSNAtomic(). + * Set the page LSN if we wrote a backup block. To allow backends + * that only hold a share lock on the buffer to read the LSN in a + * tear-free manner, we set the page LSN while holding the buffer + * header lock. This allows any reader of an LSN who holds only a + * share lock to also obtain a buffer header lock before using + * PageGetLSN() to read the LSN in a tear free way. This is done + * in BufferGetLSNAtomic(). 
* * If checksums are enabled, you might think we should reset the * checksum here. That will happen when the page is written * sometime later in this checkpoint cycle. */ - if (XLogRecPtrIsValid(lsn)) - PageSetLSN(page, lsn); + PageSetLSN(page, lsn); } UnlockBufHdrExt(bufHdr, buf_state, @@ -5688,15 +5669,48 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) if (delayChkptFlags) MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; - if (dirtied) - { - pgBufferUsage.shared_blks_dirtied++; - if (VacuumCostActive) - VacuumCostBalance += VacuumCostPageDirty; - } + pgBufferUsage.shared_blks_dirtied++; + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageDirty; } } +/* + * MarkBufferDirtyHint + * + * Mark a buffer dirty for non-critical changes. + * + * This is essentially the same as MarkBufferDirty, except: + * + * 1. The caller does not write WAL; so if checksums are enabled, we may need + * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages. + * 2. The caller might have only a share-exclusive-lock instead of an + * exclusive-lock on the buffer's content lock. + * 3. This function does not guarantee that the buffer is always marked dirty + * (it e.g. can't always on a hot standby), so it cannot be used for + * important changes. + */ +inline void +MarkBufferDirtyHint(Buffer buffer, bool buffer_std) +{ + BufferDesc *bufHdr; + + bufHdr = GetBufferDescriptor(buffer - 1); + + if (!BufferIsValid(buffer)) + elog(ERROR, "bad buffer ID: %d", buffer); + + if (BufferIsLocal(buffer)) + { + MarkLocalBufferDirty(buffer); + return; + } + + MarkSharedBufferDirtyHint(buffer, bufHdr, + pg_atomic_read_u64(&bufHdr->state), + buffer_std); +} + /* * Release buffer content locks for shared buffers. * @@ -6798,6 +6812,192 @@ IsBufferCleanupOK(Buffer buffer) return false; } +/* + * Helper for BufferBeginSetHintBits() and BufferSetHintBits16(). 
+ * + * This checks if the current lock mode already suffices to allow hint bits + * being set and, if not, whether the current lock can be upgraded. + * + * Updates *lockstate when returning true. + */ +static inline bool +SharedBufferBeginSetHintBits(Buffer buffer, BufferDesc *buf_hdr, uint64 *lockstate) +{ + uint64 old_state; + PrivateRefCountEntry *ref; + BufferLockMode mode; + + ref = GetPrivateRefCountEntry(buffer, true); + + if (ref == NULL) + elog(ERROR, "buffer is not pinned"); + + mode = ref->data.lockmode; + if (mode == BUFFER_LOCK_UNLOCK) + elog(ERROR, "buffer is not locked"); + + /* we're done if we are already holding a sufficient lock level */ + if (mode == BUFFER_LOCK_EXCLUSIVE || mode == BUFFER_LOCK_SHARE_EXCLUSIVE) + { + *lockstate = pg_atomic_read_u64(&buf_hdr->state); + return true; + } + + /* + * We are only holding a share lock right now, try to upgrade it to + * SHARE_EXCLUSIVE. + */ + Assert(mode == BUFFER_LOCK_SHARE); + + old_state = pg_atomic_read_u64(&buf_hdr->state); + while (true) + { + uint64 desired_state; + + desired_state = old_state; + + /* + * Can't upgrade if somebody else holds the lock in exclusive or + * share-exclusive mode. + */ + if (unlikely((old_state & (BM_LOCK_VAL_EXCLUSIVE | BM_LOCK_VAL_SHARE_EXCLUSIVE)) != 0)) + { + return false; + } + + /* currently held lock state */ + desired_state -= BM_LOCK_VAL_SHARED; + + /* new lock level */ + desired_state += BM_LOCK_VAL_SHARE_EXCLUSIVE; + + if (likely(pg_atomic_compare_exchange_u64(&buf_hdr->state, + &old_state, desired_state))) + { + ref->data.lockmode = BUFFER_LOCK_SHARE_EXCLUSIVE; + *lockstate = desired_state; + + return true; + } + } +} + +/* + * Try to acquire the right to set hint bits on the buffer. + * + * To be allowed to set hint bits, this backend needs to hold either a + * share-exclusive or an exclusive lock. In case this backend only holds a + * share lock, this function will try to upgrade the lock to + * share-exclusive. 
The caller is only allowed to set hint bits if true is + * returned. + * + * Once BufferBeginSetHintBits() has returned true, hint bits may be set + * without further calls to BufferBeginSetHintBits(), until the buffer is + * unlocked. + * + * + * Requiring a share-exclusive lock to set hint bits prevents setting hint + * bits on buffers that are currently being written out, which could corrupt + * the checksum on the page. Flushing buffers also requires a share-exclusive + * lock. + * + * Due to a lock >= share-exclusive being required to set hint bits, only one + * backend can set hint bits at a time. Allowing multiple backends to set hint + * bits would require more complicated locking: For setting hint bits we'd + * need to store the count of backends currently setting hint bits, for I/O we + * would need another lock-level conflicting with the hint-setting + * lock-level. Given that the share-exclusive lock for setting hint bits is + * only held for a short time, that backends often would just set the same + * hint bits and that the cost of occasionally not setting hint bits in hotly + * accessed pages is fairly low, this seems like an acceptable tradeoff. + */ +bool +BufferBeginSetHintBits(Buffer buffer) +{ + BufferDesc *buf_hdr; + uint64 lockstate; + + if (BufferIsLocal(buffer)) + { + /* + * NB: Will need to check if there is a write in progress, once it is + * possible for writes to be done asynchronously. + */ + return true; + } + + buf_hdr = GetBufferDescriptor(buffer - 1); + + return SharedBufferBeginSetHintBits(buffer, buf_hdr, &lockstate); +} + +/* + * End a phase of setting hint bits on this buffer, started with + * BufferBeginSetHintBits(). + * + * This would strictly speaking not be required (i.e. the caller could do + * MarkBufferDirtyHint() if so desired), but allows us to perform some sanity + * checks. 
+ */ +void +BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std) +{ + if (!BufferIsLocal(buffer)) + Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_SHARE_EXCLUSIVE) || + BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE)); + + if (mark_dirty) + MarkBufferDirtyHint(buffer, buffer_std); +} + +/* + * Try to set hint bits on a single 16bit value in a buffer. + * + * If hint bits are allowed to be set, set *ptr = val, try to mark the buffer + * dirty and return true. Otherwise false is returned. + * + * *ptr needs to be a pointer to memory within the buffer. + * + * This is a bit faster than BufferBeginSetHintBits() / + * BufferFinishSetHintBits() when setting hints once in a buffer, but slower + * than the former when setting hint bits multiple times in the same buffer. + */ +bool +BufferSetHintBits16(uint16 *ptr, uint16 val, Buffer buffer) +{ + BufferDesc *buf_hdr; + uint64 lockstate; +#ifdef USE_ASSERT_CHECKING + char *page; + + /* verify that the address is on the page */ + page = BufferGetPage(buffer); + Assert((char *) ptr >= page && (char *) ptr < (page + BLCKSZ)); +#endif + + if (BufferIsLocal(buffer)) + { + *ptr = val; + + MarkLocalBufferDirty(buffer); + + return true; + } + + buf_hdr = GetBufferDescriptor(buffer - 1); + + if (SharedBufferBeginSetHintBits(buffer, buf_hdr, &lockstate)) + { + *ptr = val; + + MarkSharedBufferDirtyHint(buffer, buf_hdr, lockstate, true); + + return true; + } + + return false; +} + /* * Functions for buffer I/O handling diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index ad337c0087182..b9a8f368a6372 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -904,13 +904,17 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, max_avail = fsm_get_max_avail(page); /* - * Reset the next slot pointer. 
This encourages the use of low-numbered - * pages, increasing the chances that a later vacuum can truncate the - * relation. We don't bother with marking the page dirty if it wasn't - * already, since this is just a hint. + * Try to reset the next slot pointer. This encourages the use of + * low-numbered pages, increasing the chances that a later vacuum can + * truncate the relation. We don't bother with marking the page dirty if + * it wasn't already, since this is just a hint. */ LockBuffer(buf, BUFFER_LOCK_SHARE); - ((FSMPage) PageGetContents(page))->fp_next_slot = 0; + if (BufferBeginSetHintBits(buf)) + { + ((FSMPage) PageGetContents(page))->fp_next_slot = 0; + BufferFinishSetHintBits(buf, false, false); + } LockBuffer(buf, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buf); diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c index 33ee825529ca0..a2657c4033b9b 100644 --- a/src/backend/storage/freespace/fsmpage.c +++ b/src/backend/storage/freespace/fsmpage.c @@ -298,9 +298,18 @@ fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext, * lock and get a garbled next pointer every now and then, than take the * concurrency hit of an exclusive lock. * + * Without an exclusive lock, we need to use the hint bit infrastructure + * to be allowed to modify the page. + * * Wrap-around is handled at the beginning of this function. */ - fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0); + if (exclusive_lock_held || BufferBeginSetHintBits(buf)) + { + fsmpage->fp_next_slot = slot + (advancenext ? 
1 : 0); + + if (!exclusive_lock_held) + BufferFinishSetHintBits(buf, false, false); + } return slot; } diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index a40adf6b2a8da..4017896f9518e 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -314,6 +314,10 @@ extern void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std); +extern bool BufferSetHintBits16(uint16 *ptr, uint16 val, Buffer buffer); +extern bool BufferBeginSetHintBits(Buffer buffer); +extern void BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std); + extern void UnlockBuffers(void); extern void UnlockBuffer(Buffer buffer); extern void LockBufferInternal(Buffer buffer, BufferLockMode mode); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 744ef29d44ce0..3da19d4141332 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2765,6 +2765,7 @@ SetConstraintStateData SetConstraintTriggerData SetExprState SetFunctionReturnMode +SetHintBitsState SetOp SetOpCmd SetOpPath From d537f59fbbfcd3b50a7208b1320a0fa73ca589f5 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 11 Mar 2026 09:22:11 +0100 Subject: [PATCH 25/32] Sort out table_open vs. relation_open in rewriter table_open() is a wrapper around relation_open() that checks that the relkind is table-like and gives a user-facing error message if not. It is best used in directly user-facing areas to check that the user used the right kind of command for the relkind. In internal uses where the relkind was previously checked from the user's perspective, table_open() is not necessary and might even be confusing if it were to give out-of-context error messages. In rewriteHandler.c, there were several such table_open() calls, which this changes to relation_open(). 
This currently doesn't make a difference, but there are plans to have other relkinds that could appear in the rewriter but that shouldn't be accessible via table-specific commands, and this clears the way for that. Reviewed-by: Ashutosh Bapat Discussion: https://www.postgresql.org/message-id/flat/6d3fef19-a420-4e11-8235-8ea534bf2080%40eisentraut.org Discussion: https://www.postgresql.org/message-id/flat/a855795d-e697-4fa5-8698-d20122126567@eisentraut.org --- src/backend/rewrite/rewriteHandler.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 7c99290be4d21..f98062668d6dc 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -195,7 +195,7 @@ AcquireRewriteLocks(Query *parsetree, else lockmode = rte->rellockmode; - rel = table_open(rte->relid, lockmode); + rel = relation_open(rte->relid, lockmode); /* * While we have the relation open, update the RTE's relkind, @@ -203,7 +203,7 @@ AcquireRewriteLocks(Query *parsetree, */ rte->relkind = rel->rd_rel->relkind; - table_close(rel, NoLock); + relation_close(rel, NoLock); break; case RTE_JOIN: @@ -2116,7 +2116,7 @@ fireRIRrules(Query *parsetree, List *activeRIRs) * We can use NoLock here since either the parser or * AcquireRewriteLocks should have locked the rel already. */ - rel = table_open(rte->relid, NoLock); + rel = relation_open(rte->relid, NoLock); /* * Collect the RIR rules that we must apply @@ -2226,7 +2226,7 @@ fireRIRrules(Query *parsetree, List *activeRIRs) rte->relkind != RELKIND_PARTITIONED_TABLE)) continue; - rel = table_open(rte->relid, NoLock); + rel = relation_open(rte->relid, NoLock); /* * Fetch any new security quals that must be applied to this RTE. @@ -3445,7 +3445,7 @@ rewriteTargetView(Query *parsetree, Relation view) * already have the right lock!) Since it will become the query target * relation, RowExclusiveLock is always the right thing. 
*/ - base_rel = table_open(base_rte->relid, RowExclusiveLock); + base_rel = relation_open(base_rte->relid, RowExclusiveLock); /* * While we have the relation open, update the RTE's relkind, just in case @@ -4021,7 +4021,7 @@ RewriteQuery(Query *parsetree, List *rewrite_events, int orig_rt_length, * We can use NoLock here since either the parser or * AcquireRewriteLocks should have locked the rel already. */ - rt_entry_relation = table_open(rt_entry->relid, NoLock); + rt_entry_relation = relation_open(rt_entry->relid, NoLock); /* * Rewrite the targetlist as needed for the command type. From e87cd16c459f5b59d7db465e3de2db617589d273 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 10 Mar 2026 23:38:41 -0700 Subject: [PATCH 26/32] Refactor handling of x86 CPUID instructions Introduce two helpers for CPUID, pg_cpuid and pg_cpuid_subleaf that wrap the platform specific __get_cpuid/__cpuid and __get_cpuid_count/__cpuidex functions. Additionally, introduce the CPUIDResult struct to make code working with CPUID easier to read by referencing the register name (e.g. ECX) instead of a numeric index. 
Author: Lukas Fittl Suggested-By: John Naylor Reviewed-by: Discussion: --- src/port/pg_cpu_x86.c | 70 ++++++++++++++++++++++---------- src/tools/pgindent/typedefs.list | 1 + 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index 7575838245cd7..0fe4753eecc29 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -53,6 +53,44 @@ mask_available(uint32 value, uint32 mask) return (value & mask) == mask; } +/* General purpose registers used by CPUID */ +typedef struct CPUIDResult +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; +} CPUIDResult; + +StaticAssertDecl(sizeof(CPUIDResult) == 4 * sizeof(unsigned int), + "CPUIDResult must have no padding"); + +static inline void +pg_cpuid(int leaf, CPUIDResult *r) +{ +#if defined(HAVE__GET_CPUID) + __get_cpuid(leaf, &r->eax, &r->ebx, &r->ecx, &r->edx); +#elif defined(HAVE__CPUID) + __cpuid((int *) r, leaf); +#else +#error cpuid instruction not available +#endif +} + +static inline bool +pg_cpuid_subleaf(int leaf, int subleaf, CPUIDResult *r) +{ +#if defined(HAVE__GET_CPUID_COUNT) + return __get_cpuid_count(leaf, subleaf, &r->eax, &r->ebx, &r->ecx, &r->edx) == 1; +#elif defined(HAVE__CPUIDEX) + __cpuidex((int *) r, leaf, subleaf); + return true; +#else + memset(r, 0, sizeof(CPUIDResult)); + return false; +#endif +} + /* * Parse the CPU ID info for runtime checks. 
*/ @@ -62,33 +100,21 @@ pg_attribute_target("xsave") void set_x86_features(void) { - unsigned int exx[4] = {0, 0, 0, 0}; + CPUIDResult r = {0}; -#if defined(HAVE__GET_CPUID) - __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 1); -#else -#error cpuid instruction not available -#endif + pg_cpuid(0x01, &r); - X86Features[PG_SSE4_2] = exx[2] >> 20 & 1; - X86Features[PG_POPCNT] = exx[2] >> 23 & 1; + X86Features[PG_SSE4_2] = r.ecx >> 20 & 1; + X86Features[PG_POPCNT] = r.ecx >> 23 & 1; /* All these features depend on OSXSAVE */ - if (exx[2] & (1 << 27)) + if (r.ecx & (1 << 27)) { uint32 xcr0_val = 0; /* second cpuid call on leaf 7 to check extended AVX-512 support */ - memset(exx, 0, 4 * sizeof(exx[0])); - -#if defined(HAVE__GET_CPUID_COUNT) - __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUIDEX) - __cpuidex(exx, 7, 0); -#endif + pg_cpuid_subleaf(0x07, 0, &r); #ifdef HAVE_XSAVE_INTRINSICS /* get value of Extended Control Register */ @@ -99,11 +125,11 @@ set_x86_features(void) if (mask_available(xcr0_val, XMM | YMM | OPMASK | ZMM0_15 | ZMM16_31)) { - X86Features[PG_AVX512_BW] = exx[1] >> 30 & 1; - X86Features[PG_AVX512_VL] = exx[1] >> 31 & 1; + X86Features[PG_AVX512_BW] = r.ebx >> 30 & 1; + X86Features[PG_AVX512_VL] = r.ebx >> 31 & 1; - X86Features[PG_AVX512_VPCLMULQDQ] = exx[2] >> 10 & 1; - X86Features[PG_AVX512_VPOPCNTDQ] = exx[2] >> 14 & 1; + X86Features[PG_AVX512_VPCLMULQDQ] = r.ecx >> 10 & 1; + X86Features[PG_AVX512_VPOPCNTDQ] = r.ecx >> 14 & 1; } } diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 3da19d4141332..489defe73626e 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -549,6 +549,7 @@ CostSelector Counters CoverExt CoverPos +CPUIDResult CreateAmStmt CreateCastStmt CreateConversionStmt From 73a159af80bfdac8c9c9afc678859bd3c3fc4c9c Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sat, 31 Jan 2026 08:49:46 
-0800 Subject: [PATCH 27/32] Check for HAVE__CPUIDEX and HAVE__GET_CPUID_COUNT separately Previously we would only check for the availability of __cpuidex if the related __get_cpuid_count was not available on a platform. But there are cases where we want to be able to call __cpuidex as the only viable option, specifically, when accessing a high leaf like VM Hypervisor information (0x40000000), which __get_cpuid_count does not allow. This will be used in a future commit to access Hypervisor information about the TSC frequency of x86 CPUs, where available. Note that __cpuidex is defined in cpuid.h for GCC/clang, but in intrin.h for MSVC. Because we now set HAVE__CPUIDEX for GCC/clang when available, adjust existing code to check for _MSC_VER when including intrin.h. Author: Lukas Fittl Reviewed-by: Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de --- configure | 22 +++++++++++++--------- configure.ac | 30 +++++++++++++++++------------- meson.build | 12 +++++++++--- src/port/pg_cpu_x86.c | 10 +++++----- 4 files changed, 44 insertions(+), 30 deletions(-) diff --git a/configure b/configure index 42621ecd05189..8279bdd8095f9 100755 --- a/configure +++ b/configure @@ -17657,7 +17657,8 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h fi fi -# Check for __get_cpuid_count() and __cpuidex() in a similar fashion. +# Check for __get_cpuid_count() and __cpuidex() separately, since we sometimes +# need __cpuidex() even if __get_cpuid_count() is available. { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5 $as_echo_n "checking for __get_cpuid_count... 
" >&6; } if ${pgac_cv__get_cpuid_count+:} false; then : @@ -17690,21 +17691,25 @@ if test x"$pgac_cv__get_cpuid_count" = x"yes"; then $as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h -else - # __cpuidex() - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5 +fi +# __cpuidex() +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5 $as_echo_n "checking for __cpuidex... " >&6; } if ${pgac_cv__cpuidex+:} false; then : $as_echo_n "(cached) " >&6 else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ -#include +#ifdef _MSC_VER + #include + #else + #include + #endif int main () { -unsigned int exx[4] = {0, 0, 0, 0}; - __cpuidex(exx, 7, 0); +int exx[4] = {0, 0, 0, 0}; + __cpuidex(exx, 7, 0); ; return 0; @@ -17720,11 +17725,10 @@ rm -f core conftest.err conftest.$ac_objext \ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5 $as_echo "$pgac_cv__cpuidex" >&6; } - if test x"$pgac_cv__cpuidex" = x"yes"; then +if test x"$pgac_cv__cpuidex" = x"yes"; then $as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h - fi fi # Check for XSAVE intrinsics diff --git a/configure.ac b/configure.ac index 61ec895d23cf3..b5e6bbcb1269c 100644 --- a/configure.ac +++ b/configure.ac @@ -2104,7 +2104,8 @@ else fi fi -# Check for __get_cpuid_count() and __cpuidex() in a similar fashion. +# Check for __get_cpuid_count() and __cpuidex() separately, since we sometimes +# need __cpuidex() even if __get_cpuid_count() is available. 
AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count], [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], [[unsigned int exx[4] = {0, 0, 0, 0}; @@ -2114,18 +2115,21 @@ AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count], [pgac_cv__get_cpuid_count="no"])]) if test x"$pgac_cv__get_cpuid_count" = x"yes"; then AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid_count.]) -else - # __cpuidex() - AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex], - [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], - [[unsigned int exx[4] = {0, 0, 0, 0}; - __cpuidex(exx, 7, 0); - ]])], - [pgac_cv__cpuidex="yes"], - [pgac_cv__cpuidex="no"])]) - if test x"$pgac_cv__cpuidex" = x"yes"; then - AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) - fi +fi +# __cpuidex() +AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#ifdef _MSC_VER + #include + #else + #include + #endif], + [[int exx[4] = {0, 0, 0, 0}; + __cpuidex(exx, 7, 0); + ]])], + [pgac_cv__cpuidex="yes"], + [pgac_cv__cpuidex="no"])]) +if test x"$pgac_cv__cpuidex" = x"yes"; then + AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) fi # Check for XSAVE intrinsics diff --git a/meson.build b/meson.build index 2df54409ca6b1..d22ccf8f2383e 100644 --- a/meson.build +++ b/meson.build @@ -2121,7 +2121,8 @@ elif cc.links(''' endif -# Check for __get_cpuid_count() and __cpuidex() in a similar fashion. +# Check for __get_cpuid_count() and __cpuidex() separately, since we sometimes +# need __cpuidex() even if __get_cpuid_count() is available. 
if cc.links(''' #include int main(int arg, char **argv) @@ -2132,11 +2133,16 @@ if cc.links(''' ''', name: '__get_cpuid_count', args: test_c_args) cdata.set('HAVE__GET_CPUID_COUNT', 1) -elif cc.links(''' +endif +if cc.links(''' + #ifdef _MSC_VER #include + #else + #include + #endif int main(int arg, char **argv) { - unsigned int exx[4] = {0, 0, 0, 0}; + int exx[4] = {0, 0, 0, 0}; __cpuidex(exx, 7, 0); } ''', name: '__cpuidex', diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index 0fe4753eecc29..7b01c17750c35 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -17,12 +17,12 @@ #if defined(USE_SSE2) || defined(__i386__) -#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) -#include -#endif - -#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX) +#if defined(HAVE__CPUID) || defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) || defined(HAVE__CPUIDEX) +#if defined(_MSC_VER) #include +#else +#include +#endif /* defined(_MSC_VER) */ #endif #ifdef HAVE_XSAVE_INTRINSICS From eddd4e0bf308bc7e7057bd79c18c8375bc0af2f3 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 24 Feb 2026 23:44:28 -0800 Subject: [PATCH 28/32] pg_test_timing: Reduce per-loop overhead The pg_test_timing program was previously using INSTR_TIME_GET_NANOSEC on an absolute instr_time value in order to do a diff, which goes against the spirit of how the GET_* macros are supposed to be used, and will cause overhead in a future change that assumes these macros are typically used on intervals only. Additionally the program was doing unnecessary work in the test loop by measuring the time elapsed, instead of checking the existing current time measurement against a target end time. To support that, introduce a new INSTR_TIME_SET_NANOSEC macro that allows initializing an instr_time variable from a user-defined interval. 
Author: Lukas Fittl Reviewed-by: Discussion: --- src/bin/pg_test_timing/pg_test_timing.c | 29 ++++++++++++++----------- src/include/portability/instr_time.h | 4 +++- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index a5621251afcee..dd865ed887587 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -152,14 +152,12 @@ handle_args(int argc, char *argv[]) static uint64 test_timing(unsigned int duration) { - uint64 total_time; - int64 time_elapsed = 0; uint64 loop_count = 0; - uint64 prev, - cur; instr_time start_time, end_time, - temp; + duration_time, + prev, + cur; /* * Pre-zero the statistics data structures. They're already zero by @@ -171,20 +169,26 @@ test_timing(unsigned int duration) largest_diff = 0; largest_diff_count = 0; - total_time = duration > 0 ? duration * INT64CONST(1000000000) : 0; + INSTR_TIME_SET_NANOSEC(duration_time, duration > 0 ? duration * NS_PER_S : 0); INSTR_TIME_SET_CURRENT(start_time); - cur = INSTR_TIME_GET_NANOSEC(start_time); + cur = start_time; - while (time_elapsed < total_time) + end_time = start_time; + INSTR_TIME_ADD(end_time, duration_time); + + while (INSTR_TIME_GT(end_time, cur)) { int32 diff, bits; + instr_time diff_time; prev = cur; - INSTR_TIME_SET_CURRENT(temp); - cur = INSTR_TIME_GET_NANOSEC(temp); - diff = cur - prev; + INSTR_TIME_SET_CURRENT(cur); + + diff_time = cur; + INSTR_TIME_SUBTRACT(diff_time, prev); + diff = INSTR_TIME_GET_NANOSEC(diff_time); /* Did time go backwards? 
*/ if (unlikely(diff < 0)) @@ -217,10 +221,9 @@ test_timing(unsigned int duration) largest_diff_count++; loop_count++; - INSTR_TIME_SUBTRACT(temp, start_time); - time_elapsed = INSTR_TIME_GET_NANOSEC(temp); } + /* Refresh end time to be the actual time spent (vs the target end time) */ INSTR_TIME_SET_CURRENT(end_time); INSTR_TIME_SUBTRACT(end_time, start_time); diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 8b6baeffd3e46..1b0c8e28f9b1f 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -17,8 +17,9 @@ * * INSTR_TIME_SET_ZERO(t) set t to zero (memset is acceptable too) * - * INSTR_TIME_SET_CURRENT(t) set t to current time + * INSTR_TIME_SET_NANOSEC(t, x) set t to the specified value (in nanosecs) * + * INSTR_TIME_SET_CURRENT(t) set t to current time * * INSTR_TIME_ADD(x, y) x += y * @@ -170,6 +171,7 @@ GetTimerFrequency(void) #define INSTR_TIME_SET_ZERO(t) ((t).ticks = 0) +#define INSTR_TIME_SET_NANOSEC(t, n) ((t).ticks = n) #define INSTR_TIME_ADD(x,y) \ ((x).ticks += (y).ticks) From 9d96fc08833e90bf5703b0de61d864a838cfd9ca Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Fri, 25 Jul 2025 17:57:20 -0700 Subject: [PATCH 29/32] instrumentation: Streamline ticks to nanosecond conversion across platforms The timing infrastructure (INSTR_* macros) measures time elapsed using clock_gettime() on POSIX systems, which returns the time as nanoseconds, and QueryPerformanceCounter() on Windows, which is a specialized timing clock source that returns a tick counter that needs to be converted to nanoseconds using the result of QueryPerformanceFrequency(). This conversion currently happens ad-hoc on Windows, e.g. when calling INSTR_TIME_GET_NANOSEC, which calls QueryPerformanceFrequency() on every invocation, despite the frequency being stable after program start, incurring unnecessary overhead. 
It also causes a fractured implementation where macros are defined differently between platforms. To ease code readability, and prepare for a future change that intends to use a ticks-to-nanosecond conversion on x86-64 for TSC use, introduce a new pg_ticks_to_ns() function that gets called on all platforms. This function relies on a separately initialized ticks_per_ns_scaled value, that represents the conversion ratio. This value is initialized from QueryPerformanceFrequency() on Windows, and set to zero on x86-64 POSIX systems, which results in the ticks being treated as nanoseconds. Other architectures always directly return the original ticks. To support this, pg_initialize_timing() is introduced, and is now mandatory for both the backend and any frontend programs to call before utilizing INSTR_* macros. Author: Lukas Fittl Author: Andres Freund Author: David Geier Reviewed-by: Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de --- src/backend/main/main.c | 5 ++ src/bin/pg_test_timing/pg_test_timing.c | 3 + src/bin/pgbench/pgbench.c | 3 + src/bin/psql/startup.c | 4 + src/common/Makefile | 1 + src/common/instr_time.c | 92 +++++++++++++++++++++ src/common/meson.build | 1 + src/include/portability/instr_time.h | 101 +++++++++++++++++------- 8 files changed, 180 insertions(+), 30 deletions(-) create mode 100644 src/common/instr_time.c diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 7b9b602f3c4b0..884fb7b4910a1 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -90,6 +90,11 @@ main(int argc, char *argv[]) */ startup_hacks(progname); + /* + * Initialize timing infrastructure + */ + pg_initialize_timing(); + /* * Remember the physical location of the initially given argv[] array for * possible use by ps display. 
On some platforms, the argv[] storage must diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index dd865ed887587..98672ae5d32a6 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -43,6 +43,9 @@ main(int argc, char *argv[]) handle_args(argc, argv); + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); + loop_count = test_timing(test_duration); output(loop_count); diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 1dae918cc09d2..f962402a191de 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -7334,6 +7334,9 @@ main(int argc, char **argv) initRandomState(&state[i].cs_func_rs); } + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); + /* opening connection... */ con = doConnect(); if (con == NULL) diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c index 9a397ec87b736..69d044d405d5b 100644 --- a/src/bin/psql/startup.c +++ b/src/bin/psql/startup.c @@ -24,6 +24,7 @@ #include "help.h" #include "input.h" #include "mainloop.h" +#include "portability/instr_time.h" #include "settings.h" /* @@ -327,6 +328,9 @@ main(int argc, char *argv[]) PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL); + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); + SyncVariables(); if (options.list_dbs) diff --git a/src/common/Makefile b/src/common/Makefile index 2c720caa50972..1a2fbbe887f22 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -59,6 +59,7 @@ OBJS_COMMON = \ file_perm.o \ file_utils.o \ hashfn.o \ + instr_time.o \ ip.o \ jsonapi.o \ keywords.o \ diff --git a/src/common/instr_time.c b/src/common/instr_time.c new file mode 100644 index 0000000000000..68bc585f2cc08 --- /dev/null +++ b/src/common/instr_time.c @@ -0,0 +1,92 @@ +/*------------------------------------------------------------------------- + 
* + * instr_time.c + * Non-inline parts of the portable high-precision interval timing + * implementation + * + * Portions Copyright (c) 2026, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/common/instr_time.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "portability/instr_time.h" + +/* + * Stores what the number of ticks needs to be multiplied with to end up + * with nanoseconds using integer math. + * + * On certain platforms (currently Windows) the ticks to nanoseconds conversion + * requires floating point math because: + * + * sec = ticks / frequency_hz + * ns = ticks / frequency_hz * 1,000,000,000 + * ns = ticks * (1,000,000,000 / frequency_hz) + * ns = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz + * + * Here, 'ns' is usually a floating number. For example for a 2.5 GHz CPU + * the scaling factor becomes 1,000,000 / 2,500,000 = 0.4. + * + * To be able to use integer math we work around the lack of precision. We + * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the + * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by + * the same amount. We utilize unsigned integers even though ticks are stored + * as a signed value to encourage compilers to generate better assembly. + * + * We remember the maximum number of ticks that can be multiplied by the scale + * factor without overflowing so we can check via a * b > max <=> a > max / b. + * + * On all other platforms we are using clock_gettime(), which uses nanoseconds + * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns + * to return the original value. 
+ */ +uint64 ticks_per_ns_scaled = 0; +uint64 max_ticks_no_overflow = 0; + +static void set_ticks_per_ns(void); + +static bool timing_initialized = false; + +void +pg_initialize_timing(void) +{ + if (timing_initialized) + return; + + set_ticks_per_ns(); + timing_initialized = true; +} + +#ifndef WIN32 + +static void +set_ticks_per_ns() +{ + ticks_per_ns_scaled = 0; + max_ticks_no_overflow = 0; +} + +#else /* WIN32 */ + +/* GetTimerFrequency returns counts per second */ +static inline double +GetTimerFrequency(void) +{ + LARGE_INTEGER f; + + QueryPerformanceFrequency(&f); + return (double) f.QuadPart; +} + +static void +set_ticks_per_ns() +{ + ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency(); + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; +} + +#endif /* WIN32 */ diff --git a/src/common/meson.build b/src/common/meson.build index 4f9b8b8263d55..9bd55cda95b10 100644 --- a/src/common/meson.build +++ b/src/common/meson.build @@ -13,6 +13,7 @@ common_sources = files( 'file_perm.c', 'file_utils.c', 'hashfn.c', + 'instr_time.c', 'ip.c', 'jsonapi.c', 'keywords.c', diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 1b0c8e28f9b1f..eadf76720d99e 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -79,11 +79,32 @@ typedef struct instr_time #define NS_PER_MS INT64CONST(1000000) #define NS_PER_US INT64CONST(1000) +/* Shift amount for fixed-point ticks-to-nanoseconds conversion. */ +#define TICKS_TO_NS_SHIFT 14 -#ifndef WIN32 +#ifdef WIN32 +#define PG_INSTR_TICKS_TO_NS 1 +#else +#define PG_INSTR_TICKS_TO_NS 0 +#endif + +/* + * Variables used to translate ticks to nanoseconds, initialized by + * pg_initialize_timing. + */ +extern PGDLLIMPORT uint64 ticks_per_ns_scaled; +extern PGDLLIMPORT uint64 max_ticks_no_overflow; + +/* + * Initialize timing infrastructure + * + * This must be called at least once before using INSTR_TIME_SET_CURRENT* macros. 
+ */ +extern void pg_initialize_timing(void); +#ifndef WIN32 -/* Use clock_gettime() */ +/* On POSIX, use clock_gettime() for system clock source */ #include @@ -107,9 +128,8 @@ typedef struct instr_time #define PG_INSTR_CLOCK CLOCK_REALTIME #endif -/* helper for INSTR_TIME_SET_CURRENT */ static inline instr_time -pg_clock_gettime_ns(void) +pg_get_ticks(void) { instr_time now; struct timespec tmp; @@ -120,21 +140,12 @@ pg_clock_gettime_ns(void) return now; } -#define INSTR_TIME_SET_CURRENT(t) \ - ((t) = pg_clock_gettime_ns()) - -#define INSTR_TIME_GET_NANOSEC(t) \ - ((int64) (t).ticks) - - #else /* WIN32 */ +/* On Windows, use QueryPerformanceCounter() for system clock source */ -/* Use QueryPerformanceCounter() */ - -/* helper for INSTR_TIME_SET_CURRENT */ static inline instr_time -pg_query_performance_counter(void) +pg_get_ticks(void) { instr_time now; LARGE_INTEGER tmp; @@ -145,23 +156,47 @@ pg_query_performance_counter(void) return now; } -static inline double -GetTimerFrequency(void) -{ - LARGE_INTEGER f; - - QueryPerformanceFrequency(&f); - return (double) f.QuadPart; -} - -#define INSTR_TIME_SET_CURRENT(t) \ - ((t) = pg_query_performance_counter()) - -#define INSTR_TIME_GET_NANOSEC(t) \ - ((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency()))) - #endif /* WIN32 */ +static inline int64 +pg_ticks_to_ns(int64 ticks) +{ +#if PG_INSTR_TICKS_TO_NS + int64 ns = 0; + + /* + * Avoid doing work if we don't use scaled ticks, e.g. system clock on + * Unix + */ + if (ticks_per_ns_scaled == 0) + return ticks; + + /* + * Would multiplication overflow? If so perform computation in two parts. + */ + if (unlikely(ticks > (int64) max_ticks_no_overflow)) + { + /* + * To avoid overflow, first scale total ticks down by the fixed + * factor, and *afterwards* multiply them by the frequency-based scale + * factor. + * + * The remaining ticks can follow the regular formula, since they + * won't overflow. 
+ */ + int64 count = ticks >> TICKS_TO_NS_SHIFT; + + ns = count * ticks_per_ns_scaled; + ticks -= (count << TICKS_TO_NS_SHIFT); + } + + ns += (ticks * ticks_per_ns_scaled) >> TICKS_TO_NS_SHIFT; + + return ns; +#else + return ticks; +#endif /* PG_INSTR_TICKS_TO_NS */ +} /* * Common macros @@ -173,6 +208,9 @@ GetTimerFrequency(void) #define INSTR_TIME_SET_NANOSEC(t, n) ((t).ticks = n) +#define INSTR_TIME_SET_CURRENT(t) \ + ((t) = pg_get_ticks()) + #define INSTR_TIME_ADD(x,y) \ ((x).ticks += (y).ticks) @@ -185,6 +223,9 @@ GetTimerFrequency(void) #define INSTR_TIME_GT(x,y) \ ((x).ticks > (y).ticks) +#define INSTR_TIME_GET_NANOSEC(t) \ + (pg_ticks_to_ns((t).ticks)) + #define INSTR_TIME_GET_DOUBLE(t) \ ((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S) From 0d6e0106ce2477af6327dfb899c295c0ad0936af Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Wed, 11 Mar 2026 00:55:03 -0700 Subject: [PATCH 30/32] instrumentation: Use Time-Stamp Counter (TSC) on x86-64 for faster measurements This allows the direct use of the Time-Stamp Counter (TSC) value retrieved from the CPU using RDTSC/RDTSCP instructions, instead of APIs like clock_gettime() on POSIX systems. This reduces the overhead of EXPLAIN with ANALYZE and TIMING ON. Tests showed that runtime when instrumented can be reduced by up to 10% for queries moving lots of rows through the plan. To control use of the TSC, the new "timing_clock_source" GUC is introduced, whose default ("auto") automatically uses the TSC when running on Linux/x86-64, in case the system clocksource is reported as "tsc". The use of the system APIs can be enforced by setting "system", or on x86-64 architectures the use of TSC can be enforced by explicitly setting "tsc". In order to use the TSC the frequency is first determined by use of CPUID, and if not available, by running a short calibration loop at program start, falling back to the system time if TSC values are not stable. 
Note, that we split TSC usage into the RDTSC CPU instruction which does not wait for out-of-order execution (faster, less precise) and the RDTSCP instruction, which waits for outstanding instructions to retire. RDTSCP is deemed to have little benefit in the typical InstrStartNode() / InstrStopNode() use case of EXPLAIN, and can be up to twice as slow. To separate these use cases, the new macro INSTR_TIME_SET_CURRENT_FAST() is introduced, which uses RDTSC. The original macro INSTR_TIME_SET_CURRENT() uses RDTSCP and is supposed to be used when precision is more important than performance. When the system timing clock source is used both of these macros instead utilize the system APIs (clock_gettime / QueryPerformanceCounter) like before. Author: David Geier Author: Andres Freund Author: Lukas Fittl Reviewed-by: Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de --- doc/src/sgml/config.sgml | 54 ++++ src/backend/executor/instrument.c | 58 +++- src/backend/main/main.c | 7 +- src/backend/utils/misc/guc_parameters.dat | 11 + src/backend/utils/misc/guc_tables.c | 11 + src/backend/utils/misc/postgresql.conf.sample | 4 + src/bin/pg_test_timing/pg_test_timing.c | 14 +- src/bin/pgbench/pgbench.c | 2 +- src/bin/psql/startup.c | 2 +- src/common/instr_time.c | 293 +++++++++++++++++- src/include/port/pg_cpu.h | 9 + src/include/portability/instr_time.h | 157 ++++++++-- src/include/utils/guc_hooks.h | 3 + src/include/utils/guc_tables.h | 1 + src/port/pg_cpu_x86.c | 180 ++++++++++- src/tools/pgindent/typedefs.list | 1 + 16 files changed, 766 insertions(+), 41 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 8cdd826fbd37a..99a6593d9ac59 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2533,6 +2533,60 @@ include_dir 'conf.d' + + Timing + + + + timing_clock_source (enum) + + timing_clock_source configuration parameter + + + + + Selects the method for making timing 
measurements using the OS or specialized CPU + instructions. Possible values are: + + + + auto (automatically chooses TSC clock source for modern CPUs, + otherwise uses the OS system clock) + + + + + system (measures timing using the OS system clock) + + + + + tsc (measures timing using the x86-64 Time-Stamp Counter (TSC) + by directly executing RDTSC/RDTSCP instructions, see below) + + + + The default is auto. + + + If enabled, the TSC clock source will use the RDTSC instruction for the x86-64 + Time-Stamp Counter (TSC) to perform certain time measurements, for example during + EXPLAIN ANALYZE. The RDTSC instruction has less overhead than going through the OS + clock source, which for an EXPLAIN ANALYZE statement will show timing closer to the + actual runtime when timing is off. For timings that require higher precision the + RDTSCP instruction is used, which avoids inaccuracies due to CPU instruction re-ordering. + Use of RDTSC/RDTSCP is not supported on older CPUs or hypervisors that don't pass the TSC + frequency to guest VMs, and is not advised on systems that utilize an emulated TSC. + + + To help decide which clock source to use on an x86-64 system you can run the + pg_test_timing utility to check TSC availability, and + perform timing measurements. 
+ + + + + Background Writer diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index a40610bc2522f..03cc82182ee4a 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -72,7 +72,7 @@ InstrStartNode(Instrumentation *instr) if (!INSTR_TIME_IS_ZERO(instr->starttime)) elog(ERROR, "InstrStartNode called twice in a row"); else - INSTR_TIME_SET_CURRENT(instr->starttime); + INSTR_TIME_SET_CURRENT_FAST(instr->starttime); } /* save buffer usage totals at node entry, if needed */ @@ -99,7 +99,7 @@ InstrStopNode(Instrumentation *instr, double nTuples) if (INSTR_TIME_IS_ZERO(instr->starttime)) elog(ERROR, "InstrStopNode called without start"); - INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_SET_CURRENT_FAST(endtime); INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime); INSTR_TIME_SET_ZERO(instr->starttime); @@ -294,3 +294,57 @@ WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub) dst->wal_fpi_bytes += add->wal_fpi_bytes - sub->wal_fpi_bytes; dst->wal_buffers_full += add->wal_buffers_full - sub->wal_buffers_full; } + +/* GUC hooks for timing_clock_source */ + +#include "portability/instr_time.h" +#include "utils/guc_hooks.h" + +bool +check_timing_clock_source(int *newval, void **extra, GucSource source) +{ + pg_initialize_timing(true); + +#if PG_INSTR_TSC_CLOCK + if (*newval == TIMING_CLOCK_SOURCE_TSC && !has_usable_tsc) + { + GUC_check_errdetail("TSC is not supported as timing clock source"); + return false; + } +#endif + + return true; +} + +void +assign_timing_clock_source(int newval, void *extra) +{ + /* + * Ignore the return code since the check hook already verified TSC is + * usable if its explicitly requested + */ + pg_set_timing_clock_source(newval); +} + +const char * +show_timing_clock_source(void) +{ + switch (timing_clock_source) + { + case TIMING_CLOCK_SOURCE_AUTO: +#if PG_INSTR_TSC_CLOCK + if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC) + return 
"auto (tsc)"; +#endif + return "auto (system)"; + case TIMING_CLOCK_SOURCE_SYSTEM: + return "system"; +#if PG_INSTR_TSC_CLOCK + case TIMING_CLOCK_SOURCE_TSC: + return "tsc"; +#endif + } + + /* unreachable */ + return "?"; +} diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 884fb7b4910a1..bcb45a54678bb 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -93,7 +93,12 @@ main(int argc, char *argv[]) /* * Initialize timing infrastructure */ - pg_initialize_timing(); +#if defined(WIN32) + /* Skip TSC calibration on Windows, its too expensive per connection */ + pg_initialize_timing(false); +#else + pg_initialize_timing(true); +#endif /* * Remember the physical location of the initially given argv[] array for diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index a5a0edf2534aa..18cd9a0fafdd6 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -2988,6 +2988,17 @@ assign_hook => 'assign_timezone_abbreviations', }, +{ name => 'timing_clock_source', type => 'enum', context => 'PGC_USERSET', group => 'RESOURCES_TIME', + short_desc => 'Controls the clock source used for collecting timing measurements.', + long_desc => 'This enables the use of specialized clock sources, specifically the RDTSC clock source on x86-64 systems (if available), to support timing measurements with lower overhead during EXPLAIN and other instrumentation.', + variable => 'timing_clock_source', + boot_val => 'TIMING_CLOCK_SOURCE_AUTO', + options => 'timing_clock_source_options', + check_hook => 'check_timing_clock_source', + assign_hook => 'assign_timing_clock_source', + show_hook => 'show_timing_clock_source', +}, + { name => 'trace_connection_negotiation', type => 'bool', context => 'PGC_POSTMASTER', group => 'DEVELOPER_OPTIONS', short_desc => 'Logs details of pre-authentication connection handshake.', flags => 'GUC_NOT_IN_SAMPLE', diff --git 
a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 38aaf82f12094..b8bb9590d9c6c 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -92,6 +92,7 @@ #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" #include "utils/builtins.h" +#include "portability/instr_time.h" #include "utils/bytea.h" #include "utils/float.h" #include "utils/guc_hooks.h" @@ -373,6 +374,15 @@ static const struct config_enum_entry huge_pages_options[] = { {NULL, 0, false} }; +static const struct config_enum_entry timing_clock_source_options[] = { + {"auto", TIMING_CLOCK_SOURCE_AUTO, false}, + {"system", TIMING_CLOCK_SOURCE_SYSTEM, false}, +#if PG_INSTR_TSC_CLOCK + {"tsc", TIMING_CLOCK_SOURCE_TSC, false}, +#endif + {NULL, 0, false} +}; + static const struct config_enum_entry huge_pages_status_options[] = { {"off", HUGE_PAGES_OFF, false}, {"on", HUGE_PAGES_ON, false}, @@ -723,6 +733,7 @@ const char *const config_group_names[] = [CONN_AUTH_TCP] = gettext_noop("Connections and Authentication / TCP Settings"), [CONN_AUTH_AUTH] = gettext_noop("Connections and Authentication / Authentication"), [CONN_AUTH_SSL] = gettext_noop("Connections and Authentication / SSL"), + [RESOURCES_TIME] = gettext_noop("Resource Usage / Time"), [RESOURCES_MEM] = gettext_noop("Resource Usage / Memory"), [RESOURCES_DISK] = gettext_noop("Resource Usage / Disk"), [RESOURCES_KERNEL] = gettext_noop("Resource Usage / Kernel Resources"), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index e686d88afc427..3cbe96b96edd5 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -193,6 +193,10 @@ #max_files_per_process = 1000 # min 64 # (change requires restart) +# - Time - + +#timing_clock_source = auto # auto, system, tsc (if supported) + # - Background Writer - #bgwriter_delay = 200ms # 10-10000ms between rounds diff --git 
a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index 98672ae5d32a6..9f4b196c4bb8d 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -43,8 +43,18 @@ main(int argc, char *argv[]) handle_args(argc, argv); - /* initialize timing infrastructure (required for INSTR_* calls) */ - pg_initialize_timing(); + /* + * Initialize timing infrastructure (required for INSTR_* calls) + * + * This initialization should match the one in main() so the user can + * reason about what the backend will do. + */ +#if defined(WIN32) + /* Skip TSC calibration on Windows, its too expensive per connection */ + pg_initialize_timing(false); +#else + pg_initialize_timing(true); +#endif loop_count = test_timing(test_duration); diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index f962402a191de..06db4042e8f70 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -7335,7 +7335,7 @@ main(int argc, char **argv) } /* initialize timing infrastructure (required for INSTR_* calls) */ - pg_initialize_timing(); + pg_initialize_timing(false); /* opening connection... 
*/ con = doConnect(); diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c index 69d044d405d5b..83753dab7d3bf 100644 --- a/src/bin/psql/startup.c +++ b/src/bin/psql/startup.c @@ -329,7 +329,7 @@ main(int argc, char *argv[]) PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL); /* initialize timing infrastructure (required for INSTR_* calls) */ - pg_initialize_timing(); + pg_initialize_timing(false); SyncVariables(); diff --git a/src/common/instr_time.c b/src/common/instr_time.c index 68bc585f2cc08..2becf9b0780c0 100644 --- a/src/common/instr_time.c +++ b/src/common/instr_time.c @@ -14,14 +14,21 @@ */ #include "postgres.h" +#include + +#ifndef WIN32 +#include +#endif + +#include "port/pg_cpu.h" #include "portability/instr_time.h" /* * Stores what the number of ticks needs to be multiplied with to end up * with nanoseconds using integer math. * - * On certain platforms (currently Windows) the ticks to nanoseconds conversion - * requires floating point math because: + * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows) + * the ticks to nanoseconds conversion requires floating point math because: * * sec = ticks / frequency_hz * ns = ticks / frequency_hz * 1,000,000,000 @@ -40,7 +47,7 @@ * We remember the maximum number of ticks that can be multiplied by the scale * factor without overflowing so we can check via a * b > max <=> a > max / b. * - * On all other platforms we are using clock_gettime(), which uses nanoseconds + * In all other cases we are using clock_gettime(), which uses nanoseconds * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns * to return the original value. */ @@ -49,22 +56,73 @@ uint64 max_ticks_no_overflow = 0; static void set_ticks_per_ns(void); +int timing_clock_source = TIMING_CLOCK_SOURCE_AUTO; static bool timing_initialized = false; +#if PG_INSTR_TSC_CLOCK +/* Indicates if TSC instructions (RDTSC and RDTSCP) are usable. 
*/ +bool has_usable_tsc = false; + +static void tsc_initialize(bool allow_tsc_calibration); +static bool tsc_use_by_default(void); +static void set_ticks_per_ns_system(); +static void set_ticks_per_ns_for_tsc(void); +#endif + +/* + * Initializes timing infrastructure. Must be called before making any use + * of INSTR* macros. + * + * The allow_tsc_calibration argument sets whether the TSC logic (if available) + * is permitted to do calibration if it couldn't get the frequency from CPUID. + * + * Calibration may take up to TSC_CALIBRATION_MAX_NS and delays program start. + */ void -pg_initialize_timing(void) +pg_initialize_timing(bool allow_tsc_calibration) { if (timing_initialized) return; +#if PG_INSTR_TSC_CLOCK + tsc_initialize(allow_tsc_calibration); +#endif + set_ticks_per_ns(); timing_initialized = true; } +bool +pg_set_timing_clock_source(TimingClockSourceType source) +{ + Assert(timing_initialized); + +#if PG_INSTR_TSC_CLOCK + switch (source) + { + case TIMING_CLOCK_SOURCE_AUTO: + use_tsc = has_usable_tsc && tsc_use_by_default(); + break; + case TIMING_CLOCK_SOURCE_SYSTEM: + use_tsc = false; + break; + case TIMING_CLOCK_SOURCE_TSC: + if (!has_usable_tsc) /* Tell caller TSC is not usable */ + return false; + use_tsc = true; + break; + } +#endif + + set_ticks_per_ns(); + timing_clock_source = source; + return true; +} + #ifndef WIN32 static void -set_ticks_per_ns() +set_ticks_per_ns_system() { ticks_per_ns_scaled = 0; max_ticks_no_overflow = 0; @@ -83,10 +141,233 @@ GetTimerFrequency(void) } static void -set_ticks_per_ns() +set_ticks_per_ns_system() { ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency(); max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; } #endif /* WIN32 */ + +static void +set_ticks_per_ns() +{ +#if PG_INSTR_TSC_CLOCK + if (use_tsc) + set_ticks_per_ns_for_tsc(); + else + set_ticks_per_ns_system(); +#else + set_ticks_per_ns_system(); +#endif +} + +/* TSC specific logic */ + +#if PG_INSTR_TSC_CLOCK + +bool 
use_tsc = false; + +static uint32 tsc_frequency_khz = 0; + +static uint32 tsc_calibrate(void); + +/* + * Decide whether we use the RDTSC/RDTSCP instructions at runtime, for x86-64, + * instead of incurring the overhead of a full clock_gettime() call. + * + * This can't be reliably determined at compile time, since the + * availability of an "invariant" TSC (that is not affected by CPU + * frequency changes) is dependent on the CPU architecture. Additionally, + * there are cases where TSC availability is impacted by virtualization, + * where a simple cpuid feature check would not be enough. + */ +static void +tsc_initialize(bool allow_tsc_calibration) +{ + /* Determine speed at which the TSC advances */ + tsc_frequency_khz = x86_tsc_frequency_khz(); + + if (tsc_frequency_khz) + { + has_usable_tsc = x86_feature_available(PG_RDTSCP); + return; + } + + /* + * CPUID did not give us the TSC frequency. If TSC is invariant and RDTSCP + * is available, we can measure the frequency by comparing TSC ticks + * against walltime using a short calibration loop. + */ + if (allow_tsc_calibration && x86_feature_available(PG_TSC_INVARIANT) && + x86_feature_available(PG_RDTSCP)) + { + tsc_frequency_khz = tsc_calibrate(); + has_usable_tsc = (tsc_frequency_khz > 0); + } +} + +/* + * Decides whether to use the TSC clock source if the user did not specify it + * one way or the other, and it is available (checked separately). + * + * Mirrors the Linux kernel's clocksource watchdog disable logic as updated in + * 2021 to reflect the reliability of the TSC on Intel platforms, see + * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion + * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/ + * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/ + * for reference. 
+ * + * When the CPU has an invariant TSC (which we require in x86_tsc_frequency_khz), + * TSC_ADJUST bit set (Intel-only), and the system has at most 4 physical + * packages (sockets), we consider the TSC trustworthy by default, matching the + * Linux kernel. + * + * On other CPU platforms (e.g. AMD), in a virtual machine, or on 8+ socket + * systems we don't have an easy way to determine the TSC's reliability. If on + * Linux, we can check if TSC is the active clocksource, based on it having run + * the watchdog logic to monitor TSC correctness. For other platforms the user + * must explicitly enable it via GUC instead. + */ +static bool +tsc_use_by_default(void) +{ + if (x86_feature_available(PG_TSC_ADJUST)) + { + int cpus_per_package = x86_logical_processors_per_package(); + long total_cpus; + +#ifdef _SC_NPROCESSORS_CONF + total_cpus = sysconf(_SC_NPROCESSORS_CONF); +#elif defined(WIN32) + { + SYSTEM_INFO si; + + GetSystemInfo(&si); + total_cpus = si.dwNumberOfProcessors; + } +#else + total_cpus = -1; +#endif /* _SC_NPROCESSORS_CONF / WIN32 */ + + if (total_cpus > 0 && cpus_per_package > 0 && (total_cpus / cpus_per_package) <= 4) + return true; + } + +#if defined(__linux__) + { + FILE *fp; + char buf[128]; + + fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); + if (fp) + { + bool is_tsc = (fgets(buf, sizeof(buf), fp) != NULL && + strcmp(buf, "tsc\n") == 0); + + fclose(fp); + if (is_tsc) + return true; + } + } +#endif + + return false; +} + +/* + * Calibrate the TSC frequency by comparing TSC ticks against walltime. + * + * Takes initial TSC and system clock snapshots, then loops, recomputing the + * frequency each iteration from cumulative TSC ticks divided by elapsed time. + * + * Once the frequency estimate stabilizes (consecutive iterations agree), we + * consider it converged and the frequency in KHz is returned. If either too + * many iterations or a time limit passes without convergence, 0 is returned. 
+ */ +#define TSC_CALIBRATION_MAX_NS (50 * NS_PER_MS) +#define TSC_CALIBRATION_ITERATIONS 1000000 +#define TSC_CALIBRATION_STABLE_CYCLES 3 + +static uint32 +tsc_calibrate(void) +{ + instr_time initial_wall; + int64 initial_tsc; + double freq_khz = 0; + double prev_freq_khz = 0; + int stable_count = 0; + int64 prev_tsc; + uint32 unused; + + /* Ensure INSTR_* time below work on system time */ + set_ticks_per_ns_system(); + + INSTR_TIME_SET_CURRENT(initial_wall); + +#ifdef _MSC_VER + initial_tsc = __rdtscp(&unused); +#else + initial_tsc = __builtin_ia32_rdtscp(&unused); +#endif + prev_tsc = initial_tsc; + + for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++) + { + instr_time now_wall; + int64 now_tsc; + int64 elapsed_ns; + int64 elapsed_ticks; + + INSTR_TIME_SET_CURRENT(now_wall); + +#ifdef _MSC_VER + now_tsc = __rdtscp(&unused); +#else + now_tsc = __builtin_ia32_rdtscp(&unused); +#endif + + INSTR_TIME_SUBTRACT(now_wall, initial_wall); + elapsed_ns = INSTR_TIME_GET_NANOSEC(now_wall); + + /* Safety: bail out if we've taken too long */ + if (elapsed_ns >= TSC_CALIBRATION_MAX_NS) + break; + + elapsed_ticks = now_tsc - initial_tsc; + + /* Skip if TSC hasn't advanced, or we walked backwards for some reason */ + if (now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0) + continue; + + freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000; + + /* + * Once freq_khz / prev_freq_khz is small, check if it stays that way. + * If it does for long enough, we've got a winner frequency. 
+ */ + if (prev_freq_khz != 0 && fabs(freq_khz / prev_freq_khz) < 1.0001) + { + stable_count++; + if (stable_count >= TSC_CALIBRATION_STABLE_CYCLES) + return (uint32) freq_khz; + } + else + stable_count = 0; + + prev_tsc = now_tsc; + prev_freq_khz = freq_khz; + } + + /* did not converge */ + return 0; +} + +static void +set_ticks_per_ns_for_tsc(void) +{ + ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / tsc_frequency_khz; + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; +} + +#endif /* PG_INSTR_TSC_CLOCK */ diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index b93b828d3ac27..a32e67487f834 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -23,6 +23,12 @@ typedef enum X86FeatureId /* scalar registers and 128-bit XMM registers */ PG_SSE4_2, PG_POPCNT, + PG_HYPERVISOR, + + /* TSC flags */ + PG_RDTSCP, + PG_TSC_INVARIANT, + PG_TSC_ADJUST, /* 512-bit ZMM registers */ PG_AVX512_BW, @@ -45,6 +51,9 @@ x86_feature_available(X86FeatureId feature) return X86Features[feature]; } +extern int x86_logical_processors_per_package(void); +extern uint32 x86_tsc_frequency_khz(void); + #endif /* defined(USE_SSE2) || defined(__i386__) */ #endif /* PG_CPU_H */ diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index eadf76720d99e..a7640d0e72a57 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -4,9 +4,10 @@ * portable high-precision interval timing * * This file provides an abstraction layer to hide portability issues in - * interval timing. On Unix we use clock_gettime(), and on Windows we use - * QueryPerformanceCounter(). These macros also give some breathing room to - * use other high-precision-timing APIs. + * interval timing. On x86 we use the RDTSC/RDTSCP instruction directly in + * certain cases, or alternatively clock_gettime() on Unix-like systems and + * QueryPerformanceCounter() on Windows. 
These macros also give some breathing + * room to use other high-precision-timing APIs. * * The basic data type is instr_time, which all callers should treat as an * opaque typedef. instr_time can store either an absolute time (of @@ -19,7 +20,11 @@ * * INSTR_TIME_SET_NANOSEC(t, x) set t to the specified value (in nanosecs) * - * INSTR_TIME_SET_CURRENT(t) set t to current time + * INSTR_TIME_SET_CURRENT_FAST(t) set t to current time without waiting + * for instructions in out-of-order window + * + * INSTR_TIME_SET_CURRENT(t) set t to current time while waiting for + * instructions in OOO to retire * * INSTR_TIME_ADD(x, y) x += y * @@ -82,12 +87,6 @@ typedef struct instr_time /* Shift amount for fixed-point ticks-to-nanoseconds conversion. */ #define TICKS_TO_NS_SHIFT 14 -#ifdef WIN32 -#define PG_INSTR_TICKS_TO_NS 1 -#else -#define PG_INSTR_TICKS_TO_NS 0 -#endif - /* * Variables used to translate ticks to nanoseconds, initialized by * pg_initialize_timing. @@ -95,12 +94,68 @@ typedef struct instr_time extern PGDLLIMPORT uint64 ticks_per_ns_scaled; extern PGDLLIMPORT uint64 max_ticks_no_overflow; +typedef enum +{ + TIMING_CLOCK_SOURCE_AUTO, + TIMING_CLOCK_SOURCE_SYSTEM, + TIMING_CLOCK_SOURCE_TSC +} TimingClockSourceType; + +extern int timing_clock_source; + /* * Initialize timing infrastructure * - * This must be called at least once before using INSTR_TIME_SET_CURRENT* macros. + * This must be called at least once by frontend programs before using + * INSTR_TIME_SET_CURRENT* macros. Backend programs automatically initialize + * this through the GUC check hook. + */ +extern void pg_initialize_timing(bool allow_tsc_calibrate); + +/* + * Sets the time source to be used. Mainly intended for frontend programs, + * the backend should set it via the timing_clock_source GUC instead. + * + * Returns false if the clock source could not be set, for example when TSC + * is not available despite being explicitly set. 
+ */ +extern bool pg_set_timing_clock_source(TimingClockSourceType source); + +#if defined(__x86_64__) || defined(_M_X64) +#define PG_INSTR_TSC_CLOCK 1 +#define PG_INSTR_TICKS_TO_NS 1 +#elif defined(WIN32) +#define PG_INSTR_TSC_CLOCK 0 +#define PG_INSTR_TICKS_TO_NS 1 +#else +#define PG_INSTR_TSC_CLOCK 0 +#define PG_INSTR_TICKS_TO_NS 0 +#endif + + +#if PG_INSTR_TSC_CLOCK +/* Whether the hardware TSC clock is available and usable. */ +extern PGDLLIMPORT bool has_usable_tsc; + +/* Whether to actually use TSC based on availability and GUC settings. */ +extern PGDLLIMPORT bool use_tsc; + +#endif /* PG_INSTR_TSC_CLOCK */ + +/* + * Returns the current timing clock source effectively in use, resolving + * TIMING_CLOCK_SOURCE_AUTO to either TIMING_CLOCK_SOURCE_SYSTEM or + * TIMING_CLOCK_SOURCE_TSC. */ -extern void pg_initialize_timing(void); +static inline TimingClockSourceType +pg_current_timing_clock_source(void) +{ +#if PG_INSTR_TSC_CLOCK + return use_tsc ? TIMING_CLOCK_SOURCE_TSC : TIMING_CLOCK_SOURCE_SYSTEM; +#else + return TIMING_CLOCK_SOURCE_SYSTEM; +#endif +} #ifndef WIN32 @@ -119,22 +174,25 @@ extern void pg_initialize_timing(void); * than CLOCK_MONOTONIC. In particular, as of macOS 10.12, Apple provides * CLOCK_MONOTONIC_RAW which is both faster to read and higher resolution than * their version of CLOCK_MONOTONIC. + * + * Note this does not get used in case the TSC clock source logic is used, + * which directly calls architecture specific timing instructions (e.g. RDTSC). 
*/ #if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW) -#define PG_INSTR_CLOCK CLOCK_MONOTONIC_RAW +#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC_RAW #elif defined(CLOCK_MONOTONIC) -#define PG_INSTR_CLOCK CLOCK_MONOTONIC +#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC #else -#define PG_INSTR_CLOCK CLOCK_REALTIME +#define PG_INSTR_SYSTEM_CLOCK CLOCK_REALTIME #endif static inline instr_time -pg_get_ticks(void) +pg_get_ticks_system(void) { instr_time now; struct timespec tmp; - clock_gettime(PG_INSTR_CLOCK, &tmp); + clock_gettime(PG_INSTR_SYSTEM_CLOCK, &tmp); now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec; return now; @@ -145,7 +203,7 @@ pg_get_ticks(void) /* On Windows, use QueryPerformanceCounter() for system clock source */ static inline instr_time -pg_get_ticks(void) +pg_get_ticks_system(void) { instr_time now; LARGE_INTEGER tmp; @@ -198,6 +256,66 @@ pg_ticks_to_ns(int64 ticks) #endif /* PG_INSTR_TICKS_TO_NS */ } +#if PG_INSTR_TSC_CLOCK + +#ifdef _MSC_VER +#include +#endif /* defined(_MSC_VER) */ + +static inline instr_time +pg_get_ticks_fast(void) +{ + if (likely(use_tsc)) + { + instr_time now; + +#ifdef _MSC_VER + now.ticks = __rdtsc(); +#else + /* Avoid complex includes on clang/GCC that raise compile times */ + now.ticks = __builtin_ia32_rdtsc(); +#endif /* defined(_MSC_VER) */ + return now; + } + + return pg_get_ticks_system(); +} + +static inline instr_time +pg_get_ticks(void) +{ + if (likely(use_tsc)) + { + instr_time now; + uint32 unused; + +#ifdef _MSC_VER + now.ticks = __rdtscp(&unused); +#else + now.ticks = __builtin_ia32_rdtscp(&unused); +#endif /* defined(_MSC_VER) */ + return now; + } + + return pg_get_ticks_system(); +} + +#else + +static inline instr_time +pg_get_ticks_fast(void) +{ + return pg_get_ticks_system(); +} + +static inline instr_time +pg_get_ticks(void) +{ + return pg_get_ticks_system(); +} + +#endif /* PG_INSTR_TSC_CLOCK */ + /* * Common macros */ @@ -208,6 +326,9 @@ pg_ticks_to_ns(int64 ticks) #define INSTR_TIME_SET_NANOSEC(t, n) 
((t).ticks = n) +#define INSTR_TIME_SET_CURRENT_FAST(t) \ + ((t) = pg_get_ticks_fast()) + #define INSTR_TIME_SET_CURRENT(t) \ ((t) = pg_get_ticks()) diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index 9c90670d9b8d8..a396e746415e3 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -162,6 +162,9 @@ extern const char *show_timezone(void); extern bool check_timezone_abbreviations(char **newval, void **extra, GucSource source); extern void assign_timezone_abbreviations(const char *newval, void *extra); +extern void assign_timing_clock_source(int newval, void *extra); +extern bool check_timing_clock_source(int *newval, void **extra, GucSource source); +extern const char *show_timing_clock_source(void); extern bool check_transaction_buffers(int *newval, void **extra, GucSource source); extern bool check_transaction_deferrable(bool *newval, void **extra, GucSource source); extern bool check_transaction_isolation(int *newval, void **extra, GucSource source); diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 71a8016196138..63440b8e36c83 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -60,6 +60,7 @@ enum config_group CONN_AUTH_TCP, CONN_AUTH_AUTH, CONN_AUTH_SSL, + RESOURCES_TIME, RESOURCES_MEM, RESOURCES_DISK, RESOURCES_KERNEL, diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index 7b01c17750c35..fc29212f38ccf 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -25,6 +25,11 @@ #endif /* defined(_MSC_VER) */ #endif +#ifdef __linux__ +#include +#include +#endif + #ifdef HAVE_XSAVE_INTRINSICS #include #endif @@ -100,22 +105,23 @@ pg_attribute_target("xsave") void set_x86_features(void) { - CPUIDResult r = {0}; + CPUIDResult r = {0}, r2 = {0}; pg_cpuid(0x01, &r); X86Features[PG_SSE4_2] = r.ecx >> 20 & 1; X86Features[PG_POPCNT] = r.ecx >> 23 & 1; + X86Features[PG_HYPERVISOR] = r.ecx >> 31 & 1; + + pg_cpuid_subleaf(0x07, 0, &r2); - 
/* All these features depend on OSXSAVE */ + X86Features[PG_TSC_ADJUST] = (r2.ebx & (1 << 1)) != 0; + + /* leaf 7 features that depend on OSXSAVE */ if (r.ecx & (1 << 27)) { uint32 xcr0_val = 0; - /* second cpuid call on leaf 7 to check extended AVX-512 support */ - - pg_cpuid_subleaf(0x07, 0, &r); - #ifdef HAVE_XSAVE_INTRINSICS /* get value of Extended Control Register */ xcr0_val = _xgetbv(0); @@ -125,15 +131,169 @@ set_x86_features(void) if (mask_available(xcr0_val, XMM | YMM | OPMASK | ZMM0_15 | ZMM16_31)) { - X86Features[PG_AVX512_BW] = r.ebx >> 30 & 1; - X86Features[PG_AVX512_VL] = r.ebx >> 31 & 1; + X86Features[PG_AVX512_BW] = r2.ebx >> 30 & 1; + X86Features[PG_AVX512_VL] = r2.ebx >> 31 & 1; - X86Features[PG_AVX512_VPCLMULQDQ] = r.ecx >> 10 & 1; - X86Features[PG_AVX512_VPOPCNTDQ] = r.ecx >> 14 & 1; + X86Features[PG_AVX512_VPCLMULQDQ] = r2.ecx >> 10 & 1; + X86Features[PG_AVX512_VPOPCNTDQ] = r2.ecx >> 14 & 1; } } + /* Check for other TSC related flags */ + pg_cpuid(0x80000001, &r); + X86Features[PG_RDTSCP] = r.edx >> 27 & 1; + + pg_cpuid(0x80000007, &r); + X86Features[PG_TSC_INVARIANT] = r.edx >> 8 & 1; + X86Features[INIT_PG_X86] = true; } +/* + * Return the number of logical processors per physical CPU package (socket). + * + * This uses CPUID.0B (Extended Topology Enumeration) to enumerate topology + * levels. Each sub-leaf reports a level type in ECX[15:8] (1 = SMT, 2 = Core) + * and the number of logical processors at that level and below in EBX[15:0]. + * The value at the highest level gives us logical processors per package. + * + * Vendor-specific leaves (0x1F for Intel, 0x80000026 for AMD) provide + * finer-grained sub-package topology but are assumed to report the same + * per-package totals on current hardware. + * + * Returns 0 if topology information is not available. 
+ */ +int +x86_logical_processors_per_package(void) +{ + int logical_per_package = 0; + + for (int subleaf = 0; subleaf < 8; subleaf++) + { + CPUIDResult r = {0}; + uint32 level_type; + + if (!pg_cpuid_subleaf(0x0B, subleaf, &r)) + return 0; + + level_type = (r.ecx >> 8) & 0xff; + + /* level_type 0 means end of enumeration */ + if (level_type == 0) + break; + + logical_per_package = r.ebx & 0xffff; + } + + return logical_per_package; +} + +/* TSC (Time-stamp Counter) handling code */ + +static uint32 x86_hypervisor_tsc_frequency_khz(void); + +/* + * Determine the TSC frequency of the CPU, where supported. + * + * Needed to interpret the tick value returned by RDTSC/RDTSCP. Return value of + * 0 indicates TSC is not invariant, or the frequency information was not + * accessible and the instructions should not be used. + */ +uint32 +x86_tsc_frequency_khz(void) +{ + CPUIDResult r = {0}; + + if (!x86_feature_available(PG_TSC_INVARIANT)) + return 0; + + if (x86_feature_available(PG_HYPERVISOR)) + return x86_hypervisor_tsc_frequency_khz(); + + /* + * On modern Intel CPUs, the TSC is implemented by invariant timekeeping + * hardware, also called "Always Running Timer", or ART. The ART stays + * consistent even if the CPU changes frequency due to changing power + * levels. + * + * As documented in "Determining the Processor Base Frequency" in the + * "Intel® 64 and IA-32 Architectures Software Developer’s Manual", + * February 2026 Edition, we can get the TSC frequency as follows: + * + * Nominal TSC frequency = ( CPUID.15H:ECX[31:0] * CPUID.15H:EBX[31:0] ) / + * CPUID.15H:EAX[31:0] + * + * With CPUID.15H:ECX representing the nominal core crystal clock + * frequency, and EAX/EBX representing values used to translate the TSC + * value to that frequency, see "Chapter 20.17 "Time-Stamp Counter" of + * that manual. + * + * Older Intel CPUs, and other vendors do not set CPUID.15H:ECX, and as + * such we fall back to alternate approaches. 
+ */ + pg_cpuid(0x15, &r); + if (r.ecx > 0) + { + /* + * EBX not being set indicates invariant TSC is not available. Require + * EAX being non-zero too, to avoid a theoretical divide by zero. + */ + if (r.eax == 0 || r.ebx == 0) + return 0; + + return r.ecx / 1000 * r.ebx / r.eax; + } + + /* + * When CPUID.15H is not available/incomplete, but we have verified an + * invariant TSC is used, we can instead get the processor base frequency + * in MHz from CPUID.16H:EAX, the "Processor Frequency Information Leaf". + */ + pg_cpuid(0x16, &r); + if (r.eax > 0) + return r.eax * 1000; + + return 0; +} + +/* + * Support for reading TSC frequency for hypervisors passing it to a guest VM. + * + * Two Hypervisors (VMware and KVM) are known to make TSC frequency in KHz + * available at the vendor-specific 0x40000010 leaf in the EAX register. + * + * For some other Hypervisors that have an invariant TSC, e.g. HyperV, we would + * need to access an MSR to get the frequency (which is typically not available + * for unprivileged processes), so we instead rely on the TSC calibration logic. + */ +#define CPUID_HYPERVISOR_VMWARE(r) (r.ebx == 0x61774d56 && r.ecx == 0x4d566572 && r.edx == 0x65726177) /* VMwareVMware */ +#define CPUID_HYPERVISOR_KVM(r) (r.ebx == 0x4b4d564b && r.ecx == 0x564b4d56 && r.edx == 0x0000004d) /* KVMKVMKVM */ +static uint32 +x86_hypervisor_tsc_frequency_khz(void) +{ + CPUIDResult r = {0}; + +/* + * The hypervisor is determined using the 0x40000000 Hypervisor information + * leaf, which requires use of __cpuidex to set ECX to 0 to access it. + * + * The similar __get_cpuid_count function does not work as expected since it + * contains a check for __get_cpuid_max, which has been observed to be lower + * than the special Hypervisor leaf, despite it being available. 
+ */ +#if defined(HAVE__CPUIDEX) + __cpuidex((int *) &r, 0x40000000, 0); + + if (r.eax >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(r) || CPUID_HYPERVISOR_KVM(r))) + { + __cpuidex((int *) &r, 0x40000010, 0); + if (r.eax > 0) + return r.eax; + } +#endif /* HAVE__CPUIDEX */ + + return 0; +} + + #endif /* defined(USE_SSE2) || defined(__i386__) */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 489defe73626e..a3b76886caa61 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3118,6 +3118,7 @@ TimeoutId TimeoutType Timestamp TimestampTz +TimingClockSourceType TmFromChar TmToChar ToastAttrInfo From 96a7c19c2b4a7e260a46a017d09e152250013aa1 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Thu, 12 Feb 2026 01:12:19 -0800 Subject: [PATCH 31/32] pg_test_timing: Also test RDTSC/RDTSCP timing and report time source Author: David Geier Author: Lukas Fittl Reviewed-by: Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de --- src/bin/pg_test_timing/pg_test_timing.c | 96 ++++++++++++++++++++++--- src/include/portability/instr_time.h | 6 ++ 2 files changed, 93 insertions(+), 9 deletions(-) diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index 9f4b196c4bb8d..329957d061c7c 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -30,14 +30,16 @@ static long long int largest_diff_count; static void handle_args(int argc, char *argv[]); -static uint64 test_timing(unsigned int duration); +static void test_system_timing(void); +#if PG_INSTR_TSC_CLOCK +static void test_tsc_timing(void); +#endif +static uint64 test_timing(unsigned int duration, TimingClockSourceType source, bool fast_timing); static void output(uint64 loop_count); int main(int argc, char *argv[]) { - uint64 loop_count; - set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_test_timing")); progname = 
get_progname(argv[0]); @@ -56,9 +58,11 @@ main(int argc, char *argv[]) pg_initialize_timing(true); #endif - loop_count = test_timing(test_duration); + test_system_timing(); - output(loop_count); +#if PG_INSTR_TSC_CLOCK + test_tsc_timing(); +#endif return 0; } @@ -156,14 +160,66 @@ handle_args(int argc, char *argv[]) exit(1); } - printf(ngettext("Testing timing overhead for %u second.\n", - "Testing timing overhead for %u seconds.\n", + printf(ngettext("Testing timing overhead for %u second.\n\n", + "Testing timing overhead for %u seconds.\n\n", test_duration), test_duration); } +/* + * This tests default (non-fast) timing code. A clock source for that is + * always available. Hence, we can unconditionally output the result. + */ +static void +test_system_timing(void) +{ + uint64 loop_count; + + loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_SYSTEM, false); + output(loop_count); +} + +/* + * If on a supported architecture, test the TSC clock source. This clock + * source is not always available. In that case we print an informational + * message indicating as such. + * + * We first emit "slow" timings (RDTSCP on x86), which are used for higher + * precision measurements when the TSC clock source is enabled. We emit + * "fast" timings second (RDTSC on x86), which is used for faster timing + * measurements with lower precision. 
+ */
+#if PG_INSTR_TSC_CLOCK
+static void
+test_tsc_timing(void)
+{
+	uint64		loop_count;
+
+	printf("\n");
+	loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_TSC, false);
+	if (loop_count > 0)
+	{
+		output(loop_count);
+		printf("\n");
+
+		/* Now, emit fast timing measurements */
+		loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_TSC, true);
+		output(loop_count);
+		printf("\n");
+
+		pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_AUTO);
+		if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC)
+			printf(_("TSC clock source will be used by default, unless timing_clock_source is set to 'system'.\n"));
+		else
+			printf(_("TSC clock source will not be used by default, unless timing_clock_source is set to 'tsc'.\n"));
+	}
+	else
+		printf(_("TSC clock source is not usable. Likely unable to determine TSC frequency. Are you running in an unsupported virtualized environment?\n"));
+}
+#endif
+
 static uint64
-test_timing(unsigned int duration)
+test_timing(unsigned int duration, TimingClockSourceType source, bool fast_timing)
 {
 	uint64		loop_count = 0;
 	instr_time	start_time,
@@ -171,6 +227,24 @@ test_timing(unsigned int duration)
 				duration_time,
 				prev,
 				cur;
+	char	   *time_source = NULL;
+
+	if (!pg_set_timing_clock_source(source))
+		return 0;
+
+	time_source = PG_INSTR_SYSTEM_CLOCK_NAME;
+
+#if PG_INSTR_TSC_CLOCK
+	if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC)
+		time_source = fast_timing ? PG_INSTR_TSC_CLOCK_NAME_FAST : PG_INSTR_TSC_CLOCK_NAME;
+#endif
+
+	if (fast_timing)
+		printf(_("Fast clock source: %s\n"), time_source);
+	else if (source == TIMING_CLOCK_SOURCE_SYSTEM)
+		printf(_("System clock source: %s\n"), time_source);
+	else
+		printf(_("Clock source: %s\n"), time_source);
 
 	/*
 	 * Pre-zero the statistics data structures.
They're already zero by
@@ -197,7 +271,11 @@ test_timing(unsigned int duration)
 		instr_time	diff_time;
 
 		prev = cur;
-		INSTR_TIME_SET_CURRENT(cur);
+
+		if (fast_timing)
+			INSTR_TIME_SET_CURRENT_FAST(cur);
+		else
+			INSTR_TIME_SET_CURRENT(cur);
 
 		diff_time = cur;
 		INSTR_TIME_SUBTRACT(diff_time, prev);
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index a7640d0e72a57..681e3f5bf8d3e 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -123,6 +123,8 @@ extern bool pg_set_timing_clock_source(TimingClockSourceType source);
 
 #if defined(__x86_64__) || defined(_M_X64)
 #define PG_INSTR_TSC_CLOCK 1
+#define PG_INSTR_TSC_CLOCK_NAME_FAST "RDTSC"
+#define PG_INSTR_TSC_CLOCK_NAME "RDTSCP"
 #define PG_INSTR_TICKS_TO_NS 1
 #elif defined(WIN32)
 #define PG_INSTR_TSC_CLOCK 0
@@ -180,10 +182,13 @@ pg_current_timing_clock_source(void)
  */
 #if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW)
 #define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC_RAW
+#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC_RAW)"
 #elif defined(CLOCK_MONOTONIC)
 #define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC
+#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC)"
 #else
 #define PG_INSTR_SYSTEM_CLOCK CLOCK_REALTIME
+#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_REALTIME)"
 #endif
 
 static inline instr_time
@@ -202,6 +207,7 @@ pg_get_ticks_system(void)
 
 /* On Windows, use QueryPerformanceCounter() for system clock source */
+#define PG_INSTR_SYSTEM_CLOCK_NAME "QueryPerformanceCounter"
 static inline instr_time
 pg_get_ticks_system(void)
 {

From 6aa00e5ed2a5eb8714323f505efe87b5b0011393 Mon Sep 17 00:00:00 2001
From: Lukas Fittl
Date: Tue, 10 Mar 2026 01:38:14 -0700
Subject: [PATCH 32/32] instrumentation: ARM support for fast time measurements

Similar to the RDTSC/RDTSCP instructions on x86-64, this introduces
use of the cntvct_el0 instruction on ARM systems to access the generic
timer that provides a synchronized
ticks value across CPUs.

Note this adds an exception for Apple Silicon CPUs, due to the observed
fact that M3 and newer have different timer frequencies for the
Efficiency and the Performance cores, and we can't be sure where we get
scheduled.

To simplify the implementation this does not support Windows on ARM,
since it's quite rare and hard to test.

Relies on the existing timing_clock_source GUC to control whether the
TSC-like timer gets used, instead of the system timer.

Author: Lukas Fittl
Reviewed-by:
Discussion:
---
 src/common/instr_time.c              | 65 ++++++++++++++++++++++++++--
 src/include/port/pg_cpu.h            |  6 +++
 src/include/portability/instr_time.h | 57 ++++++++++++++++++++++--
 src/port/meson.build                 |  1 +
 src/port/pg_cpu_arm.c                | 45 +++++++++++++++++++
 5 files changed, 166 insertions(+), 8 deletions(-)
 create mode 100644 src/port/pg_cpu_arm.c

diff --git a/src/common/instr_time.c b/src/common/instr_time.c
index 2becf9b0780c0..7d74c058d7aa4 100644
--- a/src/common/instr_time.c
+++ b/src/common/instr_time.c
@@ -20,6 +20,10 @@
 #include
 #endif
 
+#if defined(__APPLE__)
+#include
+#endif
+
 #include "port/pg_cpu.h"
 #include "portability/instr_time.h"
 
@@ -162,7 +166,7 @@ set_ticks_per_ns()
 #endif
 }
 
-/* TSC specific logic */
+/* Hardware clock specific logic (x86 TSC / AArch64 CNTVCT) */
 
 #if PG_INSTR_TSC_CLOCK
 
@@ -170,6 +174,19 @@
 bool		use_tsc = false;
 static uint32 tsc_frequency_khz = 0;
 
+static void
+set_ticks_per_ns_for_tsc(void)
+{
+	ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / tsc_frequency_khz;
+	max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
+}
+
+#if defined(__x86_64__) || defined(_M_X64)
+
+/*
+ * x86-64 TSC specific logic
+ */
+
 static uint32 tsc_calibrate(void);
 
 /*
@@ -363,11 +380,51 @@ tsc_calibrate(void)
 	return 0;
 }
 
+#elif defined(__aarch64__)
+
+/*
+ * Check whether this is a heterogeneous Apple Silicon P+E core system
+ * where CNTVCT_EL0 may tick at different rates on different core types.
+ */
+static bool
+aarch64_has_heterogeneous_cores(void)
+{
+#if defined(__APPLE__)
+	int			nperflevels = 0;
+	size_t		len = sizeof(nperflevels);
+
+	if (sysctlbyname("hw.nperflevels", &nperflevels, &len, NULL, 0) == 0)
+		return nperflevels > 1;
+#endif
+
+	return false;
+}
+
+/*
+ * Initialize the AArch64 generic timer as a clock source.
+ */
 static void
-set_ticks_per_ns_for_tsc(void)
+tsc_initialize(bool allow_tsc_calibration)
 {
-	ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / tsc_frequency_khz;
-	max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
+	if (aarch64_has_heterogeneous_cores())
+		return;
+
+	tsc_frequency_khz = aarch64_cntvct_frequency_khz();
+	if (tsc_frequency_khz != 0)
+		has_usable_tsc = true;
+}
+
+/*
+ * The ARM generic timer is architecturally guaranteed to be monotonic and
+ * synchronized across cores of the same type, so we always use it by default
+ * when available and cores are homogeneous.
+ */
+static bool
+tsc_use_by_default(void)
+{
+	return true;
 }
+#endif							/* defined(__aarch64__) */
+
 #endif							/* PG_INSTR_TSC_CLOCK */
diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h
index a32e67487f834..82df66f381e03 100644
--- a/src/include/port/pg_cpu.h
+++ b/src/include/port/pg_cpu.h
@@ -56,4 +56,10 @@
 extern uint32 x86_tsc_frequency_khz(void);
 
 #endif							/* defined(USE_SSE2) || defined(__i386__) */
 
+#if defined(__aarch64__)
+
+extern uint32 aarch64_cntvct_frequency_khz(void);
+
+#endif							/* defined(__aarch64__) */
+
 #endif							/* PG_CPU_H */
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index 681e3f5bf8d3e..ac8020bdd629d 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -4,8 +4,9 @@
  * portable high-precision interval timing
  *
  * This file provides an abstraction layer to hide portability issues in
- * interval timing.
On x86 we use the RDTSC/RDTSCP instruction directly in - * certain cases, or alternatively clock_gettime() on Unix-like systems and + * interval timing. On x86 we use the RDTSC/RDTSCP instruction, and on + * AArch64 the CNTVCT_EL0 generic timer, directly in certain cases, or + * alternatively clock_gettime() on Unix-like systems and * QueryPerformanceCounter() on Windows. These macros also give some breathing * room to use other high-precision-timing APIs. * @@ -126,6 +127,11 @@ extern bool pg_set_timing_clock_source(TimingClockSourceType source); #define PG_INSTR_TSC_CLOCK_NAME_FAST "RDTSC" #define PG_INSTR_TSC_CLOCK_NAME "RDTSCP" #define PG_INSTR_TICKS_TO_NS 1 +#elif defined(__aarch64__) && !defined(WIN32) +#define PG_INSTR_TSC_CLOCK 1 +#define PG_INSTR_TSC_CLOCK_NAME_FAST "CNTVCT_EL0" +#define PG_INSTR_TSC_CLOCK_NAME "CNTVCT_EL0 (ISB)" +#define PG_INSTR_TICKS_TO_NS 1 #elif defined(WIN32) #define PG_INSTR_TSC_CLOCK 0 #define PG_INSTR_TICKS_TO_NS 1 @@ -134,7 +140,6 @@ extern bool pg_set_timing_clock_source(TimingClockSourceType source); #define PG_INSTR_TICKS_TO_NS 0 #endif - #if PG_INSTR_TSC_CLOCK /* Whether the hardware TSC clock is available and usable. */ extern PGDLLIMPORT bool has_usable_tsc; @@ -264,6 +269,8 @@ pg_ticks_to_ns(int64 ticks) #if PG_INSTR_TSC_CLOCK +#if defined(__x86_64__) || defined(_M_X64) + #ifdef _MSC_VER #include #endif /* defined(_MSC_VER) */ @@ -306,7 +313,49 @@ pg_get_ticks(void) return pg_get_ticks_system(); } -#else +#elif defined(__aarch64__) && !defined(WIN32) + +/* + * Read the ARM generic timer counter (CNTVCT_EL0). + * + * The "fast" variant reads the counter without a barrier, analogous to RDTSC + * on x86. The regular variant issues an ISB (Instruction Synchronization + * Barrier) first, which acts as a serializing instruction analogous to RDTSCP, + * ensuring all preceding instructions have completed before reading the + * counter. 
+ */ +static inline instr_time +pg_get_ticks_fast(void) +{ + if (likely(use_tsc)) + { + instr_time now; + + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + + return pg_get_ticks_system(); +} + +static inline instr_time +pg_get_ticks(void) +{ + if (likely(use_tsc)) + { + instr_time now; + + __builtin_arm_isb(0xf); + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + + return pg_get_ticks_system(); +} + +#endif /* defined(__aarch64__) */ + +#else /* !PG_INSTR_TSC_CLOCK */ static inline instr_time pg_get_ticks_fast(void) diff --git a/src/port/meson.build b/src/port/meson.build index 7296f8e3c037f..110bcd28edd4c 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_cpu_arm.c', 'pg_cpu_x86.c', 'pg_localeconv_r.c', 'pg_numa.c', diff --git a/src/port/pg_cpu_arm.c b/src/port/pg_cpu_arm.c new file mode 100644 index 0000000000000..6fd9dd892ec98 --- /dev/null +++ b/src/port/pg_cpu_arm.c @@ -0,0 +1,45 @@ +/*------------------------------------------------------------------------- + * + * pg_cpu_arm.c + * Runtime CPU feature detection for AArch64 + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_cpu_arm.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" + +#if defined(__aarch64__) && !defined(WIN32) + +#include "port/pg_cpu.h" + +/* + * Return the frequency of the ARM generic timer (CNTVCT_EL0) in kHz. + * + * The CNTFRQ_EL0 system register is architecturally guaranteed to be readable + * from EL0 (userspace) and holds the timer frequency in Hz. The firmware sets + * this at boot and it does not change. + * + * Returns 0 if the frequency is not available (should not happen on conforming + * implementations). 
+ */ +uint32 +aarch64_cntvct_frequency_khz(void) +{ + uint64 freq; + + freq = __builtin_arm_rsr64("cntfrq_el0"); + + if (freq == 0) + return 0; + + return (uint32) (freq / 1000); +} + +#endif /* defined(__aarch64__) */