diff --git a/configure b/configure index 4aaaf92ba0a12..8279bdd8095f9 100755 --- a/configure +++ b/configure @@ -5341,6 +5341,7 @@ if test x"$pgac_cv_prog_CC_cflags__Werror_vla" = x"yes"; then fi + # -Wvla is not applicable for C++ # On macOS, complain about usage of symbols newer than the deployment target { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} supports -Werror=unguarded-availability-new, for CFLAGS" >&5 @@ -5433,7 +5434,6 @@ if test x"$pgac_cv_prog_CXX_cxxflags__Werror_unguarded_availability_new" = x"yes fi - # -Wvla is not applicable for C++ { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} supports -Wendif-labels, for CFLAGS" >&5 $as_echo_n "checking whether ${CC} supports -Wendif-labels, for CFLAGS... " >&6; } @@ -17657,7 +17657,8 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h fi fi -# Check for __get_cpuid_count() and __cpuidex() in a similar fashion. +# Check for __get_cpuid_count() and __cpuidex() separately, since we sometimes +# need __cpuidex() even if __get_cpuid_count() is available. { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5 $as_echo_n "checking for __get_cpuid_count... " >&6; } if ${pgac_cv__get_cpuid_count+:} false; then : @@ -17690,21 +17691,25 @@ if test x"$pgac_cv__get_cpuid_count" = x"yes"; then $as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h -else - # __cpuidex() - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5 +fi +# __cpuidex() +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5 $as_echo_n "checking for __cpuidex... " >&6; } if ${pgac_cv__cpuidex+:} false; then : $as_echo_n "(cached) " >&6 else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ -#include +#ifdef _MSC_VER + #include + #else + #include + #endif int main () { -unsigned int exx[4] = {0, 0, 0, 0}; - __cpuidex(exx, 7, 0); +int exx[4] = {0, 0, 0, 0}; + __cpuidex(exx, 7, 0); ; return 0; @@ -17720,11 +17725,10 @@ rm -f core conftest.err conftest.$ac_objext \ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5 $as_echo "$pgac_cv__cpuidex" >&6; } - if test x"$pgac_cv__cpuidex" = x"yes"; then +if test x"$pgac_cv__cpuidex" = x"yes"; then $as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h - fi fi # Check for XSAVE intrinsics diff --git a/configure.ac b/configure.ac index 9bc457bac87a2..b5e6bbcb1269c 100644 --- a/configure.ac +++ b/configure.ac @@ -549,10 +549,10 @@ if test "$GCC" = yes -a "$ICC" = no; then AC_SUBST(PERMIT_DECLARATION_AFTER_STATEMENT) # Really don't want VLAs to be used in our dialect of C PGAC_PROG_CC_CFLAGS_OPT([-Werror=vla]) + # -Wvla is not applicable for C++ # On macOS, complain about usage of symbols newer than the deployment target PGAC_PROG_CC_CFLAGS_OPT([-Werror=unguarded-availability-new]) PGAC_PROG_CXX_CFLAGS_OPT([-Werror=unguarded-availability-new]) - # -Wvla is not applicable for C++ PGAC_PROG_CC_CFLAGS_OPT([-Wendif-labels]) PGAC_PROG_CXX_CFLAGS_OPT([-Wendif-labels]) PGAC_PROG_CC_CFLAGS_OPT([-Wmissing-format-attribute]) @@ -2104,7 +2104,8 @@ else fi fi -# Check for __get_cpuid_count() and __cpuidex() in a similar fashion. +# Check for __get_cpuid_count() and __cpuidex() separately, since we sometimes +# need __cpuidex() even if __get_cpuid_count() is available. 
AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count], [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], [[unsigned int exx[4] = {0, 0, 0, 0}; @@ -2114,18 +2115,21 @@ AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count], [pgac_cv__get_cpuid_count="no"])]) if test x"$pgac_cv__get_cpuid_count" = x"yes"; then AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid_count.]) -else - # __cpuidex() - AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex], - [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], - [[unsigned int exx[4] = {0, 0, 0, 0}; - __cpuidex(exx, 7, 0); - ]])], - [pgac_cv__cpuidex="yes"], - [pgac_cv__cpuidex="no"])]) - if test x"$pgac_cv__cpuidex" = x"yes"; then - AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) - fi +fi +# __cpuidex() +AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#ifdef _MSC_VER + #include + #else + #include + #endif], + [[int exx[4] = {0, 0, 0, 0}; + __cpuidex(exx, 7, 0); + ]])], + [pgac_cv__cpuidex="yes"], + [pgac_cv__cpuidex="no"])]) +if test x"$pgac_cv__cpuidex" = x"yes"; then + AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) fi # Check for XSAVE intrinsics diff --git a/contrib/bloom/blscan.c b/contrib/bloom/blscan.c index 0535d45f2d825..1a0e42021ec1e 100644 --- a/contrib/bloom/blscan.c +++ b/contrib/bloom/blscan.c @@ -18,6 +18,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" +#include "storage/read_stream.h" /* * Begin scan of bloom index. 
@@ -76,11 +77,13 @@ int64 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) { int64 ntids = 0; - BlockNumber blkno = BLOOM_HEAD_BLKNO, + BlockNumber blkno, npages; int i; BufferAccessStrategy bas; BloomScanOpaque so = (BloomScanOpaque) scan->opaque; + BlockRangeReadStreamPrivate p; + ReadStream *stream; if (so->sign == NULL) { @@ -120,14 +123,29 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) if (scan->instrument) scan->instrument->nsearches++; + /* Scan all blocks except the metapage using streaming reads */ + p.current_blocknum = BLOOM_HEAD_BLKNO; + p.last_exclusive = npages; + + /* + * It is safe to use batchmode as block_range_read_stream_cb takes no + * locks. + */ + stream = read_stream_begin_relation(READ_STREAM_FULL | + READ_STREAM_USE_BATCHING, + bas, + scan->indexRelation, + MAIN_FORKNUM, + block_range_read_stream_cb, + &p, + 0); + for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) { Buffer buffer; Page page; - buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM, - blkno, RBM_NORMAL, bas); - + buffer = read_stream_next_buffer(stream, NULL); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); @@ -163,6 +181,9 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) UnlockReleaseBuffer(buffer); CHECK_FOR_INTERRUPTS(); } + + Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer); + read_stream_end(stream); FreeAccessStrategy(bas); return ntids; diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 8cdd826fbd37a..99a6593d9ac59 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2533,6 +2533,60 @@ include_dir 'conf.d' + + Timing + + + + timing_clock_source (enum) + + timing_clock_source configuration parameter + + + + + Selects the method for making timing measurements using the OS or specialized CPU + instructions. 
Possible values are: + + + + auto (automatically chooses TSC clock source for modern CPUs, + otherwise uses the OS system clock) + + + + + system (measures timing using the OS system clock) + + + + + tsc (measures timing using the x86-64 Time-Stamp Counter (TSC) + by directly executing RDTSC/RDTSCP instructions, see below) + + + + The default is auto. + + + If enabled, the TSC clock source will use the RDTSC instruction for the x86-64 + Time-Stamp Counter (TSC) to perform certain time measurements, for example during + EXPLAIN ANALYZE. The RDTSC instruction has less overhead than going through the OS + clock source, which for an EXPLAIN ANALYZE statement will show timing closer to the + actual runtime when timing is off. For timings that require higher precision the + RDTSCP instruction is used, which avoids inaccuracies due to CPU instruction re-ordering. + Use of RDTSC/RDTSC is not supported on older CPUs or hypervisors that don't pass the TSC + frequency to guest VMs, and is not advised on systems that utilize an emulated TSC. + + + To help decide which clock source to use on an x86-64 system you can run the + pg_test_timing utility to check TSC availability, and + perform timing measurements. + + + + + Background Writer diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index b3d5355068801..cc014564c9704 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -413,6 +413,14 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser + + pg_stat_progress_repackpg_stat_progress_repack + One row for each backend running + REPACK, showing current progress. See + . + + + pg_stat_progress_basebackuppg_stat_progress_basebackup One row for each WAL sender process streaming a base backup, @@ -5796,9 +5804,9 @@ FROM pg_stat_get_backend_idset() AS backendid; PostgreSQL has the ability to report the progress of certain commands during command execution. 
Currently, the only commands which support progress reporting are ANALYZE, - CLUSTER, - CREATE INDEX, VACUUM, - COPY, + COPY, CREATE INDEX, + REPACK (and its obsolete spelling CLUSTER), + VACUUM, and (i.e., replication command that issues to take a base backup). @@ -6731,6 +6739,218 @@ FROM pg_stat_get_backend_idset() AS backendid; + + REPACK Progress Reporting + + + pg_stat_progress_repack + + + + Whenever REPACK is running, + the pg_stat_progress_repack view will contain a + row for each backend that is currently running the command. The tables + below describe the information that will be reported and provide + information about how to interpret it. + + + + <structname>pg_stat_progress_repack</structname> View + + + + + Column Type + + + Description + + + + + + + + pid integer + + + Process ID of backend. + + + + + + datid oid + + + OID of the database to which this backend is connected. + + + + + + datname name + + + Name of the database to which this backend is connected. + + + + + + relid oid + + + OID of the table being repacked. + + + + + + phase text + + + Current processing phase. See . + + + + + + repack_index_relid oid + + + If the table is being scanned using an index, this is the OID of the + index being used; otherwise, it is zero. + + + + + + heap_tuples_scanned bigint + + + Number of heap tuples scanned. + This counter only advances when the phase is + seq scanning heap, + index scanning heap + or writing new heap. + + + + + + heap_tuples_written bigint + + + Number of heap tuples written. + This counter only advances when the phase is + seq scanning heap, + index scanning heap + or writing new heap. + + + + + + heap_blks_total bigint + + + Total number of heap blocks in the table. This number is reported + as of the beginning of seq scanning heap. + + + + + + heap_blks_scanned bigint + + + Number of heap blocks scanned. This counter only advances when the + phase is seq scanning heap. 
+ + + + + + index_rebuild_count bigint + + + Number of indexes rebuilt. This counter only advances when the phase + is rebuilding index. + + + + +
+ + + REPACK Phases + + + + + + Phase + Description + + + + + + initializing + + The command is preparing to begin scanning the heap. This phase is + expected to be very brief. + + + + seq scanning heap + + The command is currently scanning the table using a sequential scan. + + + + index scanning heap + + REPACK is currently scanning the table using an index scan. + + + + sorting tuples + + REPACK is currently sorting tuples. + + + + writing new heap + + REPACK is currently writing the new heap. + + + + swapping relation files + + The command is currently swapping newly-built files into place. + + + + rebuilding index + + The command is currently rebuilding an index. + + + + performing final cleanup + + The command is performing final cleanup. When this phase is + completed, REPACK will end. + + + + +
+
+ VACUUM Progress Reporting diff --git a/doc/src/sgml/ref/allfiles.sgml b/doc/src/sgml/ref/allfiles.sgml index e167406c74490..141ada9c50a05 100644 --- a/doc/src/sgml/ref/allfiles.sgml +++ b/doc/src/sgml/ref/allfiles.sgml @@ -167,6 +167,7 @@ Complete list of usable sgml source files in this directory. + diff --git a/doc/src/sgml/ref/alter_foreign_table.sgml b/doc/src/sgml/ref/alter_foreign_table.sgml index e6d99e99016e7..228067f087cee 100644 --- a/doc/src/sgml/ref/alter_foreign_table.sgml +++ b/doc/src/sgml/ref/alter_foreign_table.sgml @@ -32,7 +32,7 @@ ALTER FOREIGN TABLE [ IF EXISTS ] namewhere action is one of: - ADD [ COLUMN ] column_name data_type [ COLLATE collation ] [ column_constraint [ ... ] ] + ADD [ COLUMN ] [ IF NOT EXISTS ] column_name data_type [ COLLATE collation ] [ column_constraint [ ... ] ] DROP [ COLUMN ] [ IF EXISTS ] column_name [ RESTRICT | CASCADE ] ALTER [ COLUMN ] column_name [ SET DATA ] TYPE data_type [ COLLATE collation ] ALTER [ COLUMN ] column_name SET DEFAULT expression @@ -67,11 +67,13 @@ ALTER FOREIGN TABLE [ IF EXISTS ] name - ADD COLUMN + ADD [ COLUMN ] [ IF NOT EXISTS ] This form adds a new column to the foreign table, using the same syntax as CREATE FOREIGN TABLE. + If IF NOT EXISTS is specified and a column already + exists with this name, no error is thrown. Unlike the case when adding a column to a regular table, nothing happens to the underlying storage: this action simply declares that some new column is now accessible through the foreign table. @@ -80,7 +82,7 @@ ALTER FOREIGN TABLE [ IF EXISTS ] name - DROP COLUMN [ IF EXISTS ] + DROP [ COLUMN ] [ IF EXISTS ] This form drops a column from a foreign table. diff --git a/doc/src/sgml/ref/cluster.sgml b/doc/src/sgml/ref/cluster.sgml index 0b47460080b92..17778e9471c42 100644 --- a/doc/src/sgml/ref/cluster.sgml +++ b/doc/src/sgml/ref/cluster.sgml @@ -33,50 +33,9 @@ CLUSTER [ ( option [, ...] 
) ] [ Description - CLUSTER instructs PostgreSQL - to cluster the table specified - by table_name - based on the index specified by - index_name. The index must - already have been defined on - table_name. - - - - When a table is clustered, it is physically reordered - based on the index information. Clustering is a one-time operation: - when the table is subsequently updated, the changes are - not clustered. That is, no attempt is made to store new or - updated rows according to their index order. (If one wishes, one can - periodically recluster by issuing the command again. Also, setting - the table's fillfactor storage parameter to less than - 100% can aid in preserving cluster ordering during updates, since updated - rows are kept on the same page if enough space is available there.) - - - - When a table is clustered, PostgreSQL - remembers which index it was clustered by. The form - CLUSTER table_name - reclusters the table using the same index as before. You can also - use the CLUSTER or SET WITHOUT CLUSTER - forms of ALTER TABLE to set the index to be used for - future cluster operations, or to clear any previous setting. - - - - CLUSTER without a - table_name reclusters all the - previously-clustered tables in the current database that the calling user - has privileges for. This form of CLUSTER cannot be - executed inside a transaction block. - - - - When a table is being clustered, an ACCESS - EXCLUSIVE lock is acquired on it. This prevents any other - database operations (both reads and writes) from operating on the - table until the CLUSTER is finished. + The CLUSTER command is equivalent to + with an USING INDEX + clause. See there for more details. @@ -136,63 +95,12 @@ CLUSTER [ ( option [, ...] ) ] [ - - In cases where you are accessing single rows randomly - within a table, the actual order of the data in the - table is unimportant. 
However, if you tend to access some - data more than others, and there is an index that groups - them together, you will benefit from using CLUSTER. - If you are requesting a range of indexed values from a table, or a - single indexed value that has multiple rows that match, - CLUSTER will help because once the index identifies the - table page for the first row that matches, all other rows - that match are probably already on the same table page, - and so you save disk accesses and speed up the query. - - - - CLUSTER can re-sort the table using either an index scan - on the specified index, or (if the index is a b-tree) a sequential - scan followed by sorting. It will attempt to choose the method that - will be faster, based on planner cost parameters and available statistical - information. - - While CLUSTER is running, the is temporarily changed to pg_catalog, pg_temp. - - When an index scan is used, a temporary copy of the table is created that - contains the table data in the index order. Temporary copies of each - index on the table are created as well. Therefore, you need free space on - disk at least equal to the sum of the table size and the index sizes. - - - - When a sequential scan and sort is used, a temporary sort file is - also created, so that the peak temporary space requirement is as much - as double the table size, plus the index sizes. This method is often - faster than the index scan method, but if the disk space requirement is - intolerable, you can disable this choice by temporarily setting to off. - - - - It is advisable to set to - a reasonably large value (but not more than the amount of RAM you can - dedicate to the CLUSTER operation) before clustering. - - - - Because the planner records statistics about the ordering of - tables, it is advisable to run ANALYZE - on the newly clustered table. - Otherwise, the planner might make poor choices of query plans. 
- - Because CLUSTER remembers which indexes are clustered, one can cluster the tables one wants clustered manually the first time, @@ -270,6 +178,7 @@ CLUSTER index_name ON See Also + diff --git a/doc/src/sgml/ref/repack.sgml b/doc/src/sgml/ref/repack.sgml new file mode 100644 index 0000000000000..8ccf7c7a417b5 --- /dev/null +++ b/doc/src/sgml/ref/repack.sgml @@ -0,0 +1,330 @@ + + + + + REPACK + + + + REPACK + 7 + SQL - Language Statements + + + + REPACK + rewrite a table to reclaim disk space + + + + +REPACK [ ( option [, ...] ) ] [ table_and_columns [ USING INDEX [ index_name ] ] ] +REPACK [ ( option [, ...] ) ] USING INDEX + +where option can be one of: + + VERBOSE [ boolean ] + ANALYZE [ boolean ] + +and table_and_columns is: + + table_name [ ( column_name [, ...] ) ] + + + + + Description + + + REPACK reclaims storage occupied by dead + tuples. Unlike VACUUM, it does so by rewriting the + entire contents of the table specified + by table_name into a new disk + file with no extra space (except for the space guaranteed by + the fillfactor storage parameter), allowing unused space + to be returned to the operating system. + + + + Without + a table_name, REPACK + processes every table and materialized view in the current database that + the current user has the MAINTAIN privilege on. This + form of REPACK cannot be executed inside a transaction + block. + + + + If a USING INDEX clause is specified, the rows are + physically reordered based on information from an index. Please see the + notes on clustering below. + + + + When a table is being repacked, an ACCESS EXCLUSIVE lock + is acquired on it. This prevents any other database operations (both reads + and writes) from operating on the table until the REPACK + is finished. + + + + Notes on Clustering + + + If the USING INDEX clause is specified, the rows in + the table are stored in the order that the index specifies; + clustering, because rows are physically clustered + afterwards. 
+ If an index name is specified in the command, the order implied by that + index is used, and that index is configured as the index to cluster on. + (This also applies to an index given to the CLUSTER + command.) + If no index name is specified, then the index that has + been configured as the index to cluster on is used; an + error is thrown if none has. + An index can be set manually using ALTER TABLE ... CLUSTER ON, + and reset with ALTER TABLE ... SET WITHOUT CLUSTER. + + + + If no table name is specified in REPACK USING INDEX, + all tables which have a clustering index defined and which the calling + user has privileges for are processed. + + + + Clustering is a one-time operation: when the table is + subsequently updated, the changes are not clustered. That is, no attempt + is made to store new or updated rows according to their index order. (If + one wishes, one can periodically recluster by issuing the command again. + Also, setting the table's fillfactor storage parameter + to less than 100% can aid in preserving cluster ordering during updates, + since updated rows are kept on the same page if enough space is available + there.) + + + + In cases where you are accessing single rows randomly within a table, the + actual order of the data in the table is unimportant. However, if you tend + to access some data more than others, and there is an index that groups + them together, you will benefit from using clustering. If + you are requesting a range of indexed values from a table, or a single + indexed value that has multiple rows that match, + clustering will help because once the index identifies the + table page for the first row that matches, all other rows that match are + probably already on the same table page, and so you save disk accesses and + speed up the query. + + + + REPACK can re-sort the table using either an index scan + on the specified index (if the index is a b-tree), or a sequential scan + followed by sorting. 
It will attempt to choose the method that will be + faster, based on planner cost parameters and available statistical + information. + + + + Because the planner records statistics about the ordering of tables, it is + advisable to + run ANALYZE on the + newly repacked table. Otherwise, the planner might make poor choices of + query plans. + + + + + Notes on Resources + + + When an index scan or a sequential scan without sort is used, a temporary + copy of the table is created that contains the table data in the index + order. Temporary copies of each index on the table are created as well. + Therefore, you need free space on disk at least equal to the sum of the + table size and the index sizes. + + + + When a sequential scan and sort is used, a temporary sort file is also + created, so that the peak temporary space requirement is as much as double + the table size, plus the index sizes. This method is often faster than + the index scan method, but if the disk space requirement is intolerable, + you can disable this choice by temporarily setting + to off. + + + + It is advisable to set to a + reasonably large value (but not more than the amount of RAM you can + dedicate to the REPACK operation) before repacking. + + + + + + + Parameters + + + + table_name + + + The name (possibly schema-qualified) of a table. + + + + + + column_name + + + The name of a specific column to analyze. Defaults to all columns. + If a column list is specific, ANALYZE must also + be specified. + + + + + + index_name + + + The name of an index. + + + + + + VERBOSE + + + Prints a progress report as each table is repacked + at INFO level. + + + + + + ANALYZE + ANALYSE + + + Applies on the table after repacking. This is + currently only supported when a single (non-partitioned) table is specified. + + + + + + boolean + + + Specifies whether the selected option should be turned on or off. + You can write TRUE, ON, or + 1 to enable the option, and FALSE, + OFF, or 0 to disable it. 
The + boolean value can also + be omitted, in which case TRUE is assumed. + + + + + + + + Notes + + + To repack a table, one must have the MAINTAIN privilege + on the table. + + + + While REPACK is running, the is temporarily changed to pg_catalog, + pg_temp. + + + + Each backend running REPACK will report its progress + in the pg_stat_progress_repack view. See + for details. + + + + Repacking a partitioned table repacks each of its partitions. If an index + is specified, each partition is repacked using the partition of that + index. REPACK on a partitioned table cannot be executed + inside a transaction block. + + + + + + Examples + + + Repack the table employees: + +REPACK employees; + + + + + Repack the table employees on the basis of its + index employees_ind (Since index is used here, this is + effectively clustering): + +REPACK employees USING INDEX employees_ind; + + + + + Repack the table cases on physical ordering, + running an ANALYZE on the given columns once + repacking is done, showing informational messages: + +REPACK (ANALYZE, VERBOSE) cases (district, case_nr); + + + + + Repack all tables in the database on which you have + the MAINTAIN privilege: + +REPACK; + + + + + Repack all tables for which a clustering index has previously been + configured on which you have the MAINTAIN privilege, + showing informational messages: + +REPACK (VERBOSE) USING INDEX; + + + + + + + Compatibility + + + There is no REPACK statement in the SQL standard. + + + + + See Also + + + + + + + diff --git a/doc/src/sgml/ref/vacuum.sgml b/doc/src/sgml/ref/vacuum.sgml index 6d0fdd43cfb31..ac5d083d468e6 100644 --- a/doc/src/sgml/ref/vacuum.sgml +++ b/doc/src/sgml/ref/vacuum.sgml @@ -25,7 +25,6 @@ VACUUM [ ( option [, ...] ) ] [ where option can be one of: - FULL [ boolean ] FREEZE [ boolean ] VERBOSE [ boolean ] ANALYZE [ boolean ] @@ -39,6 +38,7 @@ VACUUM [ ( option [, ...] 
) ] [ boolean ] ONLY_DATABASE_STATS [ boolean ] BUFFER_USAGE_LIMIT size + FULL [ boolean ] and table_and_columns is: @@ -95,20 +95,6 @@ VACUUM [ ( option [, ...] ) ] [ Parameters - - FULL - - - Selects full vacuum, which can reclaim more - space, but takes much longer and exclusively locks the table. - This method also requires extra disk space, since it writes a - new copy of the table and doesn't release the old copy until - the operation is complete. Usually this should only be used when a - significant amount of space needs to be reclaimed from within the table. - - - - FREEZE @@ -362,6 +348,23 @@ VACUUM [ ( option [, ...] ) ] [ + + FULL + + + This option, which is deprecated, makes VACUUM + behave like REPACK without a + USING INDEX clause. + This method of compacting the table takes much longer than + VACUUM and exclusively locks the table. + This method also requires extra disk space, since it writes a + new copy of the table and doesn't release the old copy until + the operation is complete. Usually this should only be used when a + significant amount of space needs to be reclaimed from within the table. + + + + boolean diff --git a/doc/src/sgml/reference.sgml b/doc/src/sgml/reference.sgml index 2cf02c37b17bd..d9fdbb5d254cc 100644 --- a/doc/src/sgml/reference.sgml +++ b/doc/src/sgml/reference.sgml @@ -195,6 +195,7 @@ &refreshMaterializedView; &reindex; &releaseSavepoint; + &repack; &reset; &revoke; &rollback; diff --git a/meson.build b/meson.build index 2df54409ca6b1..d22ccf8f2383e 100644 --- a/meson.build +++ b/meson.build @@ -2121,7 +2121,8 @@ elif cc.links(''' endif -# Check for __get_cpuid_count() and __cpuidex() in a similar fashion. +# Check for __get_cpuid_count() and __cpuidex() separately, since we sometimes +# need __cpuidex() even if __get_cpuid_count() is available. 
if cc.links(''' #include int main(int arg, char **argv) @@ -2132,11 +2133,16 @@ if cc.links(''' ''', name: '__get_cpuid_count', args: test_c_args) cdata.set('HAVE__GET_CPUID_COUNT', 1) -elif cc.links(''' +endif +if cc.links(''' + #ifdef _MSC_VER #include + #else + #include + #endif int main(int arg, char **argv) { - unsigned int exx[4] = {0, 0, 0, 0}; + int exx[4] = {0, 0, 0, 0}; __cpuidex(exx, 7, 0); } ''', name: '__cpuidex', diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 146ee97a47dc4..1909c3254b5ba 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -1689,9 +1689,6 @@ initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap, state->bs_leader = NULL; state->bs_worker_id = 0; state->bs_sortstate = NULL; - state->bs_context = CurrentMemoryContext; - state->bs_emptyTuple = NULL; - state->bs_emptyTupleLen = 0; /* Remember the memory context to use for an empty tuple, if needed. */ state->bs_context = CurrentMemoryContext; diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index b64ccf5e912cd..4d7c100d73781 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -64,11 +64,7 @@ gistkillitems(IndexScanDesc scan) * safe. */ if (BufferGetLSNAtomic(buffer) != so->curPageLSN) - { - UnlockReleaseBuffer(buffer); - so->numKilled = 0; /* reset counter */ - return; - } + goto unlock; Assert(GistPageIsLeaf(page)); @@ -78,6 +74,17 @@ gistkillitems(IndexScanDesc scan) */ for (i = 0; i < so->numKilled; i++) { + if (!killedsomething) + { + /* + * Use the hint bit infrastructure to check if we can update the + * page while just holding a share lock. If we are not allowed, + * there's no point continuing. 
+ */ + if (!BufferBeginSetHintBits(buffer)) + goto unlock; + } + offnum = so->killedItems[i]; iid = PageGetItemId(page, offnum); ItemIdMarkDead(iid); @@ -87,9 +94,10 @@ gistkillitems(IndexScanDesc scan) if (killedsomething) { GistMarkPageHasGarbage(page); - MarkBufferDirtyHint(buffer, true); + BufferFinishSetHintBits(buffer, true, true); } +unlock: UnlockReleaseBuffer(buffer); /* diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index cf7f0b9017631..3e16119d0276c 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -593,6 +593,17 @@ _hash_kill_items(IndexScanDesc scan) if (ItemPointerEquals(&ituple->t_tid, &currItem->heapTid)) { + if (!killedsomething) + { + /* + * Use the hint bit infrastructure to check if we can + * update the page while just holding a share lock. If we + * are not allowed, there's no point continuing. + */ + if (!BufferBeginSetHintBits(so->currPos.buf)) + goto unlock_page; + } + /* found the item */ ItemIdMarkDead(iid); killedsomething = true; @@ -610,9 +621,10 @@ _hash_kill_items(IndexScanDesc scan) if (killedsomething) { opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES; - MarkBufferDirtyHint(buf, true); + BufferFinishSetHintBits(so->currPos.buf, true, true); } +unlock_page: if (so->hashso_bucket_buf == so->currPos.buf || havePin) LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index a231563f0dfec..8f1c11a93500d 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -6613,11 +6613,11 @@ heap_inplace_update_and_unlock(Relation relation, /*---------- * NO EREPORT(ERROR) from here till changes are complete * - * Our buffer lock won't stop a reader having already pinned and checked - * visibility for this tuple. Hence, we write WAL first, then mutate the - * buffer. 
Like in MarkBufferDirtyHint() or RecordTransactionCommit(), - * checkpoint delay makes that acceptable. With the usual order of - * changes, a crash after memcpy() and before XLogInsert() could allow + * Our exclusive buffer lock won't stop a reader having already pinned and + * checked visibility for this tuple. With the usual order of changes + * (i.e. updating the buffer contents before WAL logging), a reader could + * observe our not-yet-persistent update to relfrozenxid and update + * datfrozenxid based on that. A crash in that moment could allow * datfrozenxid to overtake relfrozenxid: * * ["D" is a VACUUM (ONLY_DATABASE_STATS)] @@ -6629,21 +6629,15 @@ heap_inplace_update_and_unlock(Relation relation, * [crash] * [recovery restores datfrozenxid w/o relfrozenxid] * - * Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint(). - * Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack. - * The stack copy facilitates a FPI of the post-mutation block before we - * accept other sessions seeing it. DELAY_CHKPT_START allows us to - * XLogInsert() before MarkBufferDirty(). Since XLogSaveBufferForHint() - * can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START. - * This function, however, likely could avoid it with the following order - * of operations: MarkBufferDirty(), XLogInsert(), memcpy(). Opt to use - * DELAY_CHKPT_START here, too, as a way to have fewer distinct code - * patterns to analyze. Inplace update isn't so frequent that it should - * pursue the small optimization of skipping DELAY_CHKPT_START. - */ - Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + * We avoid that by using a temporary copy of the buffer to hide our + * change from other backends until the change has been WAL-logged. We + * apply our change to the temporary copy and WAL-log it, before modifying + * the real page. That way any action a reader of the in-place-updated + * value takes will be WAL logged after this change. 
+ */ START_CRIT_SECTION(); - MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + MarkBufferDirty(buffer); /* XLOG stuff */ if (RelationNeedsWAL(relation)) @@ -6692,8 +6686,6 @@ heap_inplace_update_and_unlock(Relation relation, memcpy(dst, src, newlen); - MarkBufferDirty(buffer); - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); /* @@ -6702,7 +6694,6 @@ heap_inplace_update_and_unlock(Relation relation, */ AtInplace_Inval(); - MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; END_CRIT_SECTION(); UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock); @@ -7098,6 +7089,12 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, * process this tuple as part of freezing its page, and return true. Return * false if nothing can be changed about the tuple right now. * + * FreezePageConflictXid is advanced only for xmin/xvac freezing, not for xmax + * changes. We only remove xmax state here when it is lock-only, or when the + * updater XID (including an updater member of a MultiXact) must be aborted; + * otherwise, the tuple would already be removable. Neither case affects + * visibility on a standby. + * * Also sets *totally_frozen to true if the tuple will be totally frozen once * caller executes returned freeze plan (or if the tuple was already totally * frozen by an earlier VACUUM). 
This indicates that there are no remaining @@ -7173,7 +7170,11 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple, /* Verify that xmin committed if and when freeze plan is executed */ if (freeze_xmin) + { frz->checkflags |= HEAP_FREEZE_CHECK_XMIN_COMMITTED; + if (TransactionIdFollows(xid, pagefrz->FreezePageConflictXid)) + pagefrz->FreezePageConflictXid = xid; + } } /* @@ -7192,6 +7193,9 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple, */ replace_xvac = pagefrz->freeze_required = true; + if (TransactionIdFollows(xid, pagefrz->FreezePageConflictXid)) + pagefrz->FreezePageConflictXid = xid; + /* Will set replace_xvac flags in freeze plan below */ } @@ -7501,6 +7505,7 @@ heap_freeze_tuple(HeapTupleHeader tuple, pagefrz.freeze_required = true; pagefrz.FreezePageRelfrozenXid = FreezeLimit; pagefrz.FreezePageRelminMxid = MultiXactCutoff; + pagefrz.FreezePageConflictXid = InvalidTransactionId; pagefrz.NoFreezePageRelfrozenXid = FreezeLimit; pagefrz.NoFreezePageRelminMxid = MultiXactCutoff; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 3ff36f59bf869..5137d2510ea4c 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -741,13 +741,13 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, if (OldIndex != NULL && !use_sort) { const int ci_index[] = { - PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_INDEX_RELID + PROGRESS_REPACK_PHASE, + PROGRESS_REPACK_INDEX_RELID }; int64 ci_val[2]; /* Set phase and OIDOldIndex to columns */ - ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP; + ci_val[0] = PROGRESS_REPACK_PHASE_INDEX_SCAN_HEAP; ci_val[1] = RelationGetRelid(OldIndex); pgstat_progress_update_multi_param(2, ci_index, ci_val); @@ -759,15 +759,15 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, else { /* In scan-and-sort mode and also VACUUM FULL, set phase */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - 
PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP); + pgstat_progress_update_param(PROGRESS_REPACK_PHASE, + PROGRESS_REPACK_PHASE_SEQ_SCAN_HEAP); tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL); heapScan = (HeapScanDesc) tableScan; indexScan = NULL; /* Set total heap blocks */ - pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS, + pgstat_progress_update_param(PROGRESS_REPACK_TOTAL_HEAP_BLKS, heapScan->rs_nblocks); } @@ -809,7 +809,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, * is manually updated to the correct value when the table * scan finishes. */ - pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED, + pgstat_progress_update_param(PROGRESS_REPACK_HEAP_BLKS_SCANNED, heapScan->rs_nblocks); break; } @@ -825,7 +825,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, */ if (prev_cblock != heapScan->rs_cblock) { - pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED, + pgstat_progress_update_param(PROGRESS_REPACK_HEAP_BLKS_SCANNED, (heapScan->rs_cblock + heapScan->rs_nblocks - heapScan->rs_startblock @@ -926,14 +926,14 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, * In scan-and-sort mode, report increase in number of tuples * scanned */ - pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, + pgstat_progress_update_param(PROGRESS_REPACK_HEAP_TUPLES_SCANNED, *num_tuples); } else { const int ct_index[] = { - PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, - PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN + PROGRESS_REPACK_HEAP_TUPLES_SCANNED, + PROGRESS_REPACK_HEAP_TUPLES_WRITTEN }; int64 ct_val[2]; @@ -966,14 +966,14 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, double n_tuples = 0; /* Report that we are now sorting tuples */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_PHASE_SORT_TUPLES); + pgstat_progress_update_param(PROGRESS_REPACK_PHASE, + PROGRESS_REPACK_PHASE_SORT_TUPLES); 
tuplesort_performsort(tuplesort); /* Report that we are now writing new heap */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP); + pgstat_progress_update_param(PROGRESS_REPACK_PHASE, + PROGRESS_REPACK_PHASE_WRITE_NEW_HEAP); for (;;) { @@ -991,7 +991,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, values, isnull, rwstate); /* Report n_tuples */ - pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN, + pgstat_progress_update_param(PROGRESS_REPACK_HEAP_TUPLES_WRITTEN, n_tuples); } diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index 75ae268d753c2..fc64f4343ce02 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -80,10 +80,38 @@ /* - * SetHintBits() + * To be allowed to set hint bits, SetHintBits() needs to call + * BufferBeginSetHintBits(). However, that's not free, and some callsites call + * SetHintBits() on many tuples in a row. For those it makes sense to amortize + * the cost of BufferBeginSetHintBits(). Additionally it's desirable to defer + * the cost of BufferBeginSetHintBits() until a hint bit needs to actually be + * set. This enum serves as the necessary state space passed to + * SetHintBitsExt(). + */ +typedef enum SetHintBitsState +{ + /* not yet checked if hint bits may be set */ + SHB_INITIAL, + /* failed to get permission to set hint bits, don't check again */ + SHB_DISABLED, + /* allowed to set hint bits */ + SHB_ENABLED, +} SetHintBitsState; + +/* + * SetHintBitsExt() * * Set commit/abort hint bits on a tuple, if appropriate at this time. * + * To be allowed to set a hint bit on a tuple, the page must not be undergoing + * IO at this time (otherwise we e.g. could corrupt PG's page checksum or even + * the filesystem's, as is known to happen with btrfs). 
+ * + * The right to set a hint bit can be acquired on a page level with + * BufferBeginSetHintBits(). Only a single backend gets the right to set hint + * bits at a time. Alternatively, if called with a NULL SetHintBitsState*, + * hint bits are set with BufferSetHintBits16(). + * * It is only safe to set a transaction-committed hint bit if we know the * transaction's commit record is guaranteed to be flushed to disk before the * buffer, or if the table is temporary or unlogged and will be obliterated by @@ -111,24 +139,67 @@ * InvalidTransactionId if no check is needed. */ static inline void -SetHintBits(HeapTupleHeader tuple, Buffer buffer, - uint16 infomask, TransactionId xid) +SetHintBitsExt(HeapTupleHeader tuple, Buffer buffer, + uint16 infomask, TransactionId xid, SetHintBitsState *state) { + /* + * In batched mode, if we previously did not get permission to set hint + * bits, don't try again - in all likelihood IO is still going on. + */ + if (state && *state == SHB_DISABLED) + return; + if (TransactionIdIsValid(xid)) { - /* NB: xid must be known committed here! */ - XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid); + if (BufferIsPermanent(buffer)) + { + /* NB: xid must be known committed here! */ + XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid); + + if (XLogNeedsFlush(commitLSN) && + BufferGetLSNAtomic(buffer) < commitLSN) + { + /* not flushed and no LSN interlock, so don't set hint */ + return; + } + } + } + + /* + * If we're not operating in batch mode, use BufferSetHintBits16() to mark + * the page dirty, that's cheaper than + * BufferBeginSetHintBits()/BufferFinishSetHintBits(). That's important + * for cases where we set a lot of hint bits on a page individually. 
+ */ + if (!state) + { + BufferSetHintBits16(&tuple->t_infomask, + tuple->t_infomask | infomask, buffer); + return; + } - if (BufferIsPermanent(buffer) && XLogNeedsFlush(commitLSN) && - BufferGetLSNAtomic(buffer) < commitLSN) + if (*state == SHB_INITIAL) + { + if (!BufferBeginSetHintBits(buffer)) { - /* not flushed and no LSN interlock, so don't set hint */ + *state = SHB_DISABLED; return; } - } + *state = SHB_ENABLED; + } tuple->t_infomask |= infomask; - MarkBufferDirtyHint(buffer, true); +} + +/* + * Simple wrapper around SetHintBitExt(), use when operating on a single + * tuple. + */ +static inline void +SetHintBits(HeapTupleHeader tuple, Buffer buffer, + uint16 infomask, TransactionId xid) +{ + SetHintBitsExt(tuple, buffer, infomask, xid, NULL); } /* @@ -864,9 +935,9 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, * inserting/deleting transaction was still running --- which was more cycles * and more contention on ProcArrayLock. */ -static bool +static inline bool HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, - Buffer buffer) + Buffer buffer, SetHintBitsState *state) { HeapTupleHeader tuple = htup->t_data; @@ -921,8 +992,8 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) { /* deleting subtransaction must have aborted */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); + SetHintBitsExt(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId, state); return true; } @@ -934,13 +1005,13 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, else if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot)) return false; else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); + SetHintBitsExt(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple), state); else { /* it must have aborted or crashed */ - 
SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); + SetHintBitsExt(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId, state); return false; } } @@ -1003,14 +1074,14 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) { /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); + SetHintBitsExt(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId, state); return true; } /* xmax transaction committed */ - SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); + SetHintBitsExt(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple), state); } else { @@ -1607,9 +1678,10 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, * ->vistuples_dense is set to contain the offsets of visible tuples. * * The reason this is more efficient than HeapTupleSatisfiesMVCC() is that it - * avoids a cross-translation-unit function call for each tuple and allows the - * compiler to optimize across calls to HeapTupleSatisfiesMVCC. In the future - * it will also allow more efficient setting of hint bits. + * avoids a cross-translation-unit function call for each tuple, allows the + * compiler to optimize across calls to HeapTupleSatisfiesMVCC and allows + * setting hint bits more efficiently (see the one BufferFinishSetHintBits() + * call below). * * Returns the number of visible tuples. 
*/ @@ -1620,6 +1692,7 @@ HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer, OffsetNumber *vistuples_dense) { int nvis = 0; + SetHintBitsState state = SHB_INITIAL; Assert(IsMVCCSnapshot(snapshot)); @@ -1628,7 +1701,7 @@ HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer, bool valid; HeapTuple tup = &batchmvcc->tuples[i]; - valid = HeapTupleSatisfiesMVCC(tup, snapshot, buffer); + valid = HeapTupleSatisfiesMVCC(tup, snapshot, buffer, &state); batchmvcc->visible[i] = valid; if (likely(valid)) @@ -1638,6 +1711,9 @@ HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer, } } + if (state == SHB_ENABLED) + BufferFinishSetHintBits(buffer, true, true); + return nvis; } @@ -1657,7 +1733,7 @@ HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer) switch (snapshot->snapshot_type) { case SNAPSHOT_MVCC: - return HeapTupleSatisfiesMVCC(htup, snapshot, buffer); + return HeapTupleSatisfiesMVCC(htup, snapshot, buffer, NULL); case SNAPSHOT_SELF: return HeapTupleSatisfiesSelf(htup, snapshot, buffer); case SNAPSHOT_ANY: diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 65c9f393f41a9..6beeb6956e3e2 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -114,13 +114,6 @@ typedef struct */ HeapPageFreeze pagefrz; - /* - * The snapshot conflict horizon used when freezing tuples. The final - * snapshot conflict horizon for the record may be newer if pruning - * removes newer transaction IDs. 
- */ - TransactionId frz_conflict_horizon; - /*------------------------------------------------------- * Information about what was done * @@ -377,6 +370,7 @@ prune_freeze_setup(PruneFreezeParams *params, /* initialize page freezing working state */ prstate->pagefrz.freeze_required = false; + prstate->pagefrz.FreezePageConflictXid = InvalidTransactionId; if (prstate->attempt_freeze) { Assert(new_relfrozen_xid && new_relmin_mxid); @@ -407,7 +401,6 @@ prune_freeze_setup(PruneFreezeParams *params, * PruneState. */ prstate->deadoffsets = presult->deadoffsets; - prstate->frz_conflict_horizon = InvalidTransactionId; /* * Vacuum may update the VM after we're done. We can keep track of @@ -746,22 +739,8 @@ heap_page_will_freeze(bool did_tuple_hint_fpi, * critical section. */ heap_pre_freeze_checks(prstate->buffer, prstate->frozen, prstate->nfrozen); - - /* - * Calculate what the snapshot conflict horizon should be for a record - * freezing tuples. We can use the visibility_cutoff_xid as our cutoff - * for conflicts when the whole page is eligible to become all-frozen - * in the VM once we're done with it. Otherwise, we generate a - * conservative cutoff by stepping back from OldestXmin. - */ - if (prstate->set_all_frozen) - prstate->frz_conflict_horizon = prstate->visibility_cutoff_xid; - else - { - /* Avoids false conflicts when hot_standby_feedback in use */ - prstate->frz_conflict_horizon = prstate->cutoffs->OldestXmin; - TransactionIdRetreat(prstate->frz_conflict_horizon); - } + Assert(TransactionIdPrecedes(prstate->pagefrz.FreezePageConflictXid, + prstate->cutoffs->OldestXmin)); } else if (prstate->nfrozen > 0) { @@ -952,18 +931,18 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, /* * The snapshotConflictHorizon for the whole record should be the * most conservative of all the horizons calculated for any of the - * possible modifications. 
If this record will prune tuples, any - * transactions on the standby older than the youngest xmax of the - * most recently removed tuple this record will prune will - * conflict. If this record will freeze tuples, any transactions - * on the standby with xids older than the youngest tuple this - * record will freeze will conflict. + * possible modifications. If this record will prune tuples, any + * queries on the standby older than the newest xid of the most + * recently removed tuple this record will prune will conflict. If + * this record will freeze tuples, any queries on the standby with + * xids older than the newest tuple this record will freeze will + * conflict. */ TransactionId conflict_xid; - if (TransactionIdFollows(prstate.frz_conflict_horizon, + if (TransactionIdFollows(prstate.pagefrz.FreezePageConflictXid, prstate.latest_xid_removed)) - conflict_xid = prstate.frz_conflict_horizon; + conflict_xid = prstate.pagefrz.FreezePageConflictXid; else conflict_xid = prstate.latest_xid_removed; diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 3047bd46def96..e21b96281a637 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -116,6 +116,8 @@ /* Mapping from heap block number to the right bit in the visibility map */ #define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE) +#define HEAPBLK_TO_MAPBLOCK_LIMIT(x) \ + (((x) + HEAPBLOCKS_PER_PAGE - 1) / HEAPBLOCKS_PER_PAGE) #define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE) #define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK) @@ -600,6 +602,21 @@ visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks) return newnblocks; } +/* + * visibilitymap_truncation_length - + * compute truncation length for visibility map + * + * Given a proposed truncation length for the main fork, compute the + * correct truncation length for the visibility map. 
Should return the + * same answer as visibilitymap_prepare_truncate(), but without modifying + * anything. + */ +BlockNumber +visibilitymap_truncation_length(BlockNumber nheapblocks) +{ + return HEAPBLK_TO_MAPBLOCK_LIMIT(nheapblocks); +} + /* * Read a visibility map page. * diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index d17aaa5aa0fb8..796e1513ddf96 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -681,20 +681,31 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, { /* * The conflicting tuple (or all HOT chains pointed to by - * all posting list TIDs) is dead to everyone, so mark the - * index entry killed. + * all posting list TIDs) is dead to everyone, so try to + * mark the index entry killed. It's ok if we're not + * allowed to, this isn't required for correctness. */ - ItemIdMarkDead(curitemid); - opaque->btpo_flags |= BTP_HAS_GARBAGE; + Buffer buf; - /* - * Mark buffer with a dirty hint, since state is not - * crucial. Be sure to mark the proper buffer dirty. - */ + /* Be sure to operate on the proper buffer */ if (nbuf != InvalidBuffer) - MarkBufferDirtyHint(nbuf, true); + buf = nbuf; else - MarkBufferDirtyHint(insertstate->buf, true); + buf = insertstate->buf; + + /* + * Use the hint bit infrastructure to check if we can + * update the page while just holding a share lock. + * + * Can't use BufferSetHintBits16() here as we update two + * different locations. 
+ */ + if (BufferBeginSetHintBits(buf)) + { + ItemIdMarkDead(curitemid); + opaque->btpo_flags |= BTP_HAS_GARBAGE; + BufferFinishSetHintBits(buf, true, true); + } } /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 5c50f0dd1bd91..f14ff95cb2b76 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -361,6 +361,17 @@ _bt_killitems(IndexScanDesc scan) */ if (killtuple && !ItemIdIsDead(iid)) { + if (!killedsomething) + { + /* + * Use the hint bit infrastructure to check if we can + * update the page while just holding a share lock. If we + * are not allowed, there's no point continuing. + */ + if (!BufferBeginSetHintBits(buf)) + goto unlock_page; + } + /* found the item/all posting list items */ ItemIdMarkDead(iid); killedsomething = true; @@ -380,9 +391,10 @@ _bt_killitems(IndexScanDesc scan) if (killedsomething) { opaque->btpo_flags |= BTP_HAS_GARBAGE; - MarkBufferDirtyHint(buf, true); + BufferFinishSetHintBits(buf, true, true); } +unlock_page: if (!so->dropPin) _bt_unlockbuf(rel, buf); else diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index a9a1678acc97a..03c85dada710b 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -1077,11 +1077,6 @@ XLogCheckBufferNeedsBackup(Buffer buffer) * We only need to do something if page has not yet been full page written in * this checkpoint round. The LSN of the inserted wal record is returned if we * had to write, InvalidXLogRecPtr otherwise. - * - * It is possible that multiple concurrent backends could attempt to write WAL - * records. In that case, multiple copies of the same block would be recorded - * in separate WAL records by different backends, though that is still OK from - * a correctness perspective. 
*/ XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std) @@ -1102,11 +1097,9 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) /* * We assume page LSN is first data on *every* page that can be passed to - * XLogInsert, whether it has the standard page layout or not. Since we're - * only holding a share-lock on the page, we must take the buffer header - * lock when we look at the LSN. + * XLogInsert, whether it has the standard page layout or not. */ - lsn = BufferGetLSNAtomic(buffer); + lsn = PageGetLSN(BufferGetPage(buffer)); if (lsn <= RedoRecPtr) { diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index d55a534b13883..6d2c4a86b9600 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -735,7 +735,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, * can't read the last checkpoint because this allows us to * simplify processing around checkpoints. 
*/ - ereport(PANIC, + ereport(FATAL, errmsg("could not locate a valid checkpoint record at %X/%08X", LSN_FORMAT_ARGS(CheckPointLoc))); } diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 43de42ce39e28..5ee6389d39c47 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -4077,7 +4077,7 @@ reindex_relation(const ReindexStmt *stmt, Oid relid, int flags, Assert(!ReindexIsProcessingIndex(indexOid)); /* Set index rebuild count */ - pgstat_progress_update_param(PROGRESS_CLUSTER_INDEX_REBUILD_COUNT, + pgstat_progress_update_param(PROGRESS_REPACK_INDEX_REBUILD_COUNT, i); i++; } diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index ecb7c996e8646..339c016e510c7 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1311,14 +1311,15 @@ CREATE VIEW pg_stat_progress_vacuum AS FROM pg_stat_get_progress_info('VACUUM') AS S LEFT JOIN pg_database D ON S.datid = D.oid; -CREATE VIEW pg_stat_progress_cluster AS +CREATE VIEW pg_stat_progress_repack AS SELECT S.pid AS pid, S.datid AS datid, D.datname AS datname, S.relid AS relid, CASE S.param1 WHEN 1 THEN 'CLUSTER' - WHEN 2 THEN 'VACUUM FULL' + WHEN 2 THEN 'REPACK' + WHEN 3 THEN 'VACUUM FULL' END AS command, CASE S.param2 WHEN 0 THEN 'initializing' WHEN 1 THEN 'seq scanning heap' @@ -1329,15 +1330,35 @@ CREATE VIEW pg_stat_progress_cluster AS WHEN 6 THEN 'rebuilding index' WHEN 7 THEN 'performing final cleanup' END AS phase, - CAST(S.param3 AS oid) AS cluster_index_relid, + CAST(S.param3 AS oid) AS repack_index_relid, S.param4 AS heap_tuples_scanned, S.param5 AS heap_tuples_written, S.param6 AS heap_blks_total, S.param7 AS heap_blks_scanned, S.param8 AS index_rebuild_count - FROM pg_stat_get_progress_info('CLUSTER') AS S + FROM pg_stat_get_progress_info('REPACK') AS S LEFT JOIN pg_database D ON S.datid = D.oid; +-- This view is as the one above, except for renaming a column and avoiding +-- 'REPACK' as a 
command name to report. +CREATE VIEW pg_stat_progress_cluster AS + SELECT + pid, + datid, + datname, + relid, + CASE WHEN command IN ('CLUSTER', 'VACUUM FULL') THEN command + WHEN repack_index_relid = 0 THEN 'VACUUM FULL' + ELSE 'CLUSTER' END AS command, + phase, + repack_index_relid AS cluster_index_relid, + heap_tuples_scanned, + heap_tuples_written, + heap_blks_total, + heap_blks_scanned, + index_rebuild_count + FROM pg_stat_progress_repack; + CREATE VIEW pg_stat_progress_create_index AS SELECT S.pid AS pid, S.datid AS datid, D.datname AS datname, diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 60a4617a5853f..3bfaa6636997e 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -1,9 +1,8 @@ /*------------------------------------------------------------------------- * * cluster.c - * CLUSTER a table on an index. This is now also used for VACUUM FULL. - * - * There is hardly anything left of Paul Brown's original implementation... + * REPACK a table; formerly known as CLUSTER. VACUUM FULL also uses + * parts of this code. 
* * * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group @@ -67,27 +66,35 @@ typedef struct Oid indexOid; } RelToCluster; - -static void cluster_multiple_rels(List *rtcs, ClusterParams *params); +static bool cluster_rel_recheck(RepackCommand cmd, Relation OldHeap, + Oid indexOid, Oid userid, int options); static void rebuild_relation(Relation OldHeap, Relation index, bool verbose); static void copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verbose, bool *pSwapToastByContent, TransactionId *pFreezeXid, MultiXactId *pCutoffMulti); -static List *get_tables_to_cluster(MemoryContext cluster_context); -static List *get_tables_to_cluster_partitioned(MemoryContext cluster_context, - Oid indexOid); -static bool cluster_is_permitted_for_relation(Oid relid, Oid userid); +static List *get_tables_to_repack(RepackCommand cmd, bool usingindex, + MemoryContext permcxt); +static List *get_tables_to_repack_partitioned(RepackCommand cmd, + Oid relid, bool rel_is_index, + MemoryContext permcxt); +static bool repack_is_permitted_for_relation(RepackCommand cmd, + Oid relid, Oid userid); +static Relation process_single_relation(RepackStmt *stmt, + ClusterParams *params); +static Oid determine_clustered_index(Relation rel, bool usingindex, + const char *indexname); +static const char *RepackCommandAsString(RepackCommand cmd); -/*--------------------------------------------------------------------------- - * This cluster code allows for clustering multiple tables at once. Because +/* + * The repack code allows for processing multiple tables at once. Because * of this, we cannot just run everything on a single transaction, or we * would be forced to acquire exclusive locks on all the tables being * clustered, simultaneously --- very likely leading to deadlock. * - * To solve this we follow a similar strategy to VACUUM code, - * clustering each relation in a separate transaction. 
For this to work, - * we need to: + * To solve this we follow a similar strategy to VACUUM code, processing each + * relation in a separate transaction. For this to work, we need to: + * * - provide a separate memory context so that we can pass information in * a way that survives across transactions * - start a new transaction every time a new relation is clustered @@ -98,197 +105,177 @@ static bool cluster_is_permitted_for_relation(Oid relid, Oid userid); * * The single-relation case does not have any such overhead. * - * We also allow a relation to be specified without index. In that case, - * the indisclustered bit will be looked up, and an ERROR will be thrown - * if there is no index with the bit set. - *--------------------------------------------------------------------------- + * We also allow a relation to be repacked following an index, but without + * naming a specific one. In that case, the indisclustered bit will be + * looked up, and an ERROR will be thrown if no so-marked index is found. */ void -cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel) +ExecRepack(ParseState *pstate, RepackStmt *stmt, bool isTopLevel) { - ListCell *lc; ClusterParams params = {0}; - bool verbose = false; Relation rel = NULL; - Oid indexOid = InvalidOid; - MemoryContext cluster_context; + MemoryContext repack_context; List *rtcs; /* Parse option list */ - foreach(lc, stmt->params) + foreach_node(DefElem, opt, stmt->params) { - DefElem *opt = (DefElem *) lfirst(lc); - if (strcmp(opt->defname, "verbose") == 0) - verbose = defGetBoolean(opt); + params.options |= defGetBoolean(opt) ? CLUOPT_VERBOSE : 0; + else if (strcmp(opt->defname, "analyze") == 0 || + strcmp(opt->defname, "analyse") == 0) + params.options |= defGetBoolean(opt) ? 
CLUOPT_ANALYZE : 0; else ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("unrecognized %s option \"%s\"", - "CLUSTER", opt->defname), - parser_errposition(pstate, opt->location))); + errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized %s option \"%s\"", + RepackCommandAsString(stmt->command), + opt->defname), + parser_errposition(pstate, opt->location)); } - params.options = (verbose ? CLUOPT_VERBOSE : 0); - + /* + * If a single relation is specified, process it and we're done ... unless + * the relation is a partitioned table, in which case we fall through. + */ if (stmt->relation != NULL) { - /* This is the single-relation case. */ - Oid tableOid; - - /* - * Find, lock, and check permissions on the table. We obtain - * AccessExclusiveLock right away to avoid lock-upgrade hazard in the - * single-transaction case. - */ - tableOid = RangeVarGetRelidExtended(stmt->relation, - AccessExclusiveLock, - 0, - RangeVarCallbackMaintainsTable, - NULL); - rel = table_open(tableOid, NoLock); - - /* - * Reject clustering a remote temp table ... their local buffer - * manager is not going to cope. - */ - if (RELATION_IS_OTHER_TEMP(rel)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot cluster temporary tables of other sessions"))); - - if (stmt->indexname == NULL) - { - ListCell *index; - - /* We need to find the index that has indisclustered set. */ - foreach(index, RelationGetIndexList(rel)) - { - indexOid = lfirst_oid(index); - if (get_index_isclustered(indexOid)) - break; - indexOid = InvalidOid; - } - - if (!OidIsValid(indexOid)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("there is no previously clustered index for table \"%s\"", - stmt->relation->relname))); - } - else - { - /* - * The index is expected to be in the same namespace as the - * relation. 
- */ - indexOid = get_relname_relid(stmt->indexname, - rel->rd_rel->relnamespace); - if (!OidIsValid(indexOid)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("index \"%s\" for table \"%s\" does not exist", - stmt->indexname, stmt->relation->relname))); - } - - /* For non-partitioned tables, do what we came here to do. */ - if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) - { - cluster_rel(rel, indexOid, ¶ms); - /* cluster_rel closes the relation, but keeps lock */ - - return; - } + rel = process_single_relation(stmt, ¶ms); + if (rel == NULL) + return; /* all done */ } + /* + * Don't allow ANALYZE in the multiple-relation case for now. Maybe we + * can add support for this later. + */ + if (params.options & CLUOPT_ANALYZE) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot execute %s on multiple tables", + "REPACK (ANALYZE)")); + /* * By here, we know we are in a multi-table situation. In order to avoid * holding locks for too long, we want to process each table in its own * transaction. This forces us to disallow running inside a user * transaction block. */ - PreventInTransactionBlock(isTopLevel, "CLUSTER"); + PreventInTransactionBlock(isTopLevel, RepackCommandAsString(stmt->command)); /* Also, we need a memory context to hold our list of relations */ - cluster_context = AllocSetContextCreate(PortalContext, - "Cluster", - ALLOCSET_DEFAULT_SIZES); + repack_context = AllocSetContextCreate(PortalContext, + "Repack", + ALLOCSET_DEFAULT_SIZES); + + params.options |= CLUOPT_RECHECK; /* - * Either we're processing a partitioned table, or we were not given any - * table name at all. In either case, obtain a list of relations to - * process. - * - * In the former case, an index name must have been given, so we don't - * need to recheck its "indisclustered" bit, but we have to check that it - * is an index that we can cluster on. In the latter case, we set the - * option bit to have indisclustered verified. 
- * - * Rechecking the relation itself is necessary here in all cases. + * If we don't have a relation yet, determine a relation list. If we do, + * then it must be a partitioned table, and we want to process its + * partitions. */ - params.options |= CLUOPT_RECHECK; - if (rel != NULL) + if (rel == NULL) { - Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); - check_index_is_clusterable(rel, indexOid, AccessShareLock); - rtcs = get_tables_to_cluster_partitioned(cluster_context, indexOid); - - /* close relation, releasing lock on parent table */ - table_close(rel, AccessExclusiveLock); + Assert(stmt->indexname == NULL); + rtcs = get_tables_to_repack(stmt->command, stmt->usingindex, + repack_context); + params.options |= CLUOPT_RECHECK_ISCLUSTERED; } else { - rtcs = get_tables_to_cluster(cluster_context); - params.options |= CLUOPT_RECHECK_ISCLUSTERED; - } + Oid relid; + bool rel_is_index; - /* Do the job. */ - cluster_multiple_rels(rtcs, &params); + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); - /* Start a new transaction for the cleanup work. */ - StartTransactionCommand(); + /* + * If USING INDEX was specified, resolve the index name now and pass + * it down. + */ + if (stmt->usingindex) + { + /* + * If no index name was specified when repacking a partitioned + * table, punt for now. Maybe we can improve this later. + */ + if (!stmt->indexname) + { + if (stmt->command == REPACK_COMMAND_CLUSTER) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("there is no previously clustered index for table \"%s\"", + RelationGetRelationName(rel))); + else + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + /*- translator: first %s is name of a SQL command, eg. 
REPACK */ + errmsg("cannot execute %s on partitioned table \"%s\" USING INDEX with no index name", + RepackCommandAsString(stmt->command), + RelationGetRelationName(rel))); + } - /* Clean up working storage */ - MemoryContextDelete(cluster_context); -} + relid = determine_clustered_index(rel, stmt->usingindex, + stmt->indexname); + if (!OidIsValid(relid)) + elog(ERROR, "unable to determine index to cluster on"); + check_index_is_clusterable(rel, relid, AccessExclusiveLock); -/* - * Given a list of relations to cluster, process each of them in a separate - * transaction. - * - * We expect to be in a transaction at start, but there isn't one when we - * return. - */ -static void -cluster_multiple_rels(List *rtcs, ClusterParams *params) -{ - ListCell *lc; + rel_is_index = true; + } + else + { + relid = RelationGetRelid(rel); + rel_is_index = false; + } + + rtcs = get_tables_to_repack_partitioned(stmt->command, + relid, rel_is_index, + repack_context); + + /* close parent relation, releasing lock on it */ + table_close(rel, AccessExclusiveLock); + rel = NULL; + } /* Commit to get out of starting transaction */ PopActiveSnapshot(); CommitTransactionCommand(); /* Cluster the tables, each in a separate transaction */ - foreach(lc, rtcs) + Assert(rel == NULL); + foreach_ptr(RelToCluster, rtc, rtcs) { - RelToCluster *rtc = (RelToCluster *) lfirst(lc); - Relation rel; - /* Start a new transaction for each relation. */ StartTransactionCommand(); + /* + * Open the target table, coping with the case where it has been + * dropped. 
+ */ + rel = try_table_open(rtc->tableOid, AccessExclusiveLock); + if (rel == NULL) + { + CommitTransactionCommand(); + continue; + } + /* functions in indexes may want a snapshot set */ PushActiveSnapshot(GetTransactionSnapshot()); - rel = table_open(rtc->tableOid, AccessExclusiveLock); - /* Process this table */ - cluster_rel(rel, rtc->indexOid, params); + cluster_rel(stmt->command, rel, rtc->indexOid, &params); /* cluster_rel closes the relation, but keeps lock */ PopActiveSnapshot(); CommitTransactionCommand(); } + + /* Start a new transaction for the cleanup work. */ + StartTransactionCommand(); + + /* Clean up working storage */ + MemoryContextDelete(repack_context); } /* @@ -304,11 +291,14 @@ cluster_multiple_rels(List *rtcs, ClusterParams *params) * them incrementally while we load the table. * * If indexOid is InvalidOid, the table will be rewritten in physical order - * instead of index order. This is the new implementation of VACUUM FULL, - * and error messages should refer to the operation as VACUUM not CLUSTER. + * instead of index order. + * + * 'cmd' indicates which command is being executed, to be used for error + * messages. */ void -cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) +cluster_rel(RepackCommand cmd, Relation OldHeap, Oid indexOid, + ClusterParams *params) { Oid tableOid = RelationGetRelid(OldHeap); Oid save_userid; @@ -323,13 +313,8 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) /* Check for user-requested abort. 
*/ CHECK_FOR_INTERRUPTS(); - pgstat_progress_start_command(PROGRESS_COMMAND_CLUSTER, tableOid); - if (OidIsValid(indexOid)) - pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND, - PROGRESS_CLUSTER_COMMAND_CLUSTER); - else - pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND, - PROGRESS_CLUSTER_COMMAND_VACUUM_FULL); + pgstat_progress_start_command(PROGRESS_COMMAND_REPACK, tableOid); + pgstat_progress_update_param(PROGRESS_REPACK_COMMAND, cmd); /* * Switch to the table owner's userid, so that any index functions are run @@ -350,86 +335,40 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) * *must* skip the one on indisclustered since it would reject an attempt * to cluster a not-previously-clustered index. */ - if (recheck) - { - /* Check that the user still has privileges for the relation */ - if (!cluster_is_permitted_for_relation(tableOid, save_userid)) - { - relation_close(OldHeap, AccessExclusiveLock); - goto out; - } - - /* - * Silently skip a temp table for a remote session. Only doing this - * check in the "recheck" case is appropriate (which currently means - * somebody is executing a database-wide CLUSTER or on a partitioned - * table), because there is another check in cluster() which will stop - * any attempt to cluster remote temp tables by name. There is - * another check in cluster_rel which is redundant, but we leave it - * for extra safety. - */ - if (RELATION_IS_OTHER_TEMP(OldHeap)) - { - relation_close(OldHeap, AccessExclusiveLock); - goto out; - } - - if (OidIsValid(indexOid)) - { - /* - * Check that the index still exists - */ - if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid))) - { - relation_close(OldHeap, AccessExclusiveLock); - goto out; - } - - /* - * Check that the index is still the one with indisclustered set, - * if needed. 
- */ - if ((params->options & CLUOPT_RECHECK_ISCLUSTERED) != 0 && - !get_index_isclustered(indexOid)) - { - relation_close(OldHeap, AccessExclusiveLock); - goto out; - } - } - } + if (recheck && + !cluster_rel_recheck(cmd, OldHeap, indexOid, save_userid, + params->options)) + goto out; /* - * We allow VACUUM FULL, but not CLUSTER, on shared catalogs. CLUSTER - * would work in most respects, but the index would only get marked as - * indisclustered in the current database, leading to unexpected behavior - * if CLUSTER were later invoked in another database. + * We allow repacking shared catalogs only when not using an index. It + * would work to use an index in most respects, but the index would only + * get marked as indisclustered in the current database, leading to + * unexpected behavior if CLUSTER were later invoked in another database. */ if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared) ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot cluster a shared catalog"))); + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /*- translator: first %s is name of a SQL command, eg. REPACK */ + errmsg("cannot execute %s on a shared catalog", + RepackCommandAsString(cmd))); /* * Don't process temp tables of other backends ... their local buffer * manager is not going to cope. */ if (RELATION_IS_OTHER_TEMP(OldHeap)) - { - if (OidIsValid(indexOid)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot cluster temporary tables of other sessions"))); - else - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot vacuum temporary tables of other sessions"))); - } + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /*- translator: first %s is name of a SQL command, eg. 
REPACK */ + errmsg("cannot execute %s on temporary tables of other sessions", + RepackCommandAsString(cmd))); /* * Also check for active uses of the relation in the current transaction, * including open scans and pending AFTER trigger events. */ - CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM"); + CheckTableNotInUse(OldHeap, RepackCommandAsString(cmd)); /* Check heap and index are valid to cluster on */ if (OidIsValid(indexOid)) @@ -442,6 +381,24 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) else index = NULL; + /* + * When allow_system_table_mods is turned off, we disallow repacking a + * catalog on a particular index unless that's already the clustered index + * for that catalog. + * + * XXX We don't check for this in CLUSTER, because it's historically been + * allowed. + */ + if (cmd != REPACK_COMMAND_CLUSTER && + !allowSystemTableMods && OidIsValid(indexOid) && + IsCatalogRelation(OldHeap) && !index->rd_index->indisclustered) + ereport(ERROR, + errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied: \"%s\" is a system catalog", + RelationGetRelationName(OldHeap)), + errdetail("System catalogs can only be clustered by the index they're already clustered on, if any, unless \"%s\" is enabled.", + "allow_system_table_mods")); + /* * Quietly ignore the request if this is a materialized view which has not * been populated from its query. No harm is done because there is no data @@ -482,6 +439,63 @@ cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params) pgstat_progress_end_command(); } +/* + * Check if the table (and its index) still meets the requirements of + * cluster_rel(). 
+ */ +static bool +cluster_rel_recheck(RepackCommand cmd, Relation OldHeap, Oid indexOid, + Oid userid, int options) +{ + Oid tableOid = RelationGetRelid(OldHeap); + + /* Check that the user still has privileges for the relation */ + if (!repack_is_permitted_for_relation(cmd, tableOid, userid)) + { + relation_close(OldHeap, AccessExclusiveLock); + return false; + } + + /* + * Silently skip a temp table for a remote session. Only doing this check + * in the "recheck" case is appropriate (which currently means somebody is + * executing a database-wide CLUSTER or on a partitioned table), because + * there is another check in cluster() which will stop any attempt to + * cluster remote temp tables by name. There is another check in + * cluster_rel which is redundant, but we leave it for extra safety. + */ + if (RELATION_IS_OTHER_TEMP(OldHeap)) + { + relation_close(OldHeap, AccessExclusiveLock); + return false; + } + + if (OidIsValid(indexOid)) + { + /* + * Check that the index still exists + */ + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid))) + { + relation_close(OldHeap, AccessExclusiveLock); + return false; + } + + /* + * Check that the index is still the one with indisclustered set, if + * needed. 
+ */ + if ((options & CLUOPT_RECHECK_ISCLUSTERED) != 0 && + !get_index_isclustered(indexOid)) + { + relation_close(OldHeap, AccessExclusiveLock); + return false; + } + } + + return true; +} + /* * Verify that the specified heap and index are valid to cluster on * @@ -642,8 +656,8 @@ rebuild_relation(Relation OldHeap, Relation index, bool verbose) Assert(CheckRelationLockedByMe(OldHeap, AccessExclusiveLock, false) && (index == NULL || CheckRelationLockedByMe(index, AccessExclusiveLock, false))); - if (index) - /* Mark the correct index as clustered */ + /* for CLUSTER or REPACK USING INDEX, mark the index as the one to use */ + if (index != NULL) mark_index_clustered(OldHeap, RelationGetRelid(index), true); /* Remember info about rel before closing OldHeap */ @@ -958,20 +972,20 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb /* Log what we're doing */ if (OldIndex != NULL && !use_sort) ereport(elevel, - (errmsg("clustering \"%s.%s\" using index scan on \"%s\"", - nspname, - RelationGetRelationName(OldHeap), - RelationGetRelationName(OldIndex)))); + errmsg("repacking \"%s.%s\" using index scan on \"%s\"", + nspname, + RelationGetRelationName(OldHeap), + RelationGetRelationName(OldIndex))); else if (use_sort) ereport(elevel, - (errmsg("clustering \"%s.%s\" using sequential scan and sort", - nspname, - RelationGetRelationName(OldHeap)))); + errmsg("repacking \"%s.%s\" using sequential scan and sort", + nspname, + RelationGetRelationName(OldHeap))); else ereport(elevel, - (errmsg("vacuuming \"%s.%s\"", - nspname, - RelationGetRelationName(OldHeap)))); + errmsg("repacking \"%s.%s\" in physical order", + nspname, + RelationGetRelationName(OldHeap))); /* * Hand off the actual copying to AM specific function, the generic code @@ -1458,8 +1472,8 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, int i; /* Report that we are now swapping relation files */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - 
PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES); + pgstat_progress_update_param(PROGRESS_REPACK_PHASE, + PROGRESS_REPACK_PHASE_SWAP_REL_FILES); /* Zero out possible results from swapped_relation_files */ memset(mapped_tables, 0, sizeof(mapped_tables)); @@ -1509,14 +1523,14 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT; /* Report that we are now reindexing relations */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_PHASE_REBUILD_INDEX); + pgstat_progress_update_param(PROGRESS_REPACK_PHASE, + PROGRESS_REPACK_PHASE_REBUILD_INDEX); reindex_relation(NULL, OIDOldHeap, reindex_flags, &reindex_params); /* Report that we are now doing clean up */ - pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, - PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP); + pgstat_progress_update_param(PROGRESS_REPACK_PHASE, + PROGRESS_REPACK_PHASE_FINAL_CLEANUP); /* * If the relation being rebuilt is pg_class, swap_relation_files() @@ -1632,123 +1646,386 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, } } - /* - * Get a list of tables that the current user has privileges on and - * have indisclustered set. Return the list in a List * of RelToCluster - * (stored in the specified memory context), each one giving the tableOid - * and the indexOid on which the table is already clustered. + * Determine which relations to process, when REPACK/CLUSTER is called + * without specifying a table name. The exact process depends on whether + * USING INDEX was given or not, and in any case we only return tables and + * materialized views that the current user has privileges to repack/cluster. + * + * If USING INDEX was given, we scan pg_index to find those that have + * indisclustered set; if it was not given, scan pg_class and return all + * tables. + * + * Return it as a list of RelToCluster in the given memory context. 
*/ static List * -get_tables_to_cluster(MemoryContext cluster_context) +get_tables_to_repack(RepackCommand cmd, bool usingindex, MemoryContext permcxt) { - Relation indRelation; + Relation catalog; TableScanDesc scan; - ScanKeyData entry; - HeapTuple indexTuple; - Form_pg_index index; - MemoryContext old_context; + HeapTuple tuple; List *rtcs = NIL; - /* - * Get all indexes that have indisclustered set and that the current user - * has the appropriate privileges for. - */ - indRelation = table_open(IndexRelationId, AccessShareLock); - ScanKeyInit(&entry, - Anum_pg_index_indisclustered, - BTEqualStrategyNumber, F_BOOLEQ, - BoolGetDatum(true)); - scan = table_beginscan_catalog(indRelation, 1, &entry); - while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + if (usingindex) { - RelToCluster *rtc; + ScanKeyData entry; - index = (Form_pg_index) GETSTRUCT(indexTuple); + /* + * For USING INDEX, scan pg_index to find those with indisclustered. + */ + catalog = table_open(IndexRelationId, AccessShareLock); + ScanKeyInit(&entry, + Anum_pg_index_indisclustered, + BTEqualStrategyNumber, F_BOOLEQ, + BoolGetDatum(true)); + scan = table_beginscan_catalog(catalog, 1, &entry); + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + RelToCluster *rtc; + Form_pg_index index; + MemoryContext oldcxt; - if (!cluster_is_permitted_for_relation(index->indrelid, GetUserId())) - continue; + index = (Form_pg_index) GETSTRUCT(tuple); - /* Use a permanent memory context for the result list */ - old_context = MemoryContextSwitchTo(cluster_context); + /* + * Try to obtain a light lock on the index's table, to ensure it + * doesn't go away while we collect the list. If we cannot, just + * disregard it. Be sure to release this if we ultimately decide + * not to process the table! 
+ */ + if (!ConditionalLockRelationOid(index->indrelid, AccessShareLock)) + continue; - rtc = palloc_object(RelToCluster); - rtc->tableOid = index->indrelid; - rtc->indexOid = index->indexrelid; - rtcs = lappend(rtcs, rtc); + /* Verify that the table still exists; skip if not */ + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(index->indrelid))) + { + UnlockRelationOid(index->indrelid, AccessShareLock); + continue; + } - MemoryContextSwitchTo(old_context); + /* noisily skip rels which the user can't process */ + if (!repack_is_permitted_for_relation(cmd, index->indrelid, + GetUserId())) + { + UnlockRelationOid(index->indrelid, AccessShareLock); + continue; + } + + /* Use a permanent memory context for the result list */ + oldcxt = MemoryContextSwitchTo(permcxt); + rtc = palloc_object(RelToCluster); + rtc->tableOid = index->indrelid; + rtc->indexOid = index->indexrelid; + rtcs = lappend(rtcs, rtc); + MemoryContextSwitchTo(oldcxt); + } } - table_endscan(scan); + else + { + catalog = table_open(RelationRelationId, AccessShareLock); + scan = table_beginscan_catalog(catalog, 0, NULL); + + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + RelToCluster *rtc; + Form_pg_class class; + MemoryContext oldcxt; + + class = (Form_pg_class) GETSTRUCT(tuple); + + /* + * Try to obtain a light lock on the table, to ensure it doesn't + * go away while we collect the list. If we cannot, just + * disregard the table. Be sure to release this if we ultimately + * decide not to process the table! 
+ */ + if (!ConditionalLockRelationOid(class->oid, AccessShareLock)) + continue; + + /* Verify that the table still exists */ + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(class->oid))) + { + UnlockRelationOid(class->oid, AccessShareLock); + continue; + } + + /* Can only process plain tables and matviews */ + if (class->relkind != RELKIND_RELATION && + class->relkind != RELKIND_MATVIEW) + { + UnlockRelationOid(class->oid, AccessShareLock); + continue; + } - relation_close(indRelation, AccessShareLock); + /* noisily skip rels which the user can't process */ + if (!repack_is_permitted_for_relation(cmd, class->oid, + GetUserId())) + { + UnlockRelationOid(class->oid, AccessShareLock); + continue; + } + + /* Use a permanent memory context for the result list */ + oldcxt = MemoryContextSwitchTo(permcxt); + rtc = palloc_object(RelToCluster); + rtc->tableOid = class->oid; + rtc->indexOid = InvalidOid; + rtcs = lappend(rtcs, rtc); + MemoryContextSwitchTo(oldcxt); + } + } + + table_endscan(scan); + relation_close(catalog, AccessShareLock); return rtcs; } /* - * Given an index on a partitioned table, return a list of RelToCluster for - * all the children leaves tables/indexes. + * Given a partitioned table or its index, return a list of RelToCluster for + * all the leaf child tables/indexes. * - * Like expand_vacuum_rel, but here caller must hold AccessExclusiveLock - * on the table containing the index. + * 'rel_is_index' tells whether 'relid' is that of an index (true) or of the + * owning relation. 
*/ static List * -get_tables_to_cluster_partitioned(MemoryContext cluster_context, Oid indexOid) +get_tables_to_repack_partitioned(RepackCommand cmd, Oid relid, + bool rel_is_index, MemoryContext permcxt) { List *inhoids; - ListCell *lc; List *rtcs = NIL; - MemoryContext old_context; - - /* Do not lock the children until they're processed */ - inhoids = find_all_inheritors(indexOid, NoLock, NULL); - foreach(lc, inhoids) + /* + * Do not lock the children until they're processed. Note that we do hold + * a lock on the parent partitioned table. + */ + inhoids = find_all_inheritors(relid, NoLock, NULL); + foreach_oid(child_oid, inhoids) { - Oid indexrelid = lfirst_oid(lc); - Oid relid = IndexGetRelation(indexrelid, false); + Oid table_oid, + index_oid; RelToCluster *rtc; + MemoryContext oldcxt; - /* consider only leaf indexes */ - if (get_rel_relkind(indexrelid) != RELKIND_INDEX) - continue; + if (rel_is_index) + { + /* consider only leaf indexes */ + if (get_rel_relkind(child_oid) != RELKIND_INDEX) + continue; + + table_oid = IndexGetRelation(child_oid, false); + index_oid = child_oid; + } + else + { + /* consider only leaf relations */ + if (get_rel_relkind(child_oid) != RELKIND_RELATION) + continue; + + table_oid = child_oid; + index_oid = InvalidOid; + } /* * It's possible that the user does not have privileges to CLUSTER the - * leaf partition despite having such privileges on the partitioned - * table. We skip any partitions which the user is not permitted to - * CLUSTER. + * leaf partition despite having them on the partitioned table. Skip + * if so. 
*/ - if (!cluster_is_permitted_for_relation(relid, GetUserId())) + if (!repack_is_permitted_for_relation(cmd, table_oid, GetUserId())) continue; /* Use a permanent memory context for the result list */ - old_context = MemoryContextSwitchTo(cluster_context); - + oldcxt = MemoryContextSwitchTo(permcxt); rtc = palloc_object(RelToCluster); - rtc->tableOid = relid; - rtc->indexOid = indexrelid; + rtc->tableOid = table_oid; + rtc->indexOid = index_oid; rtcs = lappend(rtcs, rtc); - - MemoryContextSwitchTo(old_context); + MemoryContextSwitchTo(oldcxt); } return rtcs; } + /* - * Return whether userid has privileges to CLUSTER relid. If not, this + * Return whether userid has privileges to REPACK relid. If not, this * function emits a WARNING. */ static bool -cluster_is_permitted_for_relation(Oid relid, Oid userid) +repack_is_permitted_for_relation(RepackCommand cmd, Oid relid, Oid userid) { + Assert(cmd == REPACK_COMMAND_CLUSTER || cmd == REPACK_COMMAND_REPACK); + if (pg_class_aclcheck(relid, userid, ACL_MAINTAIN) == ACLCHECK_OK) return true; ereport(WARNING, - (errmsg("permission denied to cluster \"%s\", skipping it", - get_rel_name(relid)))); + errmsg("permission denied to execute %s on \"%s\", skipping it", + RepackCommandAsString(cmd), + get_rel_name(relid))); + return false; } + + +/* + * Given a RepackStmt with an indicated relation name, resolve the relation + * name, obtain lock on it, then determine what to do based on the relation + * type: if it's table and not partitioned, repack it as indicated (using an + * existing clustered index, or following the given one), and return NULL. + * + * On the other hand, if the table is partitioned, do nothing further and + * instead return the opened and locked relcache entry, so that caller can + * process the partitions using the multiple-table handling code. In this + * case, if an index name is given, it's up to the caller to resolve it. 
+ */ +static Relation +process_single_relation(RepackStmt *stmt, ClusterParams *params) +{ + Relation rel; + Oid tableOid; + + Assert(stmt->relation != NULL); + Assert(stmt->command == REPACK_COMMAND_CLUSTER || + stmt->command == REPACK_COMMAND_REPACK); + + /* + * Make sure ANALYZE is specified if a column list is present. + */ + if ((params->options & CLUOPT_ANALYZE) == 0 && stmt->relation->va_cols != NIL) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ANALYZE option must be specified when a column list is provided")); + + /* + * Find, lock, and check permissions on the table. We obtain + * AccessExclusiveLock right away to avoid lock-upgrade hazard in the + * single-transaction case. + */ + tableOid = RangeVarGetRelidExtended(stmt->relation->relation, + AccessExclusiveLock, + 0, + RangeVarCallbackMaintainsTable, + NULL); + rel = table_open(tableOid, NoLock); + + /* + * Reject clustering a remote temp table ... their local buffer manager is + * not going to cope. + */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /*- translator: first %s is name of a SQL command, eg. REPACK */ + errmsg("cannot execute %s on temporary tables of other sessions", + RepackCommandAsString(stmt->command))); + + /* + * For partitioned tables, let caller handle this. Otherwise, process it + * here and we're done. + */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + return rel; + else + { + Oid indexOid; + + indexOid = determine_clustered_index(rel, stmt->usingindex, + stmt->indexname); + if (OidIsValid(indexOid)) + check_index_is_clusterable(rel, indexOid, AccessExclusiveLock); + cluster_rel(stmt->command, rel, indexOid, params); + + /* + * Do an analyze, if requested. We close the transaction and start a + * new one, so that we don't hold the stronger lock for longer than + * needed. 
+ */ + if (params->options & CLUOPT_ANALYZE) + { + VacuumParams vac_params = {0}; + + PopActiveSnapshot(); + CommitTransactionCommand(); + + StartTransactionCommand(); + PushActiveSnapshot(GetTransactionSnapshot()); + + vac_params.options |= VACOPT_ANALYZE; + if (params->options & CLUOPT_VERBOSE) + vac_params.options |= VACOPT_VERBOSE; + analyze_rel(tableOid, NULL, vac_params, + stmt->relation->va_cols, true, NULL); + PopActiveSnapshot(); + CommandCounterIncrement(); + } + + return NULL; + } +} + +/* + * Given a relation and the usingindex/indexname options in a + * REPACK USING INDEX or CLUSTER command, return the OID of the + * index to use for clustering the table. + * + * Caller must hold lock on the relation so that the set of indexes + * doesn't change, and must call check_index_is_clusterable. + */ +static Oid +determine_clustered_index(Relation rel, bool usingindex, const char *indexname) +{ + Oid indexOid; + + if (indexname == NULL && usingindex) + { + /* + * If USING INDEX with no name is given, find a clustered index, or + * error out if none. + */ + indexOid = InvalidOid; + foreach_oid(idxoid, RelationGetIndexList(rel)) + { + if (get_index_isclustered(idxoid)) + { + indexOid = idxoid; + break; + } + } + + if (!OidIsValid(indexOid)) + ereport(ERROR, + errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("there is no previously clustered index for table \"%s\"", + RelationGetRelationName(rel))); + } + else if (indexname != NULL) + { + /* An index was specified; obtain its OID. 
*/ + indexOid = get_relname_relid(indexname, rel->rd_rel->relnamespace); + if (!OidIsValid(indexOid)) + ereport(ERROR, + errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("index \"%s\" for table \"%s\" does not exist", + indexname, RelationGetRelationName(rel))); + } + else + indexOid = InvalidOid; + + return indexOid; +} + +static const char * +RepackCommandAsString(RepackCommand cmd) +{ + switch (cmd) + { + case REPACK_COMMAND_REPACK: + return "REPACK"; + case REPACK_COMMAND_VACUUMFULL: + return "VACUUM"; + case REPACK_COMMAND_CLUSTER: + return "CLUSTER"; + } + return "???"; /* keep compiler quiet */ +} diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 9e21d7a7df941..724637cff5bd4 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -2993,7 +2993,7 @@ check_pub_dead_tuple_retention(WalReceiverConn *wrconn) if (remote_in_recovery) ereport(ERROR, errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot enable retain_dead_tuples if the publisher is in recovery.")); + errmsg("cannot enable retain_dead_tuples if the publisher is in recovery")); ExecDropSingleTupleTableSlot(slot); diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 62c1ebdfd9b20..bce3a2daa245d 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -352,7 +352,6 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) } } - /* * Sanity check DISABLE_PAGE_SKIPPING option. 
*/ @@ -2294,8 +2293,9 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams params, if ((params.options & VACOPT_VERBOSE) != 0) cluster_params.options |= CLUOPT_VERBOSE; - /* VACUUM FULL is now a variant of CLUSTER; see cluster.c */ - cluster_rel(rel, InvalidOid, &cluster_params); + /* VACUUM FULL is a variant of REPACK; see cluster.c */ + cluster_rel(REPACK_COMMAND_VACUUMFULL, rel, InvalidOid, + &cluster_params); /* cluster_rel closes the relation, but keeps lock */ rel = NULL; diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index a40610bc2522f..03cc82182ee4a 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -72,7 +72,7 @@ InstrStartNode(Instrumentation *instr) if (!INSTR_TIME_IS_ZERO(instr->starttime)) elog(ERROR, "InstrStartNode called twice in a row"); else - INSTR_TIME_SET_CURRENT(instr->starttime); + INSTR_TIME_SET_CURRENT_FAST(instr->starttime); } /* save buffer usage totals at node entry, if needed */ @@ -99,7 +99,7 @@ InstrStopNode(Instrumentation *instr, double nTuples) if (INSTR_TIME_IS_ZERO(instr->starttime)) elog(ERROR, "InstrStopNode called without start"); - INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_SET_CURRENT_FAST(endtime); INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime); INSTR_TIME_SET_ZERO(instr->starttime); @@ -294,3 +294,57 @@ WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub) dst->wal_fpi_bytes += add->wal_fpi_bytes - sub->wal_fpi_bytes; dst->wal_buffers_full += add->wal_buffers_full - sub->wal_buffers_full; } + +/* GUC hooks for timing_clock_source */ + +#include "portability/instr_time.h" +#include "utils/guc_hooks.h" + +bool +check_timing_clock_source(int *newval, void **extra, GucSource source) +{ + pg_initialize_timing(true); + +#if PG_INSTR_TSC_CLOCK + if (*newval == TIMING_CLOCK_SOURCE_TSC && !has_usable_tsc) + { + GUC_check_errdetail("TSC is not supported as timing clock source"); + return false; + } +#endif + + 
return true; +} + +void +assign_timing_clock_source(int newval, void *extra) +{ + /* + * Ignore the return code since the check hook already verified TSC is + * usable if its explicitly requested + */ + pg_set_timing_clock_source(newval); +} + +const char * +show_timing_clock_source(void) +{ + switch (timing_clock_source) + { + case TIMING_CLOCK_SOURCE_AUTO: +#if PG_INSTR_TSC_CLOCK + if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC) + return "auto (tsc)"; +#endif + return "auto (system)"; + case TIMING_CLOCK_SOURCE_SYSTEM: + return "system"; +#if PG_INSTR_TSC_CLOCK + case TIMING_CLOCK_SOURCE_TSC: + return "tsc"; +#endif + } + + /* unreachable */ + return "?"; +} diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 7b9b602f3c4b0..bcb45a54678bb 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -90,6 +90,16 @@ main(int argc, char *argv[]) */ startup_hacks(progname); + /* + * Initialize timing infrastructure + */ +#if defined(WIN32) + /* Skip TSC calibration on Windows, its too expensive per connection */ + pg_initialize_timing(false); +#else + pg_initialize_timing(true); +#endif + /* * Remember the physical location of the initially given argv[] array for * possible use by ps display. 
On some platforms, the argv[] storage must diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index e0c00e26dd5d5..044560da7bf7a 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -1048,6 +1048,7 @@ try_partial_nestloop_path(PlannerInfo *root, initial_cost_nestloop(root, &workspace, jointype, nestloop_subtype, outer_path, inner_path, extra); if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes, + workspace.startup_cost, workspace.total_cost, pathkeys)) return; @@ -1237,6 +1238,7 @@ try_partial_mergejoin_path(PlannerInfo *root, extra); if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes, + workspace.startup_cost, workspace.total_cost, pathkeys)) return; @@ -1369,6 +1371,7 @@ try_partial_hashjoin_path(PlannerInfo *root, initial_cost_hashjoin(root, &workspace, jointype, hashclauses, outer_path, inner_path, extra, parallel_hash); if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes, + workspace.startup_cost, workspace.total_cost, NIL)) return; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index ef8ef6e89d377..96cc72a776b8d 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -778,10 +778,9 @@ add_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, * * Because we don't consider parameterized paths here, we also don't * need to consider the row counts as a measure of quality: every path will - * produce the same number of rows. Neither do we need to consider startup - * costs: parallelism is only used for plans that will be run to completion. - * Therefore, this routine is much simpler than add_path: it needs to - * consider only disabled nodes, pathkeys and total cost. + * produce the same number of rows. However, we do need to consider the + * startup costs: this partial path could be used beneath a Limit node, + * so a fast-start plan could be correct. 
* * As with add_path, we pfree paths that are found to be dominated by * another partial path; this requires that there be no other references to @@ -819,52 +818,41 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path) /* Compare pathkeys. */ keyscmp = compare_pathkeys(new_path->pathkeys, old_path->pathkeys); - /* Unless pathkeys are incompatible, keep just one of the two paths. */ + /* + * Unless pathkeys are incompatible, see if one of the paths dominates + * the other (both in startup and total cost). It may happen that one + * path has lower startup cost, the other has lower total cost. + */ if (keyscmp != PATHKEYS_DIFFERENT) { - if (unlikely(new_path->disabled_nodes != old_path->disabled_nodes)) + PathCostComparison costcmp; + + /* + * Do a fuzzy cost comparison with standard fuzziness limit. + */ + costcmp = compare_path_costs_fuzzily(new_path, old_path, + STD_FUZZ_FACTOR); + if (costcmp == COSTS_BETTER1) { - if (new_path->disabled_nodes > old_path->disabled_nodes) - accept_new = false; - else + if (keyscmp != PATHKEYS_BETTER2) remove_old = true; } - else if (new_path->total_cost > old_path->total_cost - * STD_FUZZ_FACTOR) + else if (costcmp == COSTS_BETTER2) { - /* New path costs more; keep it only if pathkeys are better. */ if (keyscmp != PATHKEYS_BETTER1) accept_new = false; } - else if (old_path->total_cost > new_path->total_cost - * STD_FUZZ_FACTOR) + else if (costcmp == COSTS_EQUAL) { - /* Old path costs more; keep it only if pathkeys are better. */ - if (keyscmp != PATHKEYS_BETTER2) + if (keyscmp == PATHKEYS_BETTER1) remove_old = true; - } - else if (keyscmp == PATHKEYS_BETTER1) - { - /* Costs are about the same, new path has better pathkeys. */ - remove_old = true; - } - else if (keyscmp == PATHKEYS_BETTER2) - { - /* Costs are about the same, old path has better pathkeys. */ - accept_new = false; - } - else if (old_path->total_cost > new_path->total_cost * 1.0000000001) - { - /* Pathkeys are the same, and the old path costs more. 
*/ - remove_old = true; - } - else - { - /* - * Pathkeys are the same, and new path isn't materially - * cheaper. - */ - accept_new = false; + else if (keyscmp == PATHKEYS_BETTER2) + accept_new = false; + else if (compare_path_costs_fuzzily(new_path, old_path, + 1.0000000001) == COSTS_BETTER1) + remove_old = true; + else + accept_new = false; } } @@ -915,16 +903,16 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path) * add_partial_path_precheck * Check whether a proposed new partial path could possibly get accepted. * - * Unlike add_path_precheck, we can ignore startup cost and parameterization, - * since they don't matter for partial paths (see add_partial_path). But - * we do want to make sure we don't add a partial path if there's already - * a complete path that dominates it, since in that case the proposed path - * is surely a loser. + * Unlike add_path_precheck, we can ignore parameterization, since it doesn't + * matter for partial paths (see add_partial_path). But we do want to make + * sure we don't add a partial path if there's already a complete path that + * dominates it, since in that case the proposed path is surely a loser. */ bool add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, - Cost total_cost, List *pathkeys) + Cost startup_cost, Cost total_cost, List *pathkeys) { + bool consider_startup = parent_rel->consider_startup; ListCell *p1; /* @@ -934,25 +922,81 @@ add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, * is clearly superior to some existing partial path -- at least, modulo * final cost computations. If so, we definitely want to consider it. * - * Unlike add_path(), we always compare pathkeys here. This is because we - * expect partial_pathlist to be very short, and getting a definitive - * answer at this stage avoids the need to call add_path_precheck. + * Unlike add_path(), we never try to exit this loop early. 
This is + * because we expect partial_pathlist to be very short, and getting a + * definitive answer at this stage avoids the need to call + * add_path_precheck. */ foreach(p1, parent_rel->partial_pathlist) { Path *old_path = (Path *) lfirst(p1); + PathCostComparison costcmp; PathKeysComparison keyscmp; - keyscmp = compare_pathkeys(pathkeys, old_path->pathkeys); - if (keyscmp != PATHKEYS_DIFFERENT) + /* + * First, compare costs and disabled nodes. This logic should be + * identical to compare_path_costs_fuzzily, except that one of the + * paths hasn't been created yet, and the fuzz factor is always + * STD_FUZZ_FACTOR. + */ + if (unlikely(old_path->disabled_nodes != disabled_nodes)) + { + if (disabled_nodes < old_path->disabled_nodes) + costcmp = COSTS_BETTER1; + else + costcmp = COSTS_BETTER2; + } + else if (total_cost > old_path->total_cost * STD_FUZZ_FACTOR) + { + if (consider_startup && + old_path->startup_cost > startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_DIFFERENT; + else + costcmp = COSTS_BETTER2; + } + else if (old_path->total_cost > total_cost * STD_FUZZ_FACTOR) { - if (total_cost > old_path->total_cost * STD_FUZZ_FACTOR && - keyscmp != PATHKEYS_BETTER1) - return false; - if (old_path->total_cost > total_cost * STD_FUZZ_FACTOR && - keyscmp != PATHKEYS_BETTER2) - return true; + if (consider_startup && + startup_cost > old_path->startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_DIFFERENT; + else + costcmp = COSTS_BETTER1; } + else if (startup_cost > old_path->startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_BETTER2; + else if (old_path->startup_cost > startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_BETTER1; + else + costcmp = COSTS_EQUAL; + + /* + * If one path wins on startup cost and the other on total cost, we + * can't say for sure which is better. + */ + if (costcmp == COSTS_DIFFERENT) + continue; + + /* + * If the two paths have different pathkeys, we can't say for sure + * which is better. 
+ */ + keyscmp = compare_pathkeys(pathkeys, old_path->pathkeys); + if (keyscmp == PATHKEYS_DIFFERENT) + continue; + + /* + * If the existing path is cheaper and the pathkeys are equal or + * worse, the new path is not interesting. + */ + if (costcmp == COSTS_BETTER2 && keyscmp != PATHKEYS_BETTER1) + return false; + + /* + * If the new path is cheaper and the pathkeys are equal or better, it + * is definitely interesting. + */ + if (costcmp == COSTS_BETTER1 && keyscmp != PATHKEYS_BETTER2) + return true; } /* @@ -960,14 +1004,9 @@ add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, * clearly good enough that it might replace one. Compare it to * non-parallel plans. If it loses even before accounting for the cost of * the Gather node, we should definitely reject it. - * - * Note that we pass the total_cost to add_path_precheck twice. This is - * because it's never advantageous to consider the startup cost of a - * partial path; the resulting plans, if run in parallel, will be run to - * completion. */ - if (!add_path_precheck(parent_rel, disabled_nodes, total_cost, total_cost, - pathkeys, NULL)) + if (!add_path_precheck(parent_rel, disabled_nodes, startup_cost, + total_cost, pathkeys, NULL)) return false; return true; @@ -1083,6 +1122,14 @@ create_index_path(PlannerInfo *root, cost_index(pathnode, root, loop_count, partial_path); + /* + * cost_index will set disabled_nodes to 1 if this rel is not allowed to + * use index scans in general, but it doesn't have the IndexOptInfo to + * know whether this specific index has been disabled. 
+ */ + if (index->disabled) + pathnode->path.disabled_nodes = 1; + return pathnode; } diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index d63e7390be764..b2fbd6a082bbc 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -57,9 +57,6 @@ /* GUC parameter */ int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION; -/* Hook for plugins to get control in get_relation_info() */ -get_relation_info_hook_type get_relation_info_hook = NULL; - typedef struct NotnullHashEntry { Oid relid; /* OID of the relation */ @@ -571,17 +568,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, set_relation_partition_info(root, rel, relation); table_close(relation, NoLock); - - /* - * Allow a plugin to editorialize on the info we obtained from the - * catalogs. Actions might include altering the assumed relation size, - * removing an index, or adding a hypothetical index to the indexlist. - * - * An extension can also modify rel->pgs_mask here to control path - * generation. - */ - if (get_relation_info_hook) - (*get_relation_info_hook) (root, relationObjectId, inhparent, rel); } /* diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index d21b4d3bb3563..91bcda34a3786 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -47,6 +47,9 @@ typedef struct JoinHashEntry RelOptInfo *join_rel; } JoinHashEntry; +/* Hook for plugins to get control in build_simple_rel() */ +build_simple_rel_hook_type build_simple_rel_hook = NULL; + /* Hook for plugins to get control during joinrel setup */ joinrel_setup_hook_type joinrel_setup_hook = NULL; @@ -394,6 +397,18 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) break; } + /* + * Allow a plugin to editorialize on the new RelOptInfo. 
This could + * involve editorializing on the information which get_relation_info + * obtained from the catalogs, such as altering the assumed relation size, + * removing an index, or adding a hypothetical index to the indexlist. + * + * An extension can also modify rel->pgs_mask here to control path + * generation. + */ + if (build_simple_rel_hook) + (*build_simple_rel_hook) (root, rel, rte); + /* * Apply the parent's quals to the child, with appropriate substitution of * variables. If any resulting clause is reduced to constant FALSE or diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 9cbe8eafc4545..f01f5734fe938 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -288,7 +288,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); AlterCompositeTypeStmt AlterUserMappingStmt AlterRoleStmt AlterRoleSetStmt AlterPolicyStmt AlterStatsStmt AlterDefaultPrivilegesStmt DefACLAction - AnalyzeStmt CallStmt ClosePortalStmt ClusterStmt CommentStmt + AnalyzeStmt CallStmt ClosePortalStmt CommentStmt ConstraintsSetStmt CopyStmt CreateAsStmt CreateCastStmt CreateDomainStmt CreateExtensionStmt CreateGroupStmt CreateOpClassStmt CreateOpFamilyStmt AlterOpFamilyStmt CreatePLangStmt @@ -305,7 +305,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); GrantStmt GrantRoleStmt ImportForeignSchemaStmt IndexStmt InsertStmt ListenStmt LoadStmt LockStmt MergeStmt NotifyStmt ExplainableStmt PreparableStmt CreateFunctionStmt AlterFunctionStmt ReindexStmt RemoveAggrStmt - RemoveFuncStmt RemoveOperStmt RenameStmt ReturnStmt RevokeStmt RevokeRoleStmt + RemoveFuncStmt RemoveOperStmt RenameStmt RepackStmt ReturnStmt RevokeStmt RevokeRoleStmt RuleActionStmt RuleActionStmtOrEmpty RuleStmt SecLabelStmt SelectStmt TransactionStmt TransactionStmtLegacy TruncateStmt UnlistenStmt UpdateStmt VacuumStmt @@ -324,7 +324,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node 
*query); %type opt_single_name %type opt_qualified_name -%type opt_concurrently +%type opt_concurrently opt_usingindex %type opt_drop_behavior %type opt_utility_option_list %type opt_wait_with_clause @@ -620,8 +620,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type opt_provider security_label -%type xml_attribute_el -%type xml_attribute_list xml_attributes +%type labeled_expr +%type labeled_expr_list xml_attributes %type xml_root_version opt_xml_root_standalone %type xmlexists_argument %type document_or_content @@ -776,7 +776,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); QUOTE QUOTES RANGE READ REAL REASSIGN RECURSIVE REF_P REFERENCES REFERENCING - REFRESH REINDEX RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA + REFRESH REINDEX RELATIVE_P RELEASE RENAME REPACK REPEATABLE REPLACE REPLICA RESET RESPECT_P RESTART RESTRICT RETURN RETURNING RETURNS REVOKE RIGHT ROLE ROLLBACK ROLLUP ROUTINE ROUTINES ROW ROWS RULE @@ -1038,7 +1038,6 @@ stmt: | CallStmt | CheckPointStmt | ClosePortalStmt - | ClusterStmt | CommentStmt | ConstraintsSetStmt | CopyStmt @@ -1112,6 +1111,7 @@ stmt: | RemoveFuncStmt | RemoveOperStmt | RenameStmt + | RepackStmt | RevokeStmt | RevokeRoleStmt | RuleStmt @@ -1149,6 +1149,11 @@ opt_concurrently: | /*EMPTY*/ { $$ = false; } ; +opt_usingindex: + USING INDEX { $$ = true; } + | /* EMPTY */ { $$ = false; } + ; + opt_drop_behavior: CASCADE { $$ = DROP_CASCADE; } | RESTRICT { $$ = DROP_RESTRICT; } @@ -12085,38 +12090,82 @@ CreateConversionStmt: /***************************************************************************** * * QUERY: + * REPACK [ (options) ] [ [ ] [ USING INDEX ] ] + * + * obsolete variants: * CLUSTER (options) [ [ USING ] ] * CLUSTER [VERBOSE] [ [ USING ] ] * CLUSTER [VERBOSE] ON (for pre-8.3) * *****************************************************************************/ -ClusterStmt: - CLUSTER '(' utility_option_list ')' qualified_name 
cluster_index_specification +RepackStmt: + REPACK opt_utility_option_list vacuum_relation USING INDEX name { - ClusterStmt *n = makeNode(ClusterStmt); + RepackStmt *n = makeNode(RepackStmt); - n->relation = $5; + n->command = REPACK_COMMAND_REPACK; + n->relation = (VacuumRelation *) $3; n->indexname = $6; + n->usingindex = true; + n->params = $2; + $$ = (Node *) n; + } + | REPACK opt_utility_option_list vacuum_relation opt_usingindex + { + RepackStmt *n = makeNode(RepackStmt); + + n->command = REPACK_COMMAND_REPACK; + n->relation = (VacuumRelation *) $3; + n->indexname = NULL; + n->usingindex = $4; + n->params = $2; + $$ = (Node *) n; + } + | REPACK opt_utility_option_list opt_usingindex + { + RepackStmt *n = makeNode(RepackStmt); + + n->command = REPACK_COMMAND_REPACK; + n->relation = NULL; + n->indexname = NULL; + n->usingindex = $3; + n->params = $2; + $$ = (Node *) n; + } + | CLUSTER '(' utility_option_list ')' qualified_name cluster_index_specification + { + RepackStmt *n = makeNode(RepackStmt); + + n->command = REPACK_COMMAND_CLUSTER; + n->relation = makeNode(VacuumRelation); + n->relation->relation = $5; + n->indexname = $6; + n->usingindex = true; n->params = $3; $$ = (Node *) n; } | CLUSTER opt_utility_option_list { - ClusterStmt *n = makeNode(ClusterStmt); + RepackStmt *n = makeNode(RepackStmt); + n->command = REPACK_COMMAND_CLUSTER; n->relation = NULL; n->indexname = NULL; + n->usingindex = true; n->params = $2; $$ = (Node *) n; } /* unparenthesized VERBOSE kept for pre-14 compatibility */ | CLUSTER opt_verbose qualified_name cluster_index_specification { - ClusterStmt *n = makeNode(ClusterStmt); + RepackStmt *n = makeNode(RepackStmt); - n->relation = $3; + n->command = REPACK_COMMAND_CLUSTER; + n->relation = makeNode(VacuumRelation); + n->relation->relation = $3; n->indexname = $4; + n->usingindex = true; if ($2) n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; @@ -12124,20 +12173,25 @@ ClusterStmt: /* unparenthesized VERBOSE 
kept for pre-17 compatibility */ | CLUSTER VERBOSE { - ClusterStmt *n = makeNode(ClusterStmt); + RepackStmt *n = makeNode(RepackStmt); + n->command = REPACK_COMMAND_CLUSTER; n->relation = NULL; n->indexname = NULL; + n->usingindex = true; n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; } /* kept for pre-8.3 compatibility */ | CLUSTER opt_verbose name ON qualified_name { - ClusterStmt *n = makeNode(ClusterStmt); + RepackStmt *n = makeNode(RepackStmt); - n->relation = $5; + n->command = REPACK_COMMAND_CLUSTER; + n->relation = makeNode(VacuumRelation); + n->relation->relation = $5; n->indexname = $3; + n->usingindex = true; if ($2) n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; @@ -16317,7 +16371,7 @@ func_expr_common_subexpr: COERCE_SQL_SYNTAX, @1); } - | XMLFOREST '(' xml_attribute_list ')' + | XMLFOREST '(' labeled_expr_list ')' { $$ = makeXmlExpr(IS_XMLFOREST, NULL, $3, NIL, @1); } @@ -16542,14 +16596,14 @@ opt_xml_root_standalone: ',' STANDALONE_P YES_P { $$ = makeIntConst(XML_STANDALONE_OMITTED, -1); } ; -xml_attributes: XMLATTRIBUTES '(' xml_attribute_list ')' { $$ = $3; } +xml_attributes: XMLATTRIBUTES '(' labeled_expr_list ')' { $$ = $3; } ; -xml_attribute_list: xml_attribute_el { $$ = list_make1($1); } - | xml_attribute_list ',' xml_attribute_el { $$ = lappend($1, $3); } +labeled_expr_list: labeled_expr { $$ = list_make1($1); } + | labeled_expr_list ',' labeled_expr { $$ = lappend($1, $3); } ; -xml_attribute_el: a_expr AS ColLabel +labeled_expr: a_expr AS ColLabel { $$ = makeNode(ResTarget); $$->name = $3; @@ -18194,6 +18248,7 @@ unreserved_keyword: | RELATIVE_P | RELEASE | RENAME + | REPACK | REPEATABLE | REPLACE | REPLICA @@ -18831,6 +18886,7 @@ bare_label_keyword: | RELATIVE_P | RELEASE | RENAME + | REPACK | REPEATABLE | REPLACE | REPLICA diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index 742137edad69f..e1aa102f41dce 100644 --- 
a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -23,6 +23,7 @@ #include "postgres.h" #include "access/timeline.h" +#include "access/visibilitymap.h" #include "access/xlog.h" #include "access/xlog_internal.h" #include "access/xlogrecovery.h" @@ -1351,7 +1352,8 @@ SummarizeSmgrRecord(XLogReaderState *xlogreader, BlockRefTable *brtab) MAIN_FORKNUM, xlrec->blkno); if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0) BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator, - VISIBILITYMAP_FORKNUM, xlrec->blkno); + VISIBILITYMAP_FORKNUM, + visibilitymap_truncation_length(xlrec->blkno)); } } diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 7c99290be4d21..f98062668d6dc 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -195,7 +195,7 @@ AcquireRewriteLocks(Query *parsetree, else lockmode = rte->rellockmode; - rel = table_open(rte->relid, lockmode); + rel = relation_open(rte->relid, lockmode); /* * While we have the relation open, update the RTE's relkind, @@ -203,7 +203,7 @@ AcquireRewriteLocks(Query *parsetree, */ rte->relkind = rel->rd_rel->relkind; - table_close(rel, NoLock); + relation_close(rel, NoLock); break; case RTE_JOIN: @@ -2116,7 +2116,7 @@ fireRIRrules(Query *parsetree, List *activeRIRs) * We can use NoLock here since either the parser or * AcquireRewriteLocks should have locked the rel already. */ - rel = table_open(rte->relid, NoLock); + rel = relation_open(rte->relid, NoLock); /* * Collect the RIR rules that we must apply @@ -2226,7 +2226,7 @@ fireRIRrules(Query *parsetree, List *activeRIRs) rte->relkind != RELKIND_PARTITIONED_TABLE)) continue; - rel = table_open(rte->relid, NoLock); + rel = relation_open(rte->relid, NoLock); /* * Fetch any new security quals that must be applied to this RTE. @@ -3445,7 +3445,7 @@ rewriteTargetView(Query *parsetree, Relation view) * already have the right lock!) 
Since it will become the query target * relation, RowExclusiveLock is always the right thing. */ - base_rel = table_open(base_rte->relid, RowExclusiveLock); + base_rel = relation_open(base_rte->relid, RowExclusiveLock); /* * While we have the relation open, update the RTE's relkind, just in case @@ -4021,7 +4021,7 @@ RewriteQuery(Query *parsetree, List *rewrite_events, int orig_rt_length, * We can use NoLock here since either the parser or * AcquireRewriteLocks should have locked the rel already. */ - rt_entry_relation = table_open(rt_entry->relid, NoLock); + rt_entry_relation = relation_open(rt_entry->relid, NoLock); /* * Rewrite the targetlist as needed for the command type. diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index 119f31b5d6584..b332e002ba13b 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -25,21 +25,26 @@ that might need to do such a wait is instead handled by waiting to obtain the relation-level lock, which is why you'd better hold one first.) Pins may not be held across transaction boundaries, however. -Buffer content locks: there are two kinds of buffer lock, shared and exclusive, -which act just as you'd expect: multiple backends can hold shared locks on -the same buffer, but an exclusive lock prevents anyone else from holding -either shared or exclusive lock. (These can alternatively be called READ -and WRITE locks.) These locks are intended to be short-term: they should not -be held for long. Buffer locks are acquired and released by LockBuffer(). -It will *not* work for a single backend to try to acquire multiple locks on -the same buffer. One must pin a buffer before trying to lock it. 
+Buffer content locks: there are three kinds of buffer lock, shared, +share-exclusive and exclusive: +a) multiple backends can hold shared locks on the same buffer + (alternatively called a READ lock) +b) one backend can hold a share-exclusive lock on a buffer while multiple + backends can hold a share lock +c) an exclusive lock prevents anyone else from holding a shared, + share-exclusive or exclusive lock. + (alternatively called a WRITE lock) + +These locks are intended to be short-term: they should not be held for long. +Buffer locks are acquired and released by LockBuffer(). It will *not* work +for a single backend to try to acquire multiple locks on the same buffer. One +must pin a buffer before trying to lock it. Buffer access rules: -1. To scan a page for tuples, one must hold a pin and either shared or -exclusive content lock. To examine the commit status (XIDs and status bits) -of a tuple in a shared buffer, one must likewise hold a pin and either shared -or exclusive lock. +1. To scan a page for tuples, one must hold a pin and at least a share lock. +To examine the commit status (XIDs and status bits) of a tuple in a shared +buffer, one must likewise hold a pin and at least a share lock. 2. Once one has determined that a tuple is interesting (visible to the current transaction) one may drop the content lock, yet continue to access @@ -55,19 +60,25 @@ one must hold a pin and an exclusive content lock on the containing buffer. This ensures that no one else might see a partially-updated state of the tuple while they are doing visibility checks. -4. It is considered OK to update tuple commit status bits (ie, OR the -values HEAP_XMIN_COMMITTED, HEAP_XMIN_INVALID, HEAP_XMAX_COMMITTED, or -HEAP_XMAX_INVALID into t_infomask) while holding only a shared lock and -pin on a buffer. 
This is OK because another backend looking at the tuple -at about the same time would OR the same bits into the field, so there -is little or no risk of conflicting update; what's more, if there did -manage to be a conflict it would merely mean that one bit-update would -be lost and need to be done again later. These four bits are only hints -(they cache the results of transaction status lookups in pg_xact), so no -great harm is done if they get reset to zero by conflicting updates. -Note, however, that a tuple is frozen by setting both HEAP_XMIN_INVALID -and HEAP_XMIN_COMMITTED; this is a critical update and accordingly requires -an exclusive buffer lock (and it must also be WAL-logged). +4. Non-critical information on a page ("hint bits") may be modified while +holding only a share-exclusive lock and pin on the page. To do so in cases +where only a share lock is already held, use BufferBeginSetHintBits() & +BufferFinishSetHintBits() (if multiple hint bits are to be set) or +BufferSetHintBits16() (if a single hint bit is set). + +E.g. for heapam, a share-exclusive lock allows to update tuple commit status +bits (ie, OR the values HEAP_XMIN_COMMITTED, HEAP_XMIN_INVALID, +HEAP_XMAX_COMMITTED, or HEAP_XMAX_INVALID into t_infomask) while holding only +a share-exclusive lock and pin on a buffer. This is OK because another +backend looking at the tuple at about the same time would OR the same bits +into the field, so there is little or no risk of conflicting update; what's +more, if there did manage to be a conflict it would merely mean that one +bit-update would be lost and need to be done again later. These four bits are +only hints (they cache the results of transaction status lookups in pg_xact), +so no great harm is done if they get reset to zero by conflicting updates. 
+Note, however, that a tuple is frozen by setting both HEAP_XMIN_INVALID and +HEAP_XMIN_COMMITTED; this is a critical update and accordingly requires an +exclusive buffer lock (and it must also be WAL-logged). 5. To physically remove a tuple or compact free space on a page, one must hold a pin and an exclusive lock, *and* observe while holding the @@ -80,7 +91,6 @@ buffer (increment the refcount) while one is performing the cleanup, but it won't be able to actually examine the page until it acquires shared or exclusive content lock. - Obtaining the lock needed under rule #5 is done by the bufmgr routines LockBufferForCleanup() or ConditionalLockBufferForCleanup(). They first get an exclusive lock and then check to see if the shared pin count is currently @@ -96,6 +106,10 @@ VACUUM's use, since we don't allow multiple VACUUMs concurrently on a single relation anyway. Anyone wishing to obtain a cleanup lock outside of recovery or a VACUUM must use the conditional variant of the function. +6. To write out a buffer, a share-exclusive lock needs to be held. This +prevents the buffer from being modified while written out, which could corrupt +checksums and cause issues on the OS or device level when direct-IO is used. + Buffer Manager's Internal Locking --------------------------------- diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 5f3d083e93886..0546ee0193ce8 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2481,10 +2481,10 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) /* * If the buffer was dirty, try to write it out. There is a race - * condition here, in that someone might dirty it after we released the - * buffer header lock above, or even while we are writing it out (since - * our share-lock won't prevent hint-bit updates). We will recheck the - * dirty bit after re-locking the buffer header. 
+ * condition here, another backend could dirty the buffer between + * StrategyGetBuffer() checking that it is not in use and invalidating the + * buffer below. That's addressed by InvalidateVictimBuffer() verifying + * that the buffer is not dirty. */ if (buf_state & BM_DIRTY) { @@ -2492,20 +2492,20 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) Assert(buf_state & BM_VALID); /* - * We need a share-lock on the buffer contents to write it out (else - * we might write invalid data, eg because someone else is compacting - * the page contents while we write). We must use a conditional lock - * acquisition here to avoid deadlock. Even though the buffer was not - * pinned (and therefore surely not locked) when StrategyGetBuffer - * returned it, someone else could have pinned and exclusive-locked it - * by the time we get here. If we try to get the lock unconditionally, - * we'd block waiting for them; if they later block waiting for us, - * deadlock ensues. (This has been observed to happen when two - * backends are both trying to split btree index pages, and the second - * one just happens to be trying to split the page the first one got - * from StrategyGetBuffer.) + * We need a share-exclusive lock on the buffer contents to write it + * out (else we might write invalid data, eg because someone else is + * compacting the page contents while we write). We must use a + * conditional lock acquisition here to avoid deadlock. Even though + * the buffer was not pinned (and therefore surely not locked) when + * StrategyGetBuffer returned it, someone else could have pinned and + * (share-)exclusive-locked it by the time we get here. If we try to + * get the lock unconditionally, we'd block waiting for them; if they + * later block waiting for us, deadlock ensues. 
(This has been + * observed to happen when two backends are both trying to split btree + * index pages, and the second one just happens to be trying to split + * the page the first one got from StrategyGetBuffer.) */ - if (!BufferLockConditional(buf, buf_hdr, BUFFER_LOCK_SHARE)) + if (!BufferLockConditional(buf, buf_hdr, BUFFER_LOCK_SHARE_EXCLUSIVE)) { /* * Someone else has locked the buffer, so give it up and loop back @@ -2518,18 +2518,14 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) /* * If using a nondefault strategy, and writing the buffer would * require a WAL flush, let the strategy decide whether to go ahead - * and write/reuse the buffer or to choose another victim. We need a - * lock to inspect the page LSN, so this can't be done inside + * and write/reuse the buffer or to choose another victim. We need to + * hold the content lock in at least share-exclusive mode to safely + * inspect the page LSN, so this couldn't have been done inside * StrategyGetBuffer. */ if (strategy != NULL) { - XLogRecPtr lsn; - - /* Read the LSN while holding buffer header lock */ - buf_state = LockBufHdr(buf_hdr); - lsn = BufferGetLSN(buf_hdr); - UnlockBufHdr(buf_hdr); + XLogRecPtr lsn = BufferGetLSN(buf_hdr); if (XLogNeedsFlush(lsn) && StrategyRejectBuffer(strategy, buf_hdr, from_ring)) @@ -3019,7 +3015,7 @@ BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode) * * Checks if buffer is already dirty. * - * Buffer must be pinned and exclusive-locked. (Without an exclusive lock, + * Buffer must be pinned and [share-]exclusive-locked. (Without such a lock, * the result may be stale before it's returned.) 
*/ bool @@ -3039,7 +3035,8 @@ BufferIsDirty(Buffer buffer) else { bufHdr = GetBufferDescriptor(buffer - 1); - Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE)); + Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_SHARE_EXCLUSIVE) || + BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE)); } return pg_atomic_read_u64(&bufHdr->state) & BM_DIRTY; @@ -4074,8 +4071,8 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) } /* - * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the - * buffer is clean by the time we've locked it.) + * Pin it, share-exclusive-lock it, write it. (FlushBuffer will do + * nothing if the buffer is clean by the time we've locked it.) */ PinBuffer_Locked(bufHdr); @@ -4405,11 +4402,8 @@ BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, * However, we will need to force the changes to disk via fsync before * we can checkpoint WAL. * - * The caller must hold a pin on the buffer and have share-locked the - * buffer contents. (Note: a share-lock does not prevent updates of - * hint bits in the buffer, so the page could change while the write - * is in progress, but we assume that that will not invalidate the data - * written.) + * The caller must hold a pin on the buffer and have + * (share-)exclusively-locked the buffer contents. * * If the caller has an smgr reference for the buffer's relation, pass it * as the second parameter. If not, pass NULL. @@ -4425,6 +4419,9 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, char *bufToWrite; uint64 buf_state; + Assert(BufferLockHeldByMeInMode(buf, BUFFER_LOCK_EXCLUSIVE) || + BufferLockHeldByMeInMode(buf, BUFFER_LOCK_SHARE_EXCLUSIVE)); + /* * Try to start an I/O operation. 
If StartBufferIO returns false, then * someone else flushed the buffer before we could, so we need not do @@ -4452,8 +4449,8 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, buf_state = LockBufHdr(buf); /* - * Run PageGetLSN while holding header lock, since we don't have the - * buffer locked exclusively in all cases. + * As we hold at least a share-exclusive lock on the buffer, the LSN + * cannot change during the flush (and thus can't be torn). */ recptr = BufferGetLSN(buf); @@ -4557,7 +4554,7 @@ FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln, { Buffer buffer = BufferDescriptorGetBuffer(buf); - BufferLockAcquire(buffer, buf, BUFFER_LOCK_SHARE); + BufferLockAcquire(buffer, buf, BUFFER_LOCK_SHARE_EXCLUSIVE); FlushBuffer(buf, reln, IOOBJECT_RELATION, IOCONTEXT_NORMAL); BufferLockUnlock(buffer, buf); } @@ -4629,8 +4626,9 @@ BufferIsPermanent(Buffer buffer) /* * BufferGetLSNAtomic * Retrieves the LSN of the buffer atomically using a buffer header lock. - * This is necessary for some callers who may not have an exclusive lock - * on the buffer. + * This is necessary for some callers who may only hold a share lock on + * the buffer. A share lock allows a concurrent backend to set hint bits + * on the page, which in turn may require a WAL record to be emitted. */ XLogRecPtr BufferGetLSNAtomic(Buffer buffer) @@ -5476,8 +5474,8 @@ FlushDatabaseBuffers(Oid dbid) } /* - * Flush a previously, shared or exclusively, locked and pinned buffer to the - * OS. + * Flush a previously, share-exclusively or exclusively, locked and pinned + * buffer to the OS. */ void FlushOneBuffer(Buffer buffer) @@ -5550,56 +5548,38 @@ IncrBufferRefCount(Buffer buffer) } /* - * MarkBufferDirtyHint - * - * Mark a buffer dirty for non-critical changes. + * Shared-buffer only helper for MarkBufferDirtyHint() and + * BufferSetHintBits16(). * - * This is essentially the same as MarkBufferDirty, except: - * - * 1. 
The caller does not write WAL; so if checksums are enabled, we may need - * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages. - * 2. The caller might have only share-lock instead of exclusive-lock on the - * buffer's content lock. - * 3. This function does not guarantee that the buffer is always marked dirty - * (due to a race condition), so it cannot be used for important changes. + * This is separated out because it turns out that the repeated checks for + * local buffers, repeated GetBufferDescriptor() and repeated reading of the + * buffer's state sufficiently hurts the performance of BufferSetHintBits16(). */ -void -MarkBufferDirtyHint(Buffer buffer, bool buffer_std) +static inline void +MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, + bool buffer_std) { - BufferDesc *bufHdr; Page page = BufferGetPage(buffer); - if (!BufferIsValid(buffer)) - elog(ERROR, "bad buffer ID: %d", buffer); - - if (BufferIsLocal(buffer)) - { - MarkLocalBufferDirty(buffer); - return; - } - - bufHdr = GetBufferDescriptor(buffer - 1); - Assert(GetPrivateRefCount(buffer) > 0); - /* here, either share or exclusive lock is OK */ - Assert(BufferIsLockedByMe(buffer)); + + /* here, either share-exclusive or exclusive lock is OK */ + Assert(BufferLockHeldByMeInMode(bufHdr, BUFFER_LOCK_EXCLUSIVE) || + BufferLockHeldByMeInMode(bufHdr, BUFFER_LOCK_SHARE_EXCLUSIVE)); /* * This routine might get called many times on the same page, if we are * making the first scan after commit of an xact that added/deleted many - * tuples. So, be as quick as we can if the buffer is already dirty. We - * do this by not acquiring spinlock if it looks like the status bits are - * already set. Since we make this test unlocked, there's a chance we - * might fail to notice that the flags have just been cleared, and failed - * to reset them, due to memory-ordering issues. 
But since this function - * is only intended to be used in cases where failing to write out the - * data would be harmless anyway, it doesn't really matter. + * tuples. So, be as quick as we can if the buffer is already dirty. + * + * As we are holding (at least) a share-exclusive lock, nobody could have + * cleaned or dirtied the page concurrently, so we can just rely on the + * previously fetched value here without any danger of races. */ - if ((pg_atomic_read_u64(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) != - (BM_DIRTY | BM_JUST_DIRTIED)) + if (unlikely((lockstate & (BM_DIRTY | BM_JUST_DIRTIED)) != + (BM_DIRTY | BM_JUST_DIRTIED))) { XLogRecPtr lsn = InvalidXLogRecPtr; - bool dirtied = false; bool delayChkptFlags = false; uint64 buf_state; @@ -5612,8 +5592,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) * We don't check full_page_writes here because that logic is included * when we call XLogInsert() since the value changes dynamically. */ - if (XLogHintBitIsNeeded() && - (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT)) + if (XLogHintBitIsNeeded() && (lockstate & BM_PERMANENT)) { /* * If we must not write WAL, due to a relfilelocator-specific @@ -5658,27 +5637,29 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) buf_state = LockBufHdr(bufHdr); + /* + * It should not be possible for the buffer to already be dirty, see + * comment above. + */ + Assert(!(buf_state & BM_DIRTY)); Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); - if (!(buf_state & BM_DIRTY)) + if (XLogRecPtrIsValid(lsn)) { - dirtied = true; /* Means "will be dirtied by this action" */ - /* - * Set the page LSN if we wrote a backup block. We aren't supposed - * to set this when only holding a share lock but as long as we - * serialise it somehow we're OK. 
We choose to set LSN while - * holding the buffer header lock, which causes any reader of an - * LSN who holds only a share lock to also obtain a buffer header - * lock before using PageGetLSN(), which is enforced in - * BufferGetLSNAtomic(). + * Set the page LSN if we wrote a backup block. To allow backends + * that only hold a share lock on the buffer to read the LSN in a + * tear-free manner, we set the page LSN while holding the buffer + * header lock. This allows any reader of an LSN who holds only a + * share lock to also obtain a buffer header lock before using + * PageGetLSN() to read the LSN in a tear free way. This is done + * in BufferGetLSNAtomic(). * * If checksums are enabled, you might think we should reset the * checksum here. That will happen when the page is written * sometime later in this checkpoint cycle. */ - if (XLogRecPtrIsValid(lsn)) - PageSetLSN(page, lsn); + PageSetLSN(page, lsn); } UnlockBufHdrExt(bufHdr, buf_state, @@ -5688,15 +5669,48 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) if (delayChkptFlags) MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; - if (dirtied) - { - pgBufferUsage.shared_blks_dirtied++; - if (VacuumCostActive) - VacuumCostBalance += VacuumCostPageDirty; - } + pgBufferUsage.shared_blks_dirtied++; + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageDirty; } } +/* + * MarkBufferDirtyHint + * + * Mark a buffer dirty for non-critical changes. + * + * This is essentially the same as MarkBufferDirty, except: + * + * 1. The caller does not write WAL; so if checksums are enabled, we may need + * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages. + * 2. The caller might have only a share-exclusive-lock instead of an + * exclusive-lock on the buffer's content lock. + * 3. This function does not guarantee that the buffer is always marked dirty + * (it e.g. can't always on a hot standby), so it cannot be used for + * important changes. 
+ */ +inline void +MarkBufferDirtyHint(Buffer buffer, bool buffer_std) +{ + BufferDesc *bufHdr; + + if (!BufferIsValid(buffer)) + elog(ERROR, "bad buffer ID: %d", buffer); + + if (BufferIsLocal(buffer)) + { + MarkLocalBufferDirty(buffer); + return; + } + + bufHdr = GetBufferDescriptor(buffer - 1); + + MarkSharedBufferDirtyHint(buffer, bufHdr, + pg_atomic_read_u64(&bufHdr->state), + buffer_std); +} + /* * Release buffer content locks for shared buffers. * @@ -6798,6 +6812,192 @@ IsBufferCleanupOK(Buffer buffer) return false; } +/* + * Helper for BufferBeginSetHintBits() and BufferSetHintBits16(). + * + * This checks if the current lock mode already suffices to allow hint bits + * being set and, if not, whether the current lock can be upgraded. + * + * Updates *lockstate when returning true. + */ +static inline bool +SharedBufferBeginSetHintBits(Buffer buffer, BufferDesc *buf_hdr, uint64 *lockstate) +{ + uint64 old_state; + PrivateRefCountEntry *ref; + BufferLockMode mode; + + ref = GetPrivateRefCountEntry(buffer, true); + + if (ref == NULL) + elog(ERROR, "buffer is not pinned"); + + mode = ref->data.lockmode; + if (mode == BUFFER_LOCK_UNLOCK) + elog(ERROR, "buffer is not locked"); + + /* we're done if we are already holding a sufficient lock level */ + if (mode == BUFFER_LOCK_EXCLUSIVE || mode == BUFFER_LOCK_SHARE_EXCLUSIVE) + { + *lockstate = pg_atomic_read_u64(&buf_hdr->state); + return true; + } + + /* + * We are only holding a share lock right now, try to upgrade it to + * SHARE_EXCLUSIVE. + */ + Assert(mode == BUFFER_LOCK_SHARE); + + old_state = pg_atomic_read_u64(&buf_hdr->state); + while (true) + { + uint64 desired_state; + + desired_state = old_state; + + /* + * Can't upgrade if somebody else holds the lock in exclusive or + * share-exclusive mode.
+ */ + if (unlikely((old_state & (BM_LOCK_VAL_EXCLUSIVE | BM_LOCK_VAL_SHARE_EXCLUSIVE)) != 0)) + { + return false; + } + + /* currently held lock state */ + desired_state -= BM_LOCK_VAL_SHARED; + + /* new lock level */ + desired_state += BM_LOCK_VAL_SHARE_EXCLUSIVE; + + if (likely(pg_atomic_compare_exchange_u64(&buf_hdr->state, + &old_state, desired_state))) + { + ref->data.lockmode = BUFFER_LOCK_SHARE_EXCLUSIVE; + *lockstate = desired_state; + + return true; + } + } +} + +/* + * Try to acquire the right to set hint bits on the buffer. + * + * To be allowed to set hint bits, this backend needs to hold either a + * share-exclusive or an exclusive lock. In case this backend only holds a + * share lock, this function will try to upgrade the lock to + * share-exclusive. The caller is only allowed to set hint bits if true is + * returned. + * + * Once BufferBeginSetHintBits() has returned true, hint bits may be set + * without further calls to BufferBeginSetHintBits(), until the buffer is + * unlocked. + * + * + * Requiring a share-exclusive lock to set hint bits prevents setting hint + * bits on buffers that are currently being written out, which could corrupt + * the checksum on the page. Flushing buffers also requires a share-exclusive + * lock. + * + * Due to a lock >= share-exclusive being required to set hint bits, only one + * backend can set hint bits at a time. Allowing multiple backends to set hint + * bits would require more complicated locking: For setting hint bits we'd + * need to store the count of backends currently setting hint bits, for I/O we + * would need another lock-level conflicting with the hint-setting + * lock-level. Given that the share-exclusive lock for setting hint bits is + * only held for a short time, that backends often would just set the same + * hint bits and that the cost of occasionally not setting hint bits in hotly + * accessed pages is fairly low, this seems like an acceptable tradeoff. 
+ */ +bool +BufferBeginSetHintBits(Buffer buffer) +{ + BufferDesc *buf_hdr; + uint64 lockstate; + + if (BufferIsLocal(buffer)) + { + /* + * NB: Will need to check if there is a write in progress, once it is + * possible for writes to be done asynchronously. + */ + return true; + } + + buf_hdr = GetBufferDescriptor(buffer - 1); + + return SharedBufferBeginSetHintBits(buffer, buf_hdr, &lockstate); +} + +/* + * End a phase of setting hint bits on this buffer, started with + * BufferBeginSetHintBits(). + * + * This would strictly speaking not be required (i.e. the caller could do + * MarkBufferDirtyHint() if so desired), but allows us to perform some sanity + * checks. + */ +void +BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std) +{ + if (!BufferIsLocal(buffer)) + Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_SHARE_EXCLUSIVE) || + BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE)); + + if (mark_dirty) + MarkBufferDirtyHint(buffer, buffer_std); +} + +/* + * Try to set hint bits on a single 16bit value in a buffer. + * + * If hint bits are allowed to be set, set *ptr = val, try to mark the buffer + * dirty and return true. Otherwise false is returned. + * + * *ptr needs to be a pointer to memory within the buffer. + * + * This is a bit faster than BufferBeginSetHintBits() / + * BufferFinishSetHintBits() when setting hints once in a buffer, but slower + * than the former when setting hint bits multiple times in the same buffer. 
+ */ +bool +BufferSetHintBits16(uint16 *ptr, uint16 val, Buffer buffer) +{ + BufferDesc *buf_hdr; + uint64 lockstate; +#ifdef USE_ASSERT_CHECKING + char *page; + + /* verify that the address is on the page */ + page = BufferGetPage(buffer); + Assert((char *) ptr >= page && (char *) ptr < (page + BLCKSZ)); +#endif + + if (BufferIsLocal(buffer)) + { + *ptr = val; + + MarkLocalBufferDirty(buffer); + + return true; + } + + buf_hdr = GetBufferDescriptor(buffer - 1); + + if (SharedBufferBeginSetHintBits(buffer, buf_hdr, &lockstate)) + { + *ptr = val; + + MarkSharedBufferDirtyHint(buffer, buf_hdr, lockstate, true); + + return true; + } + + return false; +} + /* * Functions for buffer I/O handling diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index ad337c0087182..b9a8f368a6372 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -904,13 +904,17 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, max_avail = fsm_get_max_avail(page); /* - * Reset the next slot pointer. This encourages the use of low-numbered - * pages, increasing the chances that a later vacuum can truncate the - * relation. We don't bother with marking the page dirty if it wasn't - * already, since this is just a hint. + * Try to reset the next slot pointer. This encourages the use of + * low-numbered pages, increasing the chances that a later vacuum can + * truncate the relation. We don't bother with marking the page dirty if + * it wasn't already, since this is just a hint. 
*/ LockBuffer(buf, BUFFER_LOCK_SHARE); - ((FSMPage) PageGetContents(page))->fp_next_slot = 0; + if (BufferBeginSetHintBits(buf)) + { + ((FSMPage) PageGetContents(page))->fp_next_slot = 0; + BufferFinishSetHintBits(buf, false, false); + } LockBuffer(buf, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buf); diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c index 33ee825529ca0..a2657c4033b9b 100644 --- a/src/backend/storage/freespace/fsmpage.c +++ b/src/backend/storage/freespace/fsmpage.c @@ -298,9 +298,18 @@ fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext, * lock and get a garbled next pointer every now and then, than take the * concurrency hit of an exclusive lock. * + * Without an exclusive lock, we need to use the hint bit infrastructure + * to be allowed to modify the page. + * * Wrap-around is handled at the beginning of this function. */ - fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0); + if (exclusive_lock_held || BufferBeginSetHintBits(buf)) + { + fsmpage->fp_next_slot = slot + (advancenext ? 
1 : 0); + + if (!exclusive_lock_held) + BufferFinishSetHintBits(buf, false, false); + } return slot; } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 406b8253f8bbf..0f913897acc92 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -710,8 +710,6 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) /* be sure this is cleared in abort */ proc->delayChkptFlags = 0; - pg_atomic_write_u32(&proc->pendingRecoveryConflicts, 0); - /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ if (proc->statusFlags & PROC_VACUUM_STATE_MASK) @@ -752,8 +750,6 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) /* be sure this is cleared in abort */ proc->delayChkptFlags = 0; - pg_atomic_write_u32(&proc->pendingRecoveryConflicts, 0); - /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ if (proc->statusFlags & PROC_VACUUM_STATE_MASK) @@ -935,7 +931,6 @@ ProcArrayClearTransaction(PGPROC *proc) proc->vxid.lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; - pg_atomic_write_u32(&proc->pendingRecoveryConflicts, 0); Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK)); Assert(!proc->delayChkptFlags); @@ -3526,7 +3521,7 @@ SignalRecoveryConflictWithVirtualXID(VirtualTransactionId vxid, RecoveryConflict } /* - * SignalRecoveryConflictWithDatabase --- signal all backends specified database + * SignalRecoveryConflictWithDatabase -- signal backends using specified database * * Like SignalRecoveryConflict, but signals all backends using the database. 
*/ diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index daf70d9ce2a8f..d407725e6027e 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -689,6 +689,7 @@ InitAuxiliaryProcess(void) Assert(dlist_is_empty(&(MyProc->myProcLocks[i]))); } #endif + pg_atomic_write_u32(&MyProc->pendingRecoveryConflicts, 0); /* * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index bf707f2d57ffb..b4651a641318c 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -279,9 +279,9 @@ ClassifyUtilityCommandAsReadOnly(Node *parsetree) return COMMAND_OK_IN_RECOVERY | COMMAND_OK_IN_READ_ONLY_TXN; } - case T_ClusterStmt: case T_ReindexStmt: case T_VacuumStmt: + case T_RepackStmt: { /* * These commands write WAL, so they're not strictly @@ -290,9 +290,9 @@ ClassifyUtilityCommandAsReadOnly(Node *parsetree) * * However, they don't change the database state in a way that * would affect pg_dump output, so it's fine to run them in a - * read-only transaction. (CLUSTER might change the order of - * rows on disk, which could affect the ordering of pg_dump - * output, but that's not semantically significant.) + * read-only transaction. (REPACK/CLUSTER might change the + * order of rows on disk, which could affect the ordering of + * pg_dump output, but that's not semantically significant.) 
*/ return COMMAND_OK_IN_READ_ONLY_TXN; } @@ -856,14 +856,14 @@ standard_ProcessUtility(PlannedStmt *pstmt, ExecuteCallStmt(castNode(CallStmt, parsetree), params, isAtomicContext, dest); break; - case T_ClusterStmt: - cluster(pstate, (ClusterStmt *) parsetree, isTopLevel); - break; - case T_VacuumStmt: ExecVacuum(pstate, (VacuumStmt *) parsetree, isTopLevel); break; + case T_RepackStmt: + ExecRepack(pstate, (RepackStmt *) parsetree, isTopLevel); + break; + case T_ExplainStmt: ExplainQuery(pstate, (ExplainStmt *) parsetree, params, dest); break; @@ -2865,10 +2865,6 @@ CreateCommandTag(Node *parsetree) tag = CMDTAG_CALL; break; - case T_ClusterStmt: - tag = CMDTAG_CLUSTER; - break; - case T_VacuumStmt: if (((VacuumStmt *) parsetree)->is_vacuumcmd) tag = CMDTAG_VACUUM; @@ -2876,6 +2872,13 @@ CreateCommandTag(Node *parsetree) tag = CMDTAG_ANALYZE; break; + case T_RepackStmt: + if (((RepackStmt *) parsetree)->command == REPACK_COMMAND_CLUSTER) + tag = CMDTAG_CLUSTER; + else + tag = CMDTAG_REPACK; + break; + case T_ExplainStmt: tag = CMDTAG_EXPLAIN; break; @@ -3517,7 +3520,7 @@ GetCommandLogLevel(Node *parsetree) lev = LOGSTMT_ALL; break; - case T_ClusterStmt: + case T_RepackStmt: lev = LOGSTMT_DDL; break; diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 50ea9e8fb83a6..5ac022274a738 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -288,8 +288,8 @@ pg_stat_get_progress_info(PG_FUNCTION_ARGS) cmdtype = PROGRESS_COMMAND_VACUUM; else if (pg_strcasecmp(cmd, "ANALYZE") == 0) cmdtype = PROGRESS_COMMAND_ANALYZE; - else if (pg_strcasecmp(cmd, "CLUSTER") == 0) - cmdtype = PROGRESS_COMMAND_CLUSTER; + else if (pg_strcasecmp(cmd, "REPACK") == 0) + cmdtype = PROGRESS_COMMAND_REPACK; else if (pg_strcasecmp(cmd, "CREATE INDEX") == 0) cmdtype = PROGRESS_COMMAND_CREATE_INDEX; else if (pg_strcasecmp(cmd, "BASEBACKUP") == 0) diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index 
2c8d5a81b751d..79f6cf7b4fa76 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -529,7 +529,7 @@ xmltext(PG_FUNCTION_ARGS) #ifdef USE_LIBXML text *arg = PG_GETARG_TEXT_PP(0); text *result; - volatile xmlChar *xmlbuf = NULL; + xmlChar *volatile xmlbuf = NULL; PgXmlErrorContext *xmlerrcxt; /* First we gotta spin up some error handling. */ @@ -544,19 +544,19 @@ xmltext(PG_FUNCTION_ARGS) "could not allocate xmlChar"); result = cstring_to_text_with_len((const char *) xmlbuf, - xmlStrlen((const xmlChar *) xmlbuf)); + xmlStrlen(xmlbuf)); } PG_CATCH(); { if (xmlbuf) - xmlFree((xmlChar *) xmlbuf); + xmlFree(xmlbuf); pg_xml_done(xmlerrcxt, true); PG_RE_THROW(); } PG_END_TRY(); - xmlFree((xmlChar *) xmlbuf); + xmlFree(xmlbuf); pg_xml_done(xmlerrcxt, false); PG_RETURN_XML_P(result); @@ -4247,7 +4247,7 @@ xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt) } else { - volatile xmlChar *str = NULL; + xmlChar *volatile str = NULL; PG_TRY(); { @@ -4267,7 +4267,7 @@ xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt) PG_FINALLY(); { if (str) - xmlFree((xmlChar *) str); + xmlFree(str); } PG_END_TRY(); } diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 5ee84a639d828..18cd9a0fafdd6 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -806,7 +806,7 @@ }, { name => 'effective_wal_level', type => 'enum', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', - short_desc => 'Show effective WAL level.', + short_desc => 'Shows effective WAL level.', flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', variable => 'effective_wal_level', boot_val => 'WAL_LEVEL_REPLICA', @@ -2988,6 +2988,17 @@ assign_hook => 'assign_timezone_abbreviations', }, +{ name => 'timing_clock_source', type => 'enum', context => 'PGC_USERSET', group => 'RESOURCES_TIME', + short_desc => 'Controls the clock source used for collecting timing measurements.', + 
long_desc => 'This enables the use of specialized clock sources, specifically the RDTSC clock source on x86-64 systems (if available), to support timing measurements with lower overhead during EXPLAIN and other instrumentation.', + variable => 'timing_clock_source', + boot_val => 'TIMING_CLOCK_SOURCE_AUTO', + options => 'timing_clock_source_options', + check_hook => 'check_timing_clock_source', + assign_hook => 'assign_timing_clock_source', + show_hook => 'show_timing_clock_source', +}, + { name => 'trace_connection_negotiation', type => 'bool', context => 'PGC_POSTMASTER', group => 'DEVELOPER_OPTIONS', short_desc => 'Logs details of pre-authentication connection handshake.', flags => 'GUC_NOT_IN_SAMPLE', diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 38aaf82f12094..b8bb9590d9c6c 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -92,6 +92,7 @@ #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" #include "utils/builtins.h" +#include "portability/instr_time.h" #include "utils/bytea.h" #include "utils/float.h" #include "utils/guc_hooks.h" @@ -373,6 +374,15 @@ static const struct config_enum_entry huge_pages_options[] = { {NULL, 0, false} }; +static const struct config_enum_entry timing_clock_source_options[] = { + {"auto", TIMING_CLOCK_SOURCE_AUTO, false}, + {"system", TIMING_CLOCK_SOURCE_SYSTEM, false}, +#if PG_INSTR_TSC_CLOCK + {"tsc", TIMING_CLOCK_SOURCE_TSC, false}, +#endif + {NULL, 0, false} +}; + static const struct config_enum_entry huge_pages_status_options[] = { {"off", HUGE_PAGES_OFF, false}, {"on", HUGE_PAGES_ON, false}, @@ -723,6 +733,7 @@ const char *const config_group_names[] = [CONN_AUTH_TCP] = gettext_noop("Connections and Authentication / TCP Settings"), [CONN_AUTH_AUTH] = gettext_noop("Connections and Authentication / Authentication"), [CONN_AUTH_SSL] = gettext_noop("Connections and Authentication / SSL"), + [RESOURCES_TIME] = gettext_noop("Resource Usage / 
Time"), [RESOURCES_MEM] = gettext_noop("Resource Usage / Memory"), [RESOURCES_DISK] = gettext_noop("Resource Usage / Disk"), [RESOURCES_KERNEL] = gettext_noop("Resource Usage / Kernel Resources"), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index e686d88afc427..3cbe96b96edd5 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -193,6 +193,10 @@ #max_files_per_process = 1000 # min 64 # (change requires restart) +# - Time - + +#timing_clock_source = auto # auto, system, tsc (if supported) + # - Background Writer - #bgwriter_delay = 200ms # 10-10000ms between rounds diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index ce9ede4c19697..4b4f1e1965ba3 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -2196,6 +2196,8 @@ make_new_segment(dsa_area *area, size_t requested_pages) /* See if that is enough... */ if (requested_pages > usable_pages) { + size_t total_requested_pages PG_USED_FOR_ASSERTS_ONLY; + /* * We'll make an odd-sized segment, working forward from the requested * number of pages. @@ -2206,10 +2208,37 @@ make_new_segment(dsa_area *area, size_t requested_pages) MAXALIGN(sizeof(FreePageManager)) + usable_pages * sizeof(dsa_pointer); + /* + * We must also account for pagemap entries needed to cover the + * metadata pages themselves. The pagemap must track all pages in the + * segment, including the pages occupied by metadata. + * + * This formula uses integer ceiling division to compute the exact + * number of additional entries needed. The divisor (FPM_PAGE_SIZE - + * sizeof(dsa_pointer)) accounts for the fact that each metadata page + * consumes one pagemap entry of sizeof(dsa_pointer) bytes, leaving + * only (FPM_PAGE_SIZE - sizeof(dsa_pointer)) net bytes per metadata + * page. 
+ */ + metadata_bytes += + ((metadata_bytes + (FPM_PAGE_SIZE - sizeof(dsa_pointer)) - 1) / + (FPM_PAGE_SIZE - sizeof(dsa_pointer))) * + sizeof(dsa_pointer); + /* Add padding up to next page boundary. */ if (metadata_bytes % FPM_PAGE_SIZE != 0) metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE); total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE; + total_requested_pages = total_size / FPM_PAGE_SIZE; + + /* + * Verify that we allocated enough pagemap entries for metadata and + * usable pages. This reverse-engineers the new calculation of + * "metadata_bytes" done based on the new "requested_pages" for an + * odd-sized segment. + */ + Assert((metadata_bytes - MAXALIGN(sizeof(dsa_segment_header)) - + MAXALIGN(sizeof(FreePageManager))) / sizeof(dsa_pointer) >= total_requested_pages); /* Is that too large for dsa_pointer's addressing scheme? */ if (total_size > DSA_MAX_SEGMENT_SIZE) diff --git a/src/bin/pg_combinebackup/t/011_ib_truncation.pl b/src/bin/pg_combinebackup/t/011_ib_truncation.pl index 47d84434452fb..c5e0124c04deb 100644 --- a/src/bin/pg_combinebackup/t/011_ib_truncation.pl +++ b/src/bin/pg_combinebackup/t/011_ib_truncation.pl @@ -1,7 +1,8 @@ # Copyright (c) 2025-2026, PostgreSQL Global Development Group # -# This test aims to validate that the calculated truncation block never exceeds -# the segment size. +# This test aims to validate two things: (1) that the calculated truncation +# block never exceeds the segment size and (2) that the correct limit block +# length is calculated for the VM fork. 
use strict; use warnings FATAL => 'all'; @@ -39,7 +40,7 @@ CREATE TABLE t ( id int, data text STORAGE PLAIN - ); + ) WITH (autovacuum_enabled = false); }); # The tuple size should be enough to prevent two tuples from being on the same @@ -83,6 +84,23 @@ $primary->backup('incr', backup_options => [ '--incremental', "$full_backup/backup_manifest" ]); +# We used to have a bug where the wrong limit block was calculated for the +# VM fork, so verify that the WAL summary records the correct VM fork +# truncation limit. We can't just check whether the restored VM fork is +# the right size on disk, because it's so small that the incremental backup +# code will send the entire file. +my $relfilenode = $primary->safe_psql('postgres', + "SELECT pg_relation_filenode('t');"); +my $vm_limits = $primary->safe_psql('postgres', + "SELECT string_agg(relblocknumber::text, ',') + FROM pg_available_wal_summaries() s, + pg_wal_summary_contents(s.tli, s.start_lsn, s.end_lsn) c + WHERE c.relfilenode = $relfilenode + AND c.relforknumber = 2 + AND c.is_limit_block;"); +is($vm_limits, '1', + 'WAL summary has correct VM fork truncation limit'); + # Combine full and incremental backups. Before the fix, this failed because # the INCREMENTAL file header contained an incorrect truncation_block value. 
my $restored = PostgreSQL::Test::Cluster->new('node2'); diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 8bde1b382defd..137161aa5e059 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -826,52 +826,39 @@ main(int argc, char **argv) if (dopt.column_inserts && dopt.dump_inserts == 0) dopt.dump_inserts = DUMP_DEFAULT_ROWS_PER_INSERT; - /* reject conflicting "-only" options */ - if (data_only && schema_only) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "-a/--data-only"); - if (schema_only && statistics_only) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "--statistics-only"); - if (data_only && statistics_only) - pg_fatal("options %s and %s cannot be used together", - "-a/--data-only", "--statistics-only"); - - /* reject conflicting "-only" and "no-" options */ - if (data_only && no_data) - pg_fatal("options %s and %s cannot be used together", - "-a/--data-only", "--no-data"); - if (schema_only && no_schema) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "--no-schema"); - if (statistics_only && no_statistics) - pg_fatal("options %s and %s cannot be used together", - "--statistics-only", "--no-statistics"); - - /* reject conflicting "no-" options */ - if (with_statistics && no_statistics) - pg_fatal("options %s and %s cannot be used together", - "--statistics", "--no-statistics"); - - /* reject conflicting "-only" options */ - if (data_only && with_statistics) - pg_fatal("options %s and %s cannot be used together", - "-a/--data-only", "--statistics"); - if (schema_only && with_statistics) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "--statistics"); - - if (schema_only && foreign_servers_include_patterns.head != NULL) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "--include-foreign-data"); + /* *-only options are incompatible with each other */ + 
check_mut_excl_opts(data_only, "-a/--data-only", + schema_only, "-s/--schema-only", + statistics_only, "--statistics-only"); + + /* --no-* and *-only for same thing are incompatible */ + check_mut_excl_opts(data_only, "-a/--data-only", + no_data, "--no-data"); + check_mut_excl_opts(schema_only, "-s/--schema-only", + no_schema, "--no-schema"); + check_mut_excl_opts(statistics_only, "--statistics-only", + no_statistics, "--no-statistics"); + + /* --statistics and --no-statistics are incompatible */ + check_mut_excl_opts(with_statistics, "--statistics", + no_statistics, "--no-statistics"); + + /* --statistics is incompatible with *-only (except --statistics-only) */ + check_mut_excl_opts(with_statistics, "--statistics", + data_only, "-a/--data-only", + schema_only, "-s/--schema-only"); + + /* --include-foreign-data is incompatible with --schema-only */ + check_mut_excl_opts(foreign_servers_include_patterns.head, "--include-foreign-data", + schema_only, "-s/--schema-only"); if (numWorkers > 1 && foreign_servers_include_patterns.head != NULL) pg_fatal("option %s is not supported with parallel backup", "--include-foreign-data"); - if (data_only && dopt.outputClean) - pg_fatal("options %s and %s cannot be used together", - "-c/--clean", "-a/--data-only"); + /* --clean is incompatible with --data-only */ + check_mut_excl_opts(dopt.outputClean, "-c/--clean", + data_only, "-a/--data-only"); if (dopt.if_exists && !dopt.outputClean) pg_fatal("option %s requires option %s", diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index 4ded902095288..3d2a1d27aefdd 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -421,7 +421,7 @@ main(int argc, char *argv[]) exit_nicely(1); } - /* --exclude_database is incompatible with global *-only options */ + /* --exclude-database is incompatible with global *-only options */ check_mut_excl_opts(database_exclude_patterns.head, "--exclude-database", globals_only, "-g/--globals-only", roles_only, 
"-r/--roles-only", @@ -1833,7 +1833,6 @@ dropDBs(PGconn *conn) for (i = 0; i < PQntuples(res); i++) { char *dbname = PQgetvalue(res, i, 0); - PQExpBuffer delQry = createPQExpBuffer(); /* * Skip "postgres" and "template1"; dumpDatabases() will deal with @@ -1846,15 +1845,14 @@ dropDBs(PGconn *conn) { if (archDumpFormat == archNull) { - appendPQExpBuffer(delQry, "DROP DATABASE %s%s;\n", - if_exists ? "IF EXISTS " : "", - fmtId(dbname)); - fprintf(OPF, "%s", delQry->data); + fprintf(OPF, "DROP DATABASE %s%s;\n", + if_exists ? "IF EXISTS " : "", + fmtId(dbname)); } else { - appendPQExpBuffer(delQry, "DROP DATABASE IF EXISTS %s;\n", - fmtId(dbname)); + char *stmt = psprintf("DROP DATABASE IF EXISTS %s;\n", + fmtId(dbname)); ArchiveEntry(fout, nilCatalogId, /* catalog ID */ @@ -1862,10 +1860,9 @@ dropDBs(PGconn *conn) ARCHIVE_OPTS(.tag = psprintf("DATABASE %s", fmtId(dbname)), .description = "DROP_GLOBAL", .section = SECTION_PRE_DATA, - .createStmt = delQry->data)); + .createStmt = stmt)); + pg_free(stmt); } - - destroyPQExpBuffer(delQry); } } diff --git a/src/bin/pg_dump/pg_restore.c b/src/bin/pg_dump/pg_restore.c index 752d859e264b8..fb44c0cfdfe4a 100644 --- a/src/bin/pg_dump/pg_restore.c +++ b/src/bin/pg_dump/pg_restore.c @@ -385,31 +385,20 @@ main(int argc, char **argv) if (!opts->cparams.dbname && !opts->filename && !opts->tocSummary) pg_fatal("one of -d/--dbname and -f/--file must be specified"); - if (db_exclude_patterns.head != NULL && globals_only) - { - pg_log_error("option %s cannot be used together with %s", - "--exclude-database", "-g/--globals-only"); - pg_log_error_hint("Try \"%s --help\" for more information.", progname); - exit_nicely(1); - } + /* --exclude-database and --globals-only are incompatible */ + check_mut_excl_opts(db_exclude_patterns.head, "--exclude-database", + globals_only, "-g/--globals-only"); /* Should get at most one of -d and -f, else user is confused */ - if (opts->cparams.dbname) - { - if (opts->filename) - { - pg_log_error("options 
%s and %s cannot be used together", - "-d/--dbname", "-f/--file"); - pg_log_error_hint("Try \"%s --help\" for more information.", progname); - exit_nicely(1); - } + check_mut_excl_opts(opts->cparams.dbname, "-d/--dbname", + opts->filename, "-f/--file"); - if (opts->restrict_key) - pg_fatal("options %s and %s cannot be used together", - "-d/--dbname", "--restrict-key"); + /* --dbname and --restrict-key are incompatible */ + check_mut_excl_opts(opts->cparams.dbname, "-d/--dbname", + opts->restrict_key, "--restrict-key"); + if (opts->cparams.dbname) opts->useDB = 1; - } else { /* @@ -423,85 +412,54 @@ main(int argc, char **argv) pg_fatal("invalid restrict key"); } - /* reject conflicting "-only" options */ - if (data_only && schema_only) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "-a/--data-only"); - if (schema_only && statistics_only) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "--statistics-only"); - if (data_only && statistics_only) - pg_fatal("options %s and %s cannot be used together", - "-a/--data-only", "--statistics-only"); - - /* reject conflicting "-only" and "no-" options */ - if (data_only && no_data) - pg_fatal("options %s and %s cannot be used together", - "-a/--data-only", "--no-data"); - if (schema_only && no_schema) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "--no-schema"); - if (statistics_only && no_statistics) - pg_fatal("options %s and %s cannot be used together", - "--statistics-only", "--no-statistics"); - - /* reject conflicting "no-" options */ - if (with_statistics && no_statistics) - pg_fatal("options %s and %s cannot be used together", - "--statistics", "--no-statistics"); - - /* reject conflicting "only-" options */ - if (data_only && with_statistics) - pg_fatal("options %s and %s cannot be used together", - "-a/--data-only", "--statistics"); - if (schema_only && with_statistics) - pg_fatal("options %s and %s cannot be used together", - 
"-s/--schema-only", "--statistics"); - - if (data_only && opts->dropSchema) - pg_fatal("options %s and %s cannot be used together", - "-c/--clean", "-a/--data-only"); - - if (opts->single_txn && opts->txn_size > 0) - pg_fatal("options %s and %s cannot be used together", - "-1/--single-transaction", "--transaction-size"); - - if (opts->single_txn && globals_only) - pg_fatal("options %s and %s cannot be used together when restoring an archive created by pg_dumpall", - "--single-transaction", "-g/--globals-only"); - - if (opts->txn_size && globals_only) - pg_fatal("options %s and %s cannot be used together when restoring an archive created by pg_dumpall", - "--transaction-size", "-g/--globals-only"); - - if (opts->exit_on_error && globals_only) - pg_fatal("options %s and %s cannot be used together when restoring an archive created by pg_dumpall", - "--exit-on-error", "-g/--globals-only"); - - if (data_only && globals_only) - pg_fatal("options %s and %s cannot be used together", - "-a/--data-only", "-g/--globals-only"); - if (schema_only && globals_only) - pg_fatal("options %s and %s cannot be used together", - "-s/--schema-only", "-g/--globals-only"); - if (statistics_only && globals_only) - pg_fatal("options %s and %s cannot be used together", - "--statistics-only", "-g/--globals-only"); - if (with_statistics && globals_only) - pg_fatal("options %s and %s cannot be used together", - "--statistics", "-g/--globals-only"); - - if (no_globals && globals_only) - pg_fatal("options %s and %s cannot be used together", - "--no-globals", "-g/--globals-only"); + /* *-only options are incompatible with each other */ + check_mut_excl_opts(data_only, "-a/--data-only", + globals_only, "-g/--globals-only", + schema_only, "-s/--schema-only", + statistics_only, "--statistics-only"); + + /* --no-* and *-only for same thing are incompatible */ + check_mut_excl_opts(data_only, "-a/--data-only", + no_data, "--no-data"); + check_mut_excl_opts(globals_only, "-g/--globals-only", + 
no_globals, "--no-globals"); + check_mut_excl_opts(schema_only, "-s/--schema-only", + no_schema, "--no-schema"); + check_mut_excl_opts(statistics_only, "--statistics-only", + no_statistics, "--no-statistics"); + + /* --statistics and --no-statistics are incompatible */ + check_mut_excl_opts(with_statistics, "--statistics", + no_statistics, "--no-statistics"); + + /* --statistics is incompatible with *-only (except --statistics-only) */ + check_mut_excl_opts(with_statistics, "--statistics", + data_only, "-a/--data-only", + globals_only, "-g/--globals-only", + schema_only, "-s/--schema-only"); + + /* --clean and --data-only are incompatible */ + check_mut_excl_opts(opts->dropSchema, "-c/--clean", + data_only, "-a/--data-only"); + + /* + * --globals-only, --single-transaction, and --transaction-size are + * incompatible. + */ + check_mut_excl_opts(globals_only, "-g/--globals-only", + opts->single_txn, "-1/--single-transaction", + opts->txn_size, "--transaction-size"); + + /* --exit-on-error and --globals-only are incompatible */ + check_mut_excl_opts(opts->exit_on_error, "--exit-on-error", + globals_only, "-g/--globals-only"); /* * -C is not compatible with -1, because we can't create a database inside * a transaction block. 
*/ - if (opts->createDB && opts->single_txn) - pg_fatal("options %s and %s cannot be used together", - "-C/--create", "-1/--single-transaction"); + check_mut_excl_opts(opts->createDB, "-C/--create", + opts->single_txn, "-1/--single-transaction"); /* Can't do single-txn mode with multiple connections */ if (opts->single_txn && numWorkers > 1) diff --git a/src/bin/pg_dump/t/001_basic.pl b/src/bin/pg_dump/t/001_basic.pl index 67131a674f446..2f5eb48e7b86c 100644 --- a/src/bin/pg_dump/t/001_basic.pl +++ b/src/bin/pg_dump/t/001_basic.pl @@ -46,8 +46,8 @@ command_fails_like( [ 'pg_dump', '-s', '-a' ], - qr/\Qpg_dump: error: options -s\/--schema-only and -a\/--data-only cannot be used together\E/, - 'pg_dump: options -s/--schema-only and -a/--data-only cannot be used together' + qr/\Qpg_dump: error: options -a\/--data-only and -s\/--schema-only cannot be used together\E/, + 'pg_dump: options -a/--data-only and -s/--schema-only cannot be used together' ); command_fails_like( @@ -64,8 +64,8 @@ command_fails_like( [ 'pg_dump', '-s', '--include-foreign-data=xxx' ], - qr/\Qpg_dump: error: options -s\/--schema-only and --include-foreign-data cannot be used together\E/, - 'pg_dump: options -s/--schema-only and --include-foreign-data cannot be used together' + qr/\Qpg_dump: error: options --include-foreign-data and -s\/--schema-only cannot be used together\E/, + 'pg_dump: options --include-foreign-data and -s/--schema-only cannot be used together' ); command_fails_like( @@ -87,8 +87,8 @@ command_fails_like( [ 'pg_restore', '-s', '-a', '-f -' ], - qr/\Qpg_restore: error: options -s\/--schema-only and -a\/--data-only cannot be used together\E/, - 'pg_restore: options -s/--schema-only and -a/--data-only cannot be used together' + qr/\Qpg_restore: error: options -a\/--data-only and -s\/--schema-only cannot be used together\E/, + 'pg_restore: options -a/--data-only and -s/--schema-only cannot be used together' ); command_fails_like( @@ -300,8 +300,8 @@ command_fails_like( [ 
'pg_restore', '--exclude-database=foo', '--globals-only', '-d', 'xxx' ], - qr/\Qpg_restore: error: option --exclude-database cannot be used together with -g\/--globals-only\E/, - 'pg_restore: option --exclude-database cannot be used together with -g/--globals-only' + qr/\Qpg_restore: error: options --exclude-database and -g\/--globals-only cannot be used together\E/, + 'pg_restore: options --exclude-database and -g/--globals-only cannot be used together' ); command_fails_like( @@ -312,14 +312,14 @@ command_fails_like( [ 'pg_restore', '--schema-only', '--globals-only', '-d', 'xxx' ], - qr/\Qpg_restore: error: options -s\/--schema-only and -g\/--globals-only cannot be used together\E/, - 'pg_restore: error: options -s/--schema-only and -g/--globals-only cannot be used together' + qr/\Qpg_restore: error: options -g\/--globals-only and -s\/--schema-only cannot be used together\E/, + 'pg_restore: error: options -g/--globals-only and -s/--schema-only cannot be used together' ); command_fails_like( [ 'pg_restore', '--statistics-only', '--globals-only', '-d', 'xxx' ], - qr/\Qpg_restore: error: options --statistics-only and -g\/--globals-only cannot be used together\E/, - 'pg_restore: error: options --statistics-only and -g/--globals-only cannot be used together' + qr/\Qpg_restore: error: options -g\/--globals-only and --statistics-only cannot be used together\E/, + 'pg_restore: error: options -g/--globals-only and --statistics-only cannot be used together' ); command_fails_like( @@ -339,6 +339,6 @@ 'pg_restore', '--globals-only', '--no-globals', '-d', 'xxx', 'dumpdir' ], - qr/\Qpg_restore: error: options --no-globals and -g\/--globals-only cannot be used together\E/, + qr/\Qpg_restore: error: options -g\/--globals-only and --no-globals cannot be used together\E/, 'options --no-globals and --globals-only cannot be used together'); done_testing(); diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index e7cc998cfbad2..6d1d38128fcf7 100644 --- 
a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -5077,8 +5077,8 @@ '--schema-only', '--statistics', ], - qr/\Qpg_dump: error: options -s\/--schema-only and --statistics cannot be used together\E/, - 'cannot use --schema-only and --statistics together'); + qr/\Qpg_dump: error: options --statistics and -s\/--schema-only cannot be used together\E/, + 'cannot use --statistics and --schema-only together'); command_fails_like( [ diff --git a/src/bin/pg_dump/t/007_pg_dumpall.pl b/src/bin/pg_dump/t/007_pg_dumpall.pl index c16c27d7387c2..22f11a13a9a68 100644 --- a/src/bin/pg_dump/t/007_pg_dumpall.pl +++ b/src/bin/pg_dump/t/007_pg_dumpall.pl @@ -520,7 +520,7 @@ '--schema-only', '--file' => "$tempdir/error_test.sql", ], - qr/\Qpg_restore: error: options -s\/--schema-only and -g\/--globals-only cannot be used together\E/, + qr/\Qpg_restore: error: options -g\/--globals-only and -s\/--schema-only cannot be used together\E/, 'When --globals-only and --schema-only are used together'); # report an error when --globals-only and --statistics-only are used together @@ -533,7 +533,7 @@ '--statistics-only', '--file' => "$tempdir/error_test.sql", ], - qr/\Qpg_restore: error: options --statistics-only and -g\/--globals-only cannot be used together\E/, + qr/\Qpg_restore: error: options -g\/--globals-only and --statistics-only cannot be used together\E/, 'When --globals-only and --statistics-only are used together'); # report an error when --globals-only and --statistics are used together @@ -572,7 +572,7 @@ '--single-transaction', '--file' => "$tempdir/error_test.sql", ], - qr/\Qpg_restore: error: options --single-transaction and -g\/--globals-only cannot be used together\E/, + qr/\Qpg_restore: error: options -g\/--globals-only and -1\/--single-transaction cannot be used together\E/, 'When --globals-only and --single-transaction are used together'); # report an error when --globals-only and --transaction-size are used together @@ -585,7 +585,7 @@ 
'--transaction-size' => '100', '--file' => "$tempdir/error_test.sql", ], - qr/\Qpg_restore: error: options --transaction-size and -g\/--globals-only cannot be used together\E/, + qr/\Qpg_restore: error: options -g\/--globals-only and --transaction-size cannot be used together\E/, 'When --globals-only and --transaction-size are used together'); # verify map.dat preamble exists diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index a5621251afcee..329957d061c7c 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -30,22 +30,39 @@ static long long int largest_diff_count; static void handle_args(int argc, char *argv[]); -static uint64 test_timing(unsigned int duration); +static void test_system_timing(void); +#if PG_INSTR_TSC_CLOCK +static void test_tsc_timing(void); +#endif +static uint64 test_timing(unsigned int duration, TimingClockSourceType source, bool fast_timing); static void output(uint64 loop_count); int main(int argc, char *argv[]) { - uint64 loop_count; - set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_test_timing")); progname = get_progname(argv[0]); handle_args(argc, argv); - loop_count = test_timing(test_duration); + /* + * Initialize timing infrastructure (required for INSTR_* calls) + * + * This initialization should match the one in main() so the user can + * reason about what the backend will do. 
+ */ +#if defined(WIN32) + /* Skip TSC calibration on Windows, its too expensive per connection */ + pg_initialize_timing(false); +#else + pg_initialize_timing(true); +#endif - output(loop_count); + test_system_timing(); + +#if PG_INSTR_TSC_CLOCK + test_tsc_timing(); +#endif return 0; } @@ -143,23 +160,91 @@ handle_args(int argc, char *argv[]) exit(1); } - printf(ngettext("Testing timing overhead for %u second.\n", - "Testing timing overhead for %u seconds.\n", + printf(ngettext("Testing timing overhead for %u second.\n\n", + "Testing timing overhead for %u seconds.\n\n", test_duration), test_duration); } +/* + * This tests default (non-fast) timing code. A clock source for that is + * always available. Hence, we can unconditionally output the result. + */ +static void +test_system_timing(void) +{ + uint64 loop_count; + + loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_SYSTEM, false); + output(loop_count); +} + +/* + * If on a supported architecture, test the TSC clock source. This clock + * source is not always available. In that case we print an informational + * message indicating as such. + * + * We first emit "slow" timings (RDTSCP on x86), which are used for higher + * precision measurements when the TSC clock source is enabled. We emit + * "fast" timings second (RDTSC on x86), which is used for faster timing + * measurements with lower precision. 
+ */ +#if PG_INSTR_TSC_CLOCK +static void +test_tsc_timing(void) +{ + uint64 loop_count; + + printf("\n"); + loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_TSC, false); + if (loop_count > 0) + { + output(loop_count); + printf("\n"); + + /* Now, emit fast timing measurements */ + loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_TSC, true); + output(loop_count); + printf("\n"); + + pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_AUTO); + if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC) + printf(_("TSC clock source will be used by default, unless timing_clock_source is set to 'system'.\n")); + else + printf(_("TSC clock source will not be used by default, unless timing_clock_source is set to 'tsc'.\n")); + } + else + printf(_("TSC clock source is not usable. Likely unable to determine TSC frequency. are you running in an unsupported virtualized environment?.\n")); +} +#endif + static uint64 -test_timing(unsigned int duration) +test_timing(unsigned int duration, TimingClockSourceType source, bool fast_timing) { - uint64 total_time; - int64 time_elapsed = 0; uint64 loop_count = 0; - uint64 prev, - cur; instr_time start_time, end_time, - temp; + duration_time, + prev, + cur; + char *time_source = NULL; + + if (!pg_set_timing_clock_source(source)) + return 0; + + time_source = PG_INSTR_SYSTEM_CLOCK_NAME; + +#if PG_INSTR_TSC_CLOCK + if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC) + time_source = fast_timing ? PG_INSTR_TSC_CLOCK_NAME_FAST : PG_INSTR_TSC_CLOCK_NAME; +#endif + + if (fast_timing) + printf(_("Fast clock source: %s\n"), time_source); + else if (source == TIMING_CLOCK_SOURCE_SYSTEM) + printf(_("System clock source: %s\n"), time_source); + else + printf(_("Clock source: %s\n"), time_source); /* * Pre-zero the statistics data structures. They're already zero by @@ -171,20 +256,30 @@ test_timing(unsigned int duration) largest_diff = 0; largest_diff_count = 0; - total_time = duration > 0 ? 
duration * INT64CONST(1000000000) : 0; + INSTR_TIME_SET_NANOSEC(duration_time, duration > 0 ? duration * NS_PER_S : 0); INSTR_TIME_SET_CURRENT(start_time); - cur = INSTR_TIME_GET_NANOSEC(start_time); + cur = start_time; - while (time_elapsed < total_time) + end_time = start_time; + INSTR_TIME_ADD(end_time, duration_time); + + while (INSTR_TIME_GT(end_time, cur)) { int32 diff, bits; + instr_time diff_time; prev = cur; - INSTR_TIME_SET_CURRENT(temp); - cur = INSTR_TIME_GET_NANOSEC(temp); - diff = cur - prev; + + if (fast_timing) + INSTR_TIME_SET_CURRENT_FAST(cur); + else + INSTR_TIME_SET_CURRENT(cur); + + diff_time = cur; + INSTR_TIME_SUBTRACT(diff_time, prev); + diff = INSTR_TIME_GET_NANOSEC(diff_time); /* Did time go backwards? */ if (unlikely(diff < 0)) @@ -217,10 +312,9 @@ test_timing(unsigned int duration) largest_diff_count++; loop_count++; - INSTR_TIME_SUBTRACT(temp, start_time); - time_elapsed = INSTR_TIME_GET_NANOSEC(temp); } + /* Refresh end time to be the actual time spent (vs the target end time) */ INSTR_TIME_SET_CURRENT(end_time); INSTR_TIME_SUBTRACT(end_time, start_time); diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 1dae918cc09d2..06db4042e8f70 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -7334,6 +7334,9 @@ main(int argc, char **argv) initRandomState(&state[i].cs_func_rs); } + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(false); + /* opening connection... 
*/ con = doConnect(); if (con == NULL) diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c index 9a397ec87b736..83753dab7d3bf 100644 --- a/src/bin/psql/startup.c +++ b/src/bin/psql/startup.c @@ -24,6 +24,7 @@ #include "help.h" #include "input.h" #include "mainloop.h" +#include "portability/instr_time.h" #include "settings.h" /* @@ -327,6 +328,9 @@ main(int argc, char *argv[]) PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL); + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(false); + SyncVariables(); if (options.list_dbs) diff --git a/src/bin/psql/tab-complete.in.c b/src/bin/psql/tab-complete.in.c index 6484c6a3dd4e2..199fc64ddf5f3 100644 --- a/src/bin/psql/tab-complete.in.c +++ b/src/bin/psql/tab-complete.in.c @@ -1267,7 +1267,7 @@ static const char *const sql_commands[] = { "DELETE FROM", "DISCARD", "DO", "DROP", "END", "EXECUTE", "EXPLAIN", "FETCH", "GRANT", "IMPORT FOREIGN SCHEMA", "INSERT INTO", "LISTEN", "LOAD", "LOCK", "MERGE INTO", "MOVE", "NOTIFY", "PREPARE", - "REASSIGN", "REFRESH MATERIALIZED VIEW", "REINDEX", "RELEASE", + "REASSIGN", "REFRESH MATERIALIZED VIEW", "REINDEX", "RELEASE", "REPACK", "RESET", "REVOKE", "ROLLBACK", "SAVEPOINT", "SECURITY LABEL", "SELECT", "SET", "SHOW", "START", "TABLE", "TRUNCATE", "UNLISTEN", "UPDATE", "VACUUM", "VALUES", @@ -5117,6 +5117,47 @@ match_previous_words(int pattern_id, COMPLETE_WITH_QUERY(Query_for_list_of_tablespaces); } +/* REPACK */ + else if (Matches("REPACK")) + COMPLETE_WITH_SCHEMA_QUERY_PLUS(Query_for_list_of_clusterables, + "(", "USING INDEX"); + else if (Matches("REPACK", "(*)")) + COMPLETE_WITH_SCHEMA_QUERY_PLUS(Query_for_list_of_clusterables, + "USING INDEX"); + else if (Matches("REPACK", MatchAnyExcept("("))) + COMPLETE_WITH("USING INDEX"); + else if (Matches("REPACK", "(*)", MatchAnyExcept("("))) + COMPLETE_WITH("USING INDEX"); + else if (Matches("REPACK", MatchAny, "USING", "INDEX") || + Matches("REPACK", "(*)", MatchAny, "USING", "INDEX")) + { 
+ set_completion_reference(prev3_wd); + COMPLETE_WITH_SCHEMA_QUERY(Query_for_index_of_table); + } + + /* + * Complete ... [ (*) ] USING INDEX, with a list of indexes for + * . + */ + else if (TailMatches(MatchAny, "USING", "INDEX")) + { + set_completion_reference(prev3_wd); + COMPLETE_WITH_SCHEMA_QUERY(Query_for_index_of_table); + } + else if (HeadMatches("REPACK", "(*") && + !HeadMatches("REPACK", "(*)")) + { + /* + * This fires if we're in an unfinished parenthesized option list. + * get_previous_words treats a completed parenthesized option list as + * one word, so the above test is correct. + */ + if (ends_with(prev_wd, '(') || ends_with(prev_wd, ',')) + COMPLETE_WITH("ANALYZE", "VERBOSE"); + else if (TailMatches("ANALYZE", "VERBOSE")) + COMPLETE_WITH("ON", "OFF"); + } + /* SECURITY LABEL */ else if (Matches("SECURITY")) COMPLETE_WITH("LABEL"); diff --git a/src/common/Makefile b/src/common/Makefile index 2c720caa50972..1a2fbbe887f22 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -59,6 +59,7 @@ OBJS_COMMON = \ file_perm.o \ file_utils.o \ hashfn.o \ + instr_time.o \ ip.o \ jsonapi.o \ keywords.o \ diff --git a/src/common/instr_time.c b/src/common/instr_time.c new file mode 100644 index 0000000000000..7d74c058d7aa4 --- /dev/null +++ b/src/common/instr_time.c @@ -0,0 +1,430 @@ +/*------------------------------------------------------------------------- + * + * instr_time.c + * Non-inline parts of the portable high-precision interval timing + * implementation + * + * Portions Copyright (c) 2026, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/port/instr_time.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#ifndef WIN32 +#include +#endif + +#if defined(__APPLE__) +#include +#endif + +#include "port/pg_cpu.h" +#include "portability/instr_time.h" + +/* + * Stores what the number of ticks needs to be multiplied with to end up + * with nanoseconds 
using integer math. + * + * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows) + * the ticks to nanoseconds conversion requires floating point math because: + * + * sec = ticks / frequency_hz + * ns = ticks / frequency_hz * 1,000,000,000 + * ns = ticks * (1,000,000,000 / frequency_hz) + * ns = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz + * + * Here, 'ns' is usually a floating number. For example for a 2.5 GHz CPU + * the scaling factor becomes 1,000,000 / 2,500,000 = 1.2. + * + * To be able to use integer math we work around the lack of precision. We + * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the + * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by + * the same amount. We utilize unsigned integers even though ticks are stored + * as a signed value to encourage compilers to generate better assembly. + * + * We remember the maximum number of ticks that can be multiplied by the scale + * factor without overflowing so we can check via a * b > max <=> a > max / b. + * + * In all other cases we are using clock_gettime(), which uses nanoseconds + * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns + * to return the original value. + */ +uint64 ticks_per_ns_scaled = 0; +uint64 max_ticks_no_overflow = 0; + +static void set_ticks_per_ns(void); + +int timing_clock_source = TIMING_CLOCK_SOURCE_AUTO; +static bool timing_initialized = false; + +#if PG_INSTR_TSC_CLOCK +/* Indicates if TSC instructions (RDTSC and RDTSCP) are usable. */ +bool has_usable_tsc = false; + +static void tsc_initialize(bool allow_tsc_calibration); +static bool tsc_use_by_default(void); +static void set_ticks_per_ns_system(); +static void set_ticks_per_ns_for_tsc(void); +#endif + +/* + * Initializes timing infrastructure. Must be called before making any use + * of INSTR* macros. 
+ * + * The allow_tsc_calibration argument sets whether the TSC logic (if available) + * is permitted to do calibration if it couldn't get the frequency from CPUID. + * + * Calibration may take up to TSC_CALIBRATION_MAX_NS and delays program start. + */ +void +pg_initialize_timing(bool allow_tsc_calibration) +{ + if (timing_initialized) + return; + +#if PG_INSTR_TSC_CLOCK + tsc_initialize(allow_tsc_calibration); +#endif + + set_ticks_per_ns(); + timing_initialized = true; +} + +bool +pg_set_timing_clock_source(TimingClockSourceType source) +{ + Assert(timing_initialized); + +#if PG_INSTR_TSC_CLOCK + switch (source) + { + case TIMING_CLOCK_SOURCE_AUTO: + use_tsc = has_usable_tsc && tsc_use_by_default(); + break; + case TIMING_CLOCK_SOURCE_SYSTEM: + use_tsc = false; + break; + case TIMING_CLOCK_SOURCE_TSC: + if (!has_usable_tsc) /* Tell caller TSC is not usable */ + return false; + use_tsc = true; + break; + } +#endif + + set_ticks_per_ns(); + timing_clock_source = source; + return true; +} + +#ifndef WIN32 + +static void +set_ticks_per_ns_system() +{ + ticks_per_ns_scaled = 0; + max_ticks_no_overflow = 0; +} + +#else /* WIN32 */ + +/* GetTimerFrequency returns counts per second */ +static inline double +GetTimerFrequency(void) +{ + LARGE_INTEGER f; + + QueryPerformanceFrequency(&f); + return (double) f.QuadPart; +} + +static void +set_ticks_per_ns_system() +{ + ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency(); + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; +} + +#endif /* WIN32 */ + +static void +set_ticks_per_ns() +{ +#if PG_INSTR_TSC_CLOCK + if (use_tsc) + set_ticks_per_ns_for_tsc(); + else + set_ticks_per_ns_system(); +#else + set_ticks_per_ns_system(); +#endif +} + +/* Hardware clock specific logic (x86 TSC / AArch64 CNTVCT) */ + +#if PG_INSTR_TSC_CLOCK + +bool use_tsc = false; + +static uint32 tsc_frequency_khz = 0; + +static void +set_ticks_per_ns_for_tsc(void) +{ + ticks_per_ns_scaled = ((NS_PER_S / 1000) << 
TICKS_TO_NS_SHIFT) / tsc_frequency_khz; + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; +} + +#if defined(__x86_64__) || defined(_M_X64) + +/* + * x86-64 TSC specific logic + */ + +static uint32 tsc_calibrate(void); + +/* + * Decide whether we use the RDTSC/RDTSCP instructions at runtime, for x86-64, + * instead of incurring the overhead of a full clock_gettime() call. + * + * This can't be reliably determined at compile time, since the + * availability of an "invariant" TSC (that is not affected by CPU + * frequency changes) is dependent on the CPU architecture. Additionally, + * there are cases where TSC availability is impacted by virtualization, + * where a simple cpuid feature check would not be enough. + */ +static void +tsc_initialize(bool allow_tsc_calibration) +{ + /* Determine speed at which the TSC advances */ + tsc_frequency_khz = x86_tsc_frequency_khz(); + + if (tsc_frequency_khz) + { + has_usable_tsc = x86_feature_available(PG_RDTSCP); + return; + } + + /* + * CPUID did not give us the TSC frequency. If TSC is invariant and RDTSCP + * is available, we can measure the frequency by comparing TSC ticks + * against walltime using a short calibration loop. + */ + if (allow_tsc_calibration && x86_feature_available(PG_TSC_INVARIANT) && + x86_feature_available(PG_RDTSCP)) + { + tsc_frequency_khz = tsc_calibrate(); + has_usable_tsc = (tsc_frequency_khz > 0); + } +} + +/* + * Decides whether to use the TSC clock source if the user did not specify it + * one way or the other, and it is available (checked separately). + * + * Mirrors the Linux kernel's clocksource watchdog disable logic as updated in + * 2021 to reflect the reliability of the TSC on Intel platforms, see + * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion + * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/ + * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/ + * for reference. 
+ * + * When the CPU has an invariant TSC (which we require in x86_tsc_frequency_khz), + * TSC_ADJUST bit set (Intel-only), and the system has at most 4 physical + * packages (sockets), we consider the TSC trustworthy by default, matching the + * Linux kernel. + * + * On other CPU platforms (e.g. AMD), in a virtual machine, or on 8+ socket + * systems we don't have an easy way to determine the TSC's reliability. If on + * Linux, we can check if TSC is the active clocksource, based on it having run + * the watchdog logic to monitor TSC correctness. For other platforms the user + * must explicitly enable it via GUC instead. + */ +static bool +tsc_use_by_default(void) +{ + if (x86_feature_available(PG_TSC_ADJUST)) + { + int cpus_per_package = x86_logical_processors_per_package(); + long total_cpus; + +#ifdef _SC_NPROCESSORS_CONF + total_cpus = sysconf(_SC_NPROCESSORS_CONF); +#elif defined(WIN32) + { + SYSTEM_INFO si; + + GetSystemInfo(&si); + total_cpus = si.dwNumberOfProcessors; + } +#else + total_cpus = -1; +#endif /* _SC_NPROCESSORS_CONF / WIN32 */ + + if (total_cpus > 0 && cpus_per_package > 0 && (total_cpus / cpus_per_package) <= 4) + return true; + } + +#if defined(__linux__) + { + FILE *fp; + char buf[128]; + + fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); + if (fp) + { + bool is_tsc = (fgets(buf, sizeof(buf), fp) != NULL && + strcmp(buf, "tsc\n") == 0); + + fclose(fp); + if (is_tsc) + return true; + } + } +#endif + + return false; +} + +/* + * Calibrate the TSC frequency by comparing TSC ticks against walltime. + * + * Takes initial TSC and system clock snapshots, then loops, recomputing the + * frequency each iteration from cumulative TSC ticks divided by elapsed time. + * + * Once the frequency estimate stabilizes (consecutive iterations agree), we + * consider it converged and the frequency in KHz is returned. If either too + * many iterations or a time limit passes without convergence, 0 is returned. 
+ */ +#define TSC_CALIBRATION_MAX_NS (50 * NS_PER_MS) +#define TSC_CALIBRATION_ITERATIONS 1000000 +#define TSC_CALIBRATION_STABLE_CYCLES 3 + +static uint32 +tsc_calibrate(void) +{ + instr_time initial_wall; + int64 initial_tsc; + double freq_khz = 0; + double prev_freq_khz = 0; + int stable_count = 0; + int64 prev_tsc; + uint32 unused; + + /* Ensure INSTR_* time below work on system time */ + set_ticks_per_ns_system(); + + INSTR_TIME_SET_CURRENT(initial_wall); + +#ifdef _MSC_VER + initial_tsc = __rdtscp(&unused); +#else + initial_tsc = __builtin_ia32_rdtscp(&unused); +#endif + prev_tsc = initial_tsc; + + for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++) + { + instr_time now_wall; + int64 now_tsc; + int64 elapsed_ns; + int64 elapsed_ticks; + + INSTR_TIME_SET_CURRENT(now_wall); + +#ifdef _MSC_VER + now_tsc = __rdtscp(&unused); +#else + now_tsc = __builtin_ia32_rdtscp(&unused); +#endif + + INSTR_TIME_SUBTRACT(now_wall, initial_wall); + elapsed_ns = INSTR_TIME_GET_NANOSEC(now_wall); + + /* Safety: bail out if we've taken too long */ + if (elapsed_ns >= TSC_CALIBRATION_MAX_NS) + break; + + elapsed_ticks = now_tsc - initial_tsc; + + /* Skip if TSC hasn't advanced, or we walked backwards for some reason */ + if (now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0) + continue; + + freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000; + + /* + * Once freq_khz / prev_freq_khz is small, check if it stays that way. + * If it does for long enough, we've got a winner frequency. + */ + if (prev_freq_khz != 0 && fabs(freq_khz / prev_freq_khz) < 1.0001) + { + stable_count++; + if (stable_count >= TSC_CALIBRATION_STABLE_CYCLES) + return (uint32) freq_khz; + } + else + stable_count = 0; + + prev_tsc = now_tsc; + prev_freq_khz = freq_khz; + } + + /* did not converge */ + return 0; +} + +#elif defined(__aarch64__) + +/* + * Check whether this is a heterogeneous Apple Silicon P+E core system + * where CNTVCT_EL0 may tick at different rates on different core types. 
+ */ +static bool +aarch64_has_heterogeneous_cores(void) +{ +#if defined(__APPLE__) + int nperflevels = 0; + size_t len = sizeof(nperflevels); + + if (sysctlbyname("hw.nperflevels", &nperflevels, &len, NULL, 0) == 0) + return nperflevels > 1; +#endif + + return false; +} + +/* + * Initialize the AArch64 generic timer as a clock source. + */ +static void +tsc_initialize(bool allow_tsc_calibration) +{ + if (aarch64_has_heterogeneous_cores()) + return; + + tsc_frequency_khz = aarch64_cntvct_frequency_khz(); + if (tsc_frequency_khz != 0) + has_usable_tsc = true; +} + +/* + * The ARM generic timer is architecturally guaranteed to be monotonic and + * synchronized across cores of the same type, so we always use it by default + * when available and cores are homogenous. + */ +static bool +tsc_use_by_default(void) +{ + return true; +} + +#endif /* defined(__aarch64__) */ + +#endif /* PG_INSTR_TSC_CLOCK */ diff --git a/src/common/meson.build b/src/common/meson.build index 4f9b8b8263d55..9bd55cda95b10 100644 --- a/src/common/meson.build +++ b/src/common/meson.build @@ -13,6 +13,7 @@ common_sources = files( 'file_perm.c', 'file_utils.c', 'hashfn.c', + 'instr_time.c', 'ip.c', 'jsonapi.c', 'keywords.c', diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 24a27cc043afa..ad993c07311c8 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -208,6 +208,18 @@ typedef struct HeapPageFreeze TransactionId FreezePageRelfrozenXid; MultiXactId FreezePageRelminMxid; + /* + * Newest XID that this page's freeze actions will remove from tuple + * visibility metadata (currently xmin and/or xvac). It is used to derive + * the snapshot conflict horizon for a WAL record that freezes tuples. On + * a standby, we must not replay that change while any snapshot could + * still treat that XID as running. + * + * It's only used if we execute freeze plans for this page, so there is no + * corresponding "no freeze" tracker. 
+ */ + TransactionId FreezePageConflictXid; + /* * "No freeze" NewRelfrozenXid/NewRelminMxid trackers. * diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h index a0166c5b41035..52cde56be8651 100644 --- a/src/include/access/visibilitymap.h +++ b/src/include/access/visibilitymap.h @@ -45,5 +45,6 @@ extern uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer extern void visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen); extern BlockNumber visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks); +extern BlockNumber visibilitymap_truncation_length(BlockNumber nheapblocks); #endif /* VISIBILITYMAP_H */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index b6508b60a843d..90f46b0350237 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202603062 +#define CATALOG_VERSION_NO 202603101 #endif diff --git a/src/include/commands/cluster.h b/src/include/commands/cluster.h index 8ea81622f9db1..28741988478a9 100644 --- a/src/include/commands/cluster.h +++ b/src/include/commands/cluster.h @@ -24,6 +24,7 @@ #define CLUOPT_RECHECK 0x02 /* recheck relation state */ #define CLUOPT_RECHECK_ISCLUSTERED 0x04 /* recheck relation state for * indisclustered */ +#define CLUOPT_ANALYZE 0x08 /* do an ANALYZE */ /* options for CLUSTER */ typedef struct ClusterParams @@ -31,8 +32,11 @@ typedef struct ClusterParams bits32 options; /* bitmask of CLUOPT_* */ } ClusterParams; -extern void cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel); -extern void cluster_rel(Relation OldHeap, Oid indexOid, ClusterParams *params); + +extern void ExecRepack(ParseState *pstate, RepackStmt *stmt, bool isTopLevel); + +extern void cluster_rel(RepackCommand command, Relation OldHeap, Oid indexOid, + ClusterParams *params); extern void check_index_is_clusterable(Relation 
OldHeap, Oid indexOid, LOCKMODE lockmode); extern void mark_index_clustered(Relation rel, Oid indexOid, bool is_internal); diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h index 359221dc29664..9c40772706c3c 100644 --- a/src/include/commands/progress.h +++ b/src/include/commands/progress.h @@ -73,28 +73,34 @@ #define PROGRESS_ANALYZE_STARTED_BY_MANUAL 1 #define PROGRESS_ANALYZE_STARTED_BY_AUTOVACUUM 2 -/* Progress parameters for cluster */ -#define PROGRESS_CLUSTER_COMMAND 0 -#define PROGRESS_CLUSTER_PHASE 1 -#define PROGRESS_CLUSTER_INDEX_RELID 2 -#define PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED 3 -#define PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN 4 -#define PROGRESS_CLUSTER_TOTAL_HEAP_BLKS 5 -#define PROGRESS_CLUSTER_HEAP_BLKS_SCANNED 6 -#define PROGRESS_CLUSTER_INDEX_REBUILD_COUNT 7 - -/* Phases of cluster (as advertised via PROGRESS_CLUSTER_PHASE) */ -#define PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP 1 -#define PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP 2 -#define PROGRESS_CLUSTER_PHASE_SORT_TUPLES 3 -#define PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP 4 -#define PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES 5 -#define PROGRESS_CLUSTER_PHASE_REBUILD_INDEX 6 -#define PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP 7 - -/* Commands of PROGRESS_CLUSTER */ -#define PROGRESS_CLUSTER_COMMAND_CLUSTER 1 -#define PROGRESS_CLUSTER_COMMAND_VACUUM_FULL 2 +/* + * Progress parameters for REPACK. + * + * Values for PROGRESS_REPACK_COMMAND are as in RepackCommand. + * + * Note: Since REPACK shares code with CLUSTER, these values are also + * used by CLUSTER. (CLUSTER being now deprecated, it makes little sense to + * maintain a separate set of constants.) 
+ */ +#define PROGRESS_REPACK_COMMAND 0 +#define PROGRESS_REPACK_PHASE 1 +#define PROGRESS_REPACK_INDEX_RELID 2 +#define PROGRESS_REPACK_HEAP_TUPLES_SCANNED 3 +#define PROGRESS_REPACK_HEAP_TUPLES_WRITTEN 4 +#define PROGRESS_REPACK_TOTAL_HEAP_BLKS 5 +#define PROGRESS_REPACK_HEAP_BLKS_SCANNED 6 +#define PROGRESS_REPACK_INDEX_REBUILD_COUNT 7 + +/* + * Phases of repack (as advertised via PROGRESS_REPACK_PHASE). + */ +#define PROGRESS_REPACK_PHASE_SEQ_SCAN_HEAP 1 +#define PROGRESS_REPACK_PHASE_INDEX_SCAN_HEAP 2 +#define PROGRESS_REPACK_PHASE_SORT_TUPLES 3 +#define PROGRESS_REPACK_PHASE_WRITE_NEW_HEAP 4 +#define PROGRESS_REPACK_PHASE_SWAP_REL_FILES 5 +#define PROGRESS_REPACK_PHASE_REBUILD_INDEX 6 +#define PROGRESS_REPACK_PHASE_FINAL_CLEANUP 7 /* Progress parameters for CREATE INDEX */ /* 3, 4 and 5 reserved for "waitfor" metrics */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 4ee092206b0dc..f3d32ef0188df 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -3982,18 +3982,6 @@ typedef struct AlterSystemStmt VariableSetStmt *setstmt; /* SET subcommand */ } AlterSystemStmt; -/* ---------------------- - * Cluster Statement (support pbrown's cluster index implementation) - * ---------------------- - */ -typedef struct ClusterStmt -{ - NodeTag type; - RangeVar *relation; /* relation being indexed, or NULL if all */ - char *indexname; /* original index defined */ - List *params; /* list of DefElem nodes */ -} ClusterStmt; - /* ---------------------- * Vacuum and Analyze Statements * @@ -4006,7 +3994,7 @@ typedef struct VacuumStmt NodeTag type; List *options; /* list of DefElem nodes */ List *rels; /* list of VacuumRelation, or NIL for all */ - bool is_vacuumcmd; /* true for VACUUM, false for ANALYZE */ + bool is_vacuumcmd; /* true for VACUUM, false otherwise */ } VacuumStmt; /* @@ -4024,6 +4012,27 @@ typedef struct VacuumRelation List *va_cols; /* list of column names, or NIL for all */ } VacuumRelation; +/* 
---------------------- + * Repack Statement + * ---------------------- + */ +typedef enum RepackCommand +{ + REPACK_COMMAND_CLUSTER = 1, + REPACK_COMMAND_REPACK, + REPACK_COMMAND_VACUUMFULL, +} RepackCommand; + +typedef struct RepackStmt +{ + NodeTag type; + RepackCommand command; /* type of command being run */ + VacuumRelation *relation; /* relation being repacked */ + char *indexname; /* order tuples by this index */ + bool usingindex; /* whether USING INDEX is specified */ + List *params; /* list of DefElem nodes */ +} RepackStmt; + /* ---------------------- * Explain Statement * diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index c175ee95b68c5..27758ec16fe66 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -1412,6 +1412,8 @@ typedef struct IndexOptInfo bool nullsnotdistinct; /* is uniqueness enforced immediately? */ bool immediate; + /* true if paths using this index should be marked disabled */ + bool disabled; /* true if index doesn't really exist */ bool hypothetical; diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index cf8a654fa5368..da2d9b384b596 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -17,6 +17,12 @@ #include "nodes/bitmapset.h" #include "nodes/pathnodes.h" +/* Hook for plugins to get control in build_simple_rel() */ +typedef void (*build_simple_rel_hook_type) (PlannerInfo *root, + RelOptInfo *rel, + RangeTblEntry *rte); +extern PGDLLIMPORT build_simple_rel_hook_type build_simple_rel_hook; + /* * Everything in subpaths or partial_subpaths will become part of the * Append node's subpaths list. 
Partial and non-partial subpaths can be @@ -55,7 +61,7 @@ extern bool add_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, List *pathkeys, Relids required_outer); extern void add_partial_path(RelOptInfo *parent_rel, Path *new_path); extern bool add_partial_path_precheck(RelOptInfo *parent_rel, - int disabled_nodes, + int disabled_nodes, Cost startup_cost, Cost total_cost, List *pathkeys); extern Path *create_seqscan_path(PlannerInfo *root, RelOptInfo *rel, diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index 8d7cc6d9886b1..09baf1a691643 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -17,14 +17,6 @@ #include "nodes/pathnodes.h" #include "utils/relcache.h" -/* Hook for plugins to get control in get_relation_info() */ -typedef void (*get_relation_info_hook_type) (PlannerInfo *root, - Oid relationObjectId, - bool inhparent, - RelOptInfo *rel); -extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook; - - extern void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel); diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index f7753c5c8a87d..6f74a8c05c731 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -377,6 +377,7 @@ PG_KEYWORD("reindex", REINDEX, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("relative", RELATIVE_P, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("release", RELEASE, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("rename", RENAME, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("repack", REPACK, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("repeatable", REPEATABLE, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("replace", REPLACE, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("replica", REPLICA, UNRESERVED_KEYWORD, BARE_LABEL) diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index b93b828d3ac27..82df66f381e03 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h 
@@ -23,6 +23,12 @@ typedef enum X86FeatureId /* scalar registers and 128-bit XMM registers */ PG_SSE4_2, PG_POPCNT, + PG_HYPERVISOR, + + /* TSC flags */ + PG_RDTSCP, + PG_TSC_INVARIANT, + PG_TSC_ADJUST, /* 512-bit ZMM registers */ PG_AVX512_BW, @@ -45,6 +51,15 @@ x86_feature_available(X86FeatureId feature) return X86Features[feature]; } +extern int x86_logical_processors_per_package(void); +extern uint32 x86_tsc_frequency_khz(void); + #endif /* defined(USE_SSE2) || defined(__i386__) */ +#if defined(__aarch64__) + +extern uint32 aarch64_cntvct_frequency_khz(void); + +#endif /* defined(__aarch64__) */ + #endif /* PG_CPU_H */ diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 8b6baeffd3e46..ac8020bdd629d 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -4,9 +4,11 @@ * portable high-precision interval timing * * This file provides an abstraction layer to hide portability issues in - * interval timing. On Unix we use clock_gettime(), and on Windows we use - * QueryPerformanceCounter(). These macros also give some breathing room to - * use other high-precision-timing APIs. + * interval timing. On x86 we use the RDTSC/RDTSCP instruction, and on + * AArch64 the CNTVCT_EL0 generic timer, directly in certain cases, or + * alternatively clock_gettime() on Unix-like systems and + * QueryPerformanceCounter() on Windows. These macros also give some breathing + * room to use other high-precision-timing APIs. * * The basic data type is instr_time, which all callers should treat as an * opaque typedef. 
instr_time can store either an absolute time (of @@ -17,8 +19,13 @@ * * INSTR_TIME_SET_ZERO(t) set t to zero (memset is acceptable too) * - * INSTR_TIME_SET_CURRENT(t) set t to current time + * INSTR_TIME_SET_NANOSEC(t, x) set t to the specified value (in nanosecs) * + * INSTR_TIME_SET_CURRENT_FAST(t) set t to current time without waiting + * for instructions in out-of-order window + * + * INSTR_TIME_SET_CURRENT(t) set t to current time while waiting for + * instructions in OOO to retire * * INSTR_TIME_ADD(x, y) x += y * @@ -78,11 +85,88 @@ typedef struct instr_time #define NS_PER_MS INT64CONST(1000000) #define NS_PER_US INT64CONST(1000) +/* Shift amount for fixed-point ticks-to-nanoseconds conversion. */ +#define TICKS_TO_NS_SHIFT 14 -#ifndef WIN32 +/* + * Variables used to translate ticks to nanoseconds, initialized by + * pg_initialize_timing. + */ +extern PGDLLIMPORT uint64 ticks_per_ns_scaled; +extern PGDLLIMPORT uint64 max_ticks_no_overflow; + +typedef enum +{ + TIMING_CLOCK_SOURCE_AUTO, + TIMING_CLOCK_SOURCE_SYSTEM, + TIMING_CLOCK_SOURCE_TSC +} TimingClockSourceType; + +extern int timing_clock_source; + +/* + * Initialize timing infrastructure + * + * This must be called at least once by frontend programs before using + * INSTR_TIME_SET_CURRENT* macros. Backend programs automatically initialize + * this through the GUC check hook. + */ +extern void pg_initialize_timing(bool allow_tsc_calibrate); +/* + * Sets the time source to be used. Mainly intended for frontend programs, + * the backend should set it via the timing_clock_source GUC instead. + * + * Returns false if the clock source could not be set, for example when TSC + * is not available despite being explicitly set. 
+ */ +extern bool pg_set_timing_clock_source(TimingClockSourceType source); + +#if defined(__x86_64__) || defined(_M_X64) +#define PG_INSTR_TSC_CLOCK 1 +#define PG_INSTR_TSC_CLOCK_NAME_FAST "RDTSC" +#define PG_INSTR_TSC_CLOCK_NAME "RDTSCP" +#define PG_INSTR_TICKS_TO_NS 1 +#elif defined(__aarch64__) && !defined(WIN32) +#define PG_INSTR_TSC_CLOCK 1 +#define PG_INSTR_TSC_CLOCK_NAME_FAST "CNTVCT_EL0" +#define PG_INSTR_TSC_CLOCK_NAME "CNTVCT_EL0 (ISB)" +#define PG_INSTR_TICKS_TO_NS 1 +#elif defined(WIN32) +#define PG_INSTR_TSC_CLOCK 0 +#define PG_INSTR_TICKS_TO_NS 1 +#else +#define PG_INSTR_TSC_CLOCK 0 +#define PG_INSTR_TICKS_TO_NS 0 +#endif + +#if PG_INSTR_TSC_CLOCK +/* Whether the hardware TSC clock is available and usable. */ +extern PGDLLIMPORT bool has_usable_tsc; -/* Use clock_gettime() */ +/* Whether to actually use TSC based on availability and GUC settings. */ +extern PGDLLIMPORT bool use_tsc; + +#endif /* PG_INSTR_TSC_CLOCK */ + +/* + * Returns the current timing clock source effectively in use, resolving + * TIMING_CLOCK_SOURCE_AUTO to either TIMING_CLOCK_SOURCE_SYSTEM or + * TIMING_CLOCK_SOURCE_TSC. + */ +static inline TimingClockSourceType +pg_current_timing_clock_source(void) +{ +#if PG_INSTR_TSC_CLOCK + return use_tsc ? TIMING_CLOCK_SOURCE_TSC : TIMING_CLOCK_SOURCE_SYSTEM; +#else + return TIMING_CLOCK_SOURCE_SYSTEM; +#endif +} + +#ifndef WIN32 + +/* On POSIX, use clock_gettime() for system clock source */ #include @@ -97,43 +181,40 @@ typedef struct instr_time * than CLOCK_MONOTONIC. In particular, as of macOS 10.12, Apple provides * CLOCK_MONOTONIC_RAW which is both faster to read and higher resolution than * their version of CLOCK_MONOTONIC. + * + * Note this does not get used in case the TSC clock source logic is used, + * which directly calls architecture specific timing instructions (e.g. RDTSC). 
*/ #if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW) -#define PG_INSTR_CLOCK CLOCK_MONOTONIC_RAW +#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC_RAW +#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC_RAW)" #elif defined(CLOCK_MONOTONIC) -#define PG_INSTR_CLOCK CLOCK_MONOTONIC +#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC +#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC)" #else -#define PG_INSTR_CLOCK CLOCK_REALTIME +#define PG_INSTR_SYSTEM_CLOCK CLOCK_REALTIME +#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_REALTIME)" #endif -/* helper for INSTR_TIME_SET_CURRENT */ static inline instr_time -pg_clock_gettime_ns(void) +pg_get_ticks_system(void) { instr_time now; struct timespec tmp; - clock_gettime(PG_INSTR_CLOCK, &tmp); + clock_gettime(PG_INSTR_SYSTEM_CLOCK, &tmp); now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec; return now; } -#define INSTR_TIME_SET_CURRENT(t) \ - ((t) = pg_clock_gettime_ns()) - -#define INSTR_TIME_GET_NANOSEC(t) \ - ((int64) (t).ticks) - - #else /* WIN32 */ +/* On Windows, use QueryPerformanceCounter() for system clock source */ -/* Use QueryPerformanceCounter() */ - -/* helper for INSTR_TIME_SET_CURRENT */ +#define PG_INSTR_SYSTEM_CLOCK_NAME "QueryPerformanceCounter" static inline instr_time -pg_query_performance_counter(void) +pg_get_ticks_system(void) { instr_time now; LARGE_INTEGER tmp; @@ -144,23 +225,151 @@ pg_query_performance_counter(void) return now; } -static inline double -GetTimerFrequency(void) +#endif /* WIN32 */ + +static inline int64 +pg_ticks_to_ns(int64 ticks) { - LARGE_INTEGER f; +#if PG_INSTR_TICKS_TO_NS + int64 ns = 0; + + /* + * Avoid doing work if we don't use scaled ticks, e.g. system clock on + * Unix + */ + if (ticks_per_ns_scaled == 0) + return ticks; + + /* + * Would multiplication overflow? If so perform computation in two parts. 
+ */ + if (unlikely(ticks > (int64) max_ticks_no_overflow)) + { + /* + * To avoid overflow, first scale total ticks down by the fixed + * factor, and *afterwards* multiply them by the frequency-based scale + * factor. + * + * The remaining ticks can follow the regular formula, since they + * won't overflow. + */ + int64 count = ticks >> TICKS_TO_NS_SHIFT; + + ns = count * ticks_per_ns_scaled; + ticks -= (count << TICKS_TO_NS_SHIFT); + } + + ns += (ticks * ticks_per_ns_scaled) >> TICKS_TO_NS_SHIFT; + + return ns; +#else + return ticks; +#endif /* PG_INSTR_TICKS_TO_NS */ +} + +#if PG_INSTR_TSC_CLOCK + +#if defined(__x86_64__) || defined(_M_X64) - QueryPerformanceFrequency(&f); - return (double) f.QuadPart; +#ifdef _MSC_VER +#include +#endif /* defined(_MSC_VER) */ + +static inline instr_time +pg_get_ticks_fast(void) +{ + if (likely(use_tsc)) + { + instr_time now; + +#ifdef _MSC_VER + now.ticks = __rdtsc(); +#else + /* Avoid complex includes on clang/GCC that raise compile times */ + now.ticks = __builtin_ia32_rdtsc(); +#endif /* defined(_MSC_VER) */ + return now; + } + + return pg_get_ticks_system(); } -#define INSTR_TIME_SET_CURRENT(t) \ - ((t) = pg_query_performance_counter()) +static inline instr_time +pg_get_ticks(void) +{ + if (likely(use_tsc)) + { + instr_time now; + uint32 unused; -#define INSTR_TIME_GET_NANOSEC(t) \ - ((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency()))) +#ifdef _MSC_VER + now.ticks = __rdtscp(&unused); +#else + now.ticks = __builtin_ia32_rdtscp(&unused); +#endif /* defined(_MSC_VER) */ + return now; + } -#endif /* WIN32 */ + return pg_get_ticks_system(); +} + +#elif defined(__aarch64__) && !defined(WIN32) + +/* + * Read the ARM generic timer counter (CNTVCT_EL0). + * + * The "fast" variant reads the counter without a barrier, analogous to RDTSC + * on x86. 
The regular variant issues an ISB (Instruction Synchronization + * Barrier) first, which acts as a serializing instruction analogous to RDTSCP, + * ensuring all preceding instructions have completed before reading the + * counter. + */ +static inline instr_time +pg_get_ticks_fast(void) +{ + if (likely(use_tsc)) + { + instr_time now; + + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + + return pg_get_ticks_system(); +} +static inline instr_time +pg_get_ticks(void) +{ + if (likely(use_tsc)) + { + instr_time now; + + __builtin_arm_isb(0xf); + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + + return pg_get_ticks_system(); +} + +#endif /* defined(__aarch64__) */ + +#else /* !PG_INSTR_TSC_CLOCK */ + +static inline instr_time +pg_get_ticks_fast(void) +{ + return pg_get_ticks_system(); +} + +static inline instr_time +pg_get_ticks(void) +{ + return pg_get_ticks_system(); +} + +#endif /* PG_INSTR_TSC_CLOCK */ /* * Common macros @@ -170,6 +379,13 @@ GetTimerFrequency(void) #define INSTR_TIME_SET_ZERO(t) ((t).ticks = 0) +#define INSTR_TIME_SET_NANOSEC(t, n) ((t).ticks = n) + +#define INSTR_TIME_SET_CURRENT_FAST(t) \ + ((t) = pg_get_ticks_fast()) + +#define INSTR_TIME_SET_CURRENT(t) \ + ((t) = pg_get_ticks()) #define INSTR_TIME_ADD(x,y) \ ((x).ticks += (y).ticks) @@ -183,6 +399,9 @@ GetTimerFrequency(void) #define INSTR_TIME_GT(x,y) \ ((x).ticks > (y).ticks) +#define INSTR_TIME_GET_NANOSEC(t) \ + (pg_ticks_to_ns((t).ticks)) + #define INSTR_TIME_GET_DOUBLE(t) \ ((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S) diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index a40adf6b2a8da..4017896f9518e 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -314,6 +314,10 @@ extern void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std); +extern bool BufferSetHintBits16(uint16 *ptr, uint16 val, Buffer buffer); +extern bool 
BufferBeginSetHintBits(Buffer buffer); +extern void BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std); + extern void UnlockBuffers(void); extern void UnlockBuffer(Buffer buffer); extern void LockBufferInternal(Buffer buffer, BufferLockMode mode); diff --git a/src/include/tcop/cmdtaglist.h b/src/include/tcop/cmdtaglist.h index 1290c9bab6842..652dc61b834c8 100644 --- a/src/include/tcop/cmdtaglist.h +++ b/src/include/tcop/cmdtaglist.h @@ -196,6 +196,7 @@ PG_CMDTAG(CMDTAG_REASSIGN_OWNED, "REASSIGN OWNED", false, false, false) PG_CMDTAG(CMDTAG_REFRESH_MATERIALIZED_VIEW, "REFRESH MATERIALIZED VIEW", true, false, false) PG_CMDTAG(CMDTAG_REINDEX, "REINDEX", true, false, false) PG_CMDTAG(CMDTAG_RELEASE, "RELEASE", false, false, false) +PG_CMDTAG(CMDTAG_REPACK, "REPACK", false, false, false) PG_CMDTAG(CMDTAG_RESET, "RESET", false, false, false) PG_CMDTAG(CMDTAG_REVOKE, "REVOKE", true, false, false) PG_CMDTAG(CMDTAG_REVOKE_ROLE, "REVOKE ROLE", false, false, false) diff --git a/src/include/utils/backend_progress.h b/src/include/utils/backend_progress.h index 19f63b414310f..6300dbd15d5be 100644 --- a/src/include/utils/backend_progress.h +++ b/src/include/utils/backend_progress.h @@ -24,10 +24,10 @@ typedef enum ProgressCommandType PROGRESS_COMMAND_INVALID, PROGRESS_COMMAND_VACUUM, PROGRESS_COMMAND_ANALYZE, - PROGRESS_COMMAND_CLUSTER, PROGRESS_COMMAND_CREATE_INDEX, PROGRESS_COMMAND_BASEBACKUP, PROGRESS_COMMAND_COPY, + PROGRESS_COMMAND_REPACK, } ProgressCommandType; #define PGSTAT_NUM_PROGRESS_PARAM 20 diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index 9c90670d9b8d8..a396e746415e3 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -162,6 +162,9 @@ extern const char *show_timezone(void); extern bool check_timezone_abbreviations(char **newval, void **extra, GucSource source); extern void assign_timezone_abbreviations(const char *newval, void *extra); +extern void assign_timing_clock_source(int 
newval, void *extra); +extern bool check_timing_clock_source(int *newval, void **extra, GucSource source); +extern const char *show_timing_clock_source(void); extern bool check_transaction_buffers(int *newval, void **extra, GucSource source); extern bool check_transaction_deferrable(bool *newval, void **extra, GucSource source); extern bool check_transaction_isolation(int *newval, void **extra, GucSource source); diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 71a8016196138..63440b8e36c83 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -60,6 +60,7 @@ enum config_group CONN_AUTH_TCP, CONN_AUTH_AUTH, CONN_AUTH_SSL, + RESOURCES_TIME, RESOURCES_MEM, RESOURCES_DISK, RESOURCES_KERNEL, diff --git a/src/port/meson.build b/src/port/meson.build index 7296f8e3c037f..110bcd28edd4c 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_cpu_arm.c', 'pg_cpu_x86.c', 'pg_localeconv_r.c', 'pg_numa.c', diff --git a/src/port/pg_cpu_arm.c b/src/port/pg_cpu_arm.c new file mode 100644 index 0000000000000..6fd9dd892ec98 --- /dev/null +++ b/src/port/pg_cpu_arm.c @@ -0,0 +1,45 @@ +/*------------------------------------------------------------------------- + * + * pg_cpu_arm.c + * Runtime CPU feature detection for AArch64 + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_cpu_arm.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" + +#if defined(__aarch64__) && !defined(WIN32) + +#include "port/pg_cpu.h" + +/* + * Return the frequency of the ARM generic timer (CNTVCT_EL0) in kHz. + * + * The CNTFRQ_EL0 system register is architecturally guaranteed to be readable + * from EL0 (userspace) and holds the timer frequency in Hz. 
The firmware sets + * this at boot and it does not change. + * + * Returns 0 if the frequency is not available (should not happen on conforming + * implementations). + */ +uint32 +aarch64_cntvct_frequency_khz(void) +{ + uint64 freq; + + freq = __builtin_arm_rsr64("cntfrq_el0"); + + if (freq == 0) + return 0; + + return (uint32) (freq / 1000); +} + +#endif /* defined(__aarch64__) */ diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index 7575838245cd7..fc29212f38ccf 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -17,12 +17,17 @@ #if defined(USE_SSE2) || defined(__i386__) -#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) +#if defined(HAVE__CPUID) || defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) || defined(HAVE__CPUIDEX) +#if defined(_MSC_VER) +#include +#else #include +#endif /* defined(_MSC_VER) */ #endif -#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX) -#include +#ifdef __linux__ +#include +#include #endif #ifdef HAVE_XSAVE_INTRINSICS @@ -53,6 +58,44 @@ mask_available(uint32 value, uint32 mask) return (value & mask) == mask; } +/* General purpose registers used by CPUID */ +typedef struct CPUIDResult +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; +} CPUIDResult; + +StaticAssertDecl(sizeof(CPUIDResult) == 4 * sizeof(unsigned int), + "CPUIDResult must have no padding"); + +static inline void +pg_cpuid(int leaf, CPUIDResult *r) +{ +#if defined(HAVE__GET_CPUID) + __get_cpuid(leaf, &r->eax, &r->ebx, &r->ecx, &r->edx); +#elif defined(HAVE__CPUID) + __cpuid((int *) r, leaf); +#else +#error cpuid instruction not available +#endif +} + +static inline bool +pg_cpuid_subleaf(int leaf, int subleaf, CPUIDResult *r) +{ +#if defined(HAVE__GET_CPUID_COUNT) + return __get_cpuid_count(leaf, subleaf, &r->eax, &r->ebx, &r->ecx, &r->edx) == 1; +#elif defined(HAVE__CPUIDEX) + __cpuidex((int *) r, leaf, subleaf); + return true; +#else + memset(r, 0, sizeof(CPUIDResult)); + return false; +#endif 
+} + /* * Parse the CPU ID info for runtime checks. */ @@ -62,33 +105,22 @@ pg_attribute_target("xsave") void set_x86_features(void) { - unsigned int exx[4] = {0, 0, 0, 0}; + CPUIDResult r = {0}, r2 = {0}; -#if defined(HAVE__GET_CPUID) - __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 1); -#else -#error cpuid instruction not available -#endif + pg_cpuid(0x01, &r); - X86Features[PG_SSE4_2] = exx[2] >> 20 & 1; - X86Features[PG_POPCNT] = exx[2] >> 23 & 1; + X86Features[PG_SSE4_2] = r.ecx >> 20 & 1; + X86Features[PG_POPCNT] = r.ecx >> 23 & 1; + X86Features[PG_HYPERVISOR] = r.ecx >> 31 & 1; - /* All these features depend on OSXSAVE */ - if (exx[2] & (1 << 27)) - { - uint32 xcr0_val = 0; - - /* second cpuid call on leaf 7 to check extended AVX-512 support */ + pg_cpuid_subleaf(0x07, 0, &r2); - memset(exx, 0, 4 * sizeof(exx[0])); + X86Features[PG_TSC_ADJUST] = (r2.ebx & (1 << 1)) != 0; -#if defined(HAVE__GET_CPUID_COUNT) - __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUIDEX) - __cpuidex(exx, 7, 0); -#endif + /* leaf 7 features that depend on OSXSAVE */ + if (r.ecx & (1 << 27)) + { + uint32 xcr0_val = 0; #ifdef HAVE_XSAVE_INTRINSICS /* get value of Extended Control Register */ @@ -99,15 +131,169 @@ set_x86_features(void) if (mask_available(xcr0_val, XMM | YMM | OPMASK | ZMM0_15 | ZMM16_31)) { - X86Features[PG_AVX512_BW] = exx[1] >> 30 & 1; - X86Features[PG_AVX512_VL] = exx[1] >> 31 & 1; + X86Features[PG_AVX512_BW] = r2.ebx >> 30 & 1; + X86Features[PG_AVX512_VL] = r2.ebx >> 31 & 1; - X86Features[PG_AVX512_VPCLMULQDQ] = exx[2] >> 10 & 1; - X86Features[PG_AVX512_VPOPCNTDQ] = exx[2] >> 14 & 1; + X86Features[PG_AVX512_VPCLMULQDQ] = r2.ecx >> 10 & 1; + X86Features[PG_AVX512_VPOPCNTDQ] = r2.ecx >> 14 & 1; } } + /* Check for other TSC related flags */ + pg_cpuid(0x80000001, &r); + X86Features[PG_RDTSCP] = r.edx >> 27 & 1; + + pg_cpuid(0x80000007, &r); + X86Features[PG_TSC_INVARIANT] = r.edx >> 8 & 
1; + X86Features[INIT_PG_X86] = true; } +/* + * Return the number of logical processors per physical CPU package (socket). + * + * This uses CPUID.0B (Extended Topology Enumeration) to enumerate topology + * levels. Each sub-leaf reports a level type in ECX[15:8] (1 = SMT, 2 = Core) + * and the number of logical processors at that level and below in EBX[15:0]. + * The value at the highest level gives us logical processors per package. + * + * Vendor-specific leaves (0x1F for Intel, 0x80000026 for AMD) provide + * finer-grained sub-package topology but are assumed to report the same + * per-package totals on current hardware. + * + * Returns 0 if topology information is not available. + */ +int +x86_logical_processors_per_package(void) +{ + int logical_per_package = 0; + + for (int subleaf = 0; subleaf < 8; subleaf++) + { + CPUIDResult r = {0}; + uint32 level_type; + + if (!pg_cpuid_subleaf(0x0B, subleaf, &r)) + return 0; + + level_type = (r.ecx >> 8) & 0xff; + + /* level_type 0 means end of enumeration */ + if (level_type == 0) + break; + + logical_per_package = r.ebx & 0xffff; + } + + return logical_per_package; +} + +/* TSC (Time-stamp Counter) handling code */ + +static uint32 x86_hypervisor_tsc_frequency_khz(void); + +/* + * Determine the TSC frequency of the CPU, where supported. + * + * Needed to interpret the tick value returned by RDTSC/RDTSCP. Return value of + * 0 indicates TSC is not invariant, or the frequency information was not + * accessible and the instructions should not be used. + */ +uint32 +x86_tsc_frequency_khz(void) +{ + CPUIDResult r = {0}; + + if (!x86_feature_available(PG_TSC_INVARIANT)) + return 0; + + if (x86_feature_available(PG_HYPERVISOR)) + return x86_hypervisor_tsc_frequency_khz(); + + /* + * On modern Intel CPUs, the TSC is implemented by invariant timekeeping + * hardware, also called "Always Running Timer", or ART. The ART stays + * consistent even if the CPU changes frequency due to changing power + * levels. 
+ * + * As documented in "Determining the Processor Base Frequency" in the + * "Intel® 64 and IA-32 Architectures Software Developer’s Manual", + * February 2026 Edition, we can get the TSC frequency as follows: + * + * Nominal TSC frequency = ( CPUID.15H:ECX[31:0] * CPUID.15H:EBX[31:0] ) / + * CPUID.15H:EAX[31:0] + * + * With CPUID.15H:ECX representing the nominal core crystal clock + * frequency, and EAX/EBX representing values used to translate the TSC + * value to that frequency, see "Chapter 20.17 "Time-Stamp Counter" of + * that manual. + * + * Older Intel CPUs, and other vendors do not set CPUID.15H:ECX, and as + * such we fall back to alternate approaches. + */ + pg_cpuid(0x15, &r); + if (r.ecx > 0) + { + /* + * EBX not being set indicates invariant TSC is not available. Require + * EAX being non-zero too, to avoid a theoretical divide by zero. + */ + if (r.eax == 0 || r.ebx == 0) + return 0; + + return r.ecx / 1000 * r.ebx / r.eax; + } + + /* + * When CPUID.15H is not available/incomplete, but we have verified an + * invariant TSC is used, we can instead get the processor base frequency + * in MHz from CPUID.16H:EAX, the "Processor Frequency Information Leaf". + */ + pg_cpuid(0x16, &r); + if (r.eax > 0) + return r.eax * 1000; + + return 0; +} + +/* + * Support for reading TSC frequency for hypervisors passing it to a guest VM. + * + * Two Hypervisors (VMware and KVM) are known to make TSC frequency in KHz + * available at the vendor-specific 0x40000010 leaf in the EAX register. + * + * For some other Hypervisors that have an invariant TSC, e.g. HyperV, we would + * need to access an MSR to get the frequency (which is typically not available + * for unprivileged processes), so we instead rely on the TSC calibration logic. 
+ */ +#define CPUID_HYPERVISOR_VMWARE(r) (r.ebx == 0x61774d56 && r.ecx == 0x4d566572 && r.edx == 0x65726177) /* VMwareVMware */ +#define CPUID_HYPERVISOR_KVM(r) (r.ebx == 0x4b4d564b && r.ecx == 0x564b4d56 && r.edx == 0x0000004d) /* KVMKVMKVM */ +static uint32 +x86_hypervisor_tsc_frequency_khz(void) +{ + CPUIDResult r = {0}; + +/* + * The hypervisor is determined using the 0x40000000 Hypervisor information + * leaf, which requires use of __cpuidex to set ECX to 0 to access it. + * + * The similar __get_cpuid_count function does not work as expected since it + * contains a check for __get_cpuid_max, which has been observed to be lower + * than the special Hypervisor leaf, despite it being available. + */ +#if defined(HAVE__CPUIDEX) + __cpuidex((int *) &r, 0x40000000, 0); + + if (r.eax >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(r) || CPUID_HYPERVISOR_KVM(r))) + { + __cpuidex((int *) &r, 0x40000010, 0); + if (r.eax > 0) + return r.eax; + } +#endif /* HAVE__CPUIDEX */ + + return 0; +} + + #endif /* defined(USE_SSE2) || defined(__i386__) */ diff --git a/src/test/modules/test_dsa/expected/test_dsa.out b/src/test/modules/test_dsa/expected/test_dsa.out index 266010e77fe9e..4b53a7de4a443 100644 --- a/src/test/modules/test_dsa/expected/test_dsa.out +++ b/src/test/modules/test_dsa/expected/test_dsa.out @@ -11,3 +11,19 @@ SELECT test_dsa_resowners(); (1 row) +-- Test allocations across a pre-defined range of pages. This covers enough +-- range to check for the case of odd-sized segments, without making the test +-- too slow. +SELECT test_dsa_allocate(1001, 2000, 100); + test_dsa_allocate +------------------- + +(1 row) + +-- Larger size with odd-sized segment. 
+SELECT test_dsa_allocate(6501, 6600, 100); + test_dsa_allocate +------------------- + +(1 row) + diff --git a/src/test/modules/test_dsa/sql/test_dsa.sql b/src/test/modules/test_dsa/sql/test_dsa.sql index c3d8db9437206..99b4a60dd14ca 100644 --- a/src/test/modules/test_dsa/sql/test_dsa.sql +++ b/src/test/modules/test_dsa/sql/test_dsa.sql @@ -2,3 +2,10 @@ CREATE EXTENSION test_dsa; SELECT test_dsa_basic(); SELECT test_dsa_resowners(); + +-- Test allocations across a pre-defined range of pages. This covers enough +-- range to check for the case of odd-sized segments, without making the test +-- too slow. +SELECT test_dsa_allocate(1001, 2000, 100); +-- Larger size with odd-sized segment. +SELECT test_dsa_allocate(6501, 6600, 100); diff --git a/src/test/modules/test_dsa/test_dsa--1.0.sql b/src/test/modules/test_dsa/test_dsa--1.0.sql index 2904cb23525e3..3ee2e44cc0068 100644 --- a/src/test/modules/test_dsa/test_dsa--1.0.sql +++ b/src/test/modules/test_dsa/test_dsa--1.0.sql @@ -10,3 +10,7 @@ CREATE FUNCTION test_dsa_basic() CREATE FUNCTION test_dsa_resowners() RETURNS pg_catalog.void AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION test_dsa_allocate(int, int, int) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_dsa/test_dsa.c b/src/test/modules/test_dsa/test_dsa.c index ed2a07c962fc6..edcab105de621 100644 --- a/src/test/modules/test_dsa/test_dsa.c +++ b/src/test/modules/test_dsa/test_dsa.c @@ -16,6 +16,7 @@ #include "storage/dsm_registry.h" #include "storage/lwlock.h" #include "utils/dsa.h" +#include "utils/freepage.h" #include "utils/resowner.h" PG_MODULE_MAGIC; @@ -120,3 +121,42 @@ test_dsa_resowners(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * test_dsa_allocate + * + * Test DSA allocation across a range of sizes to exercise the pagemap + * sizing logic in make_new_segment(). 
A fresh DSA is created for each + * iteration so that each allocation triggers a new segment creation, + * including the odd-sized segment path. + */ +PG_FUNCTION_INFO_V1(test_dsa_allocate); +Datum +test_dsa_allocate(PG_FUNCTION_ARGS) +{ + int start_num_pages = PG_GETARG_INT32(0); + int end_num_pages = PG_GETARG_INT32(1); + int step = PG_GETARG_INT32(2); + size_t usable_pages; + int *tranche_id; + bool found; + dsa_area *a; + dsa_pointer dp; + + if (start_num_pages > end_num_pages) + elog(ERROR, "incorrect start and end parameters"); + + tranche_id = GetNamedDSMSegment("test_dsa", sizeof(int), + init_tranche, &found, NULL); + + for (usable_pages = start_num_pages; usable_pages < end_num_pages; usable_pages += step) + { + a = dsa_create(*tranche_id); + dp = dsa_allocate(a, usable_pages * FPM_PAGE_SIZE); + + dsa_free(a, dp); + dsa_detach(a); + } + + PG_RETURN_VOID(); +} diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build index 8d20488952e69..36d789720a3c8 100644 --- a/src/test/recovery/meson.build +++ b/src/test/recovery/meson.build @@ -60,6 +60,7 @@ tests += { 't/049_wait_for_lsn.pl', 't/050_redo_segment_missing.pl', 't/051_effective_wal_level.pl', + 't/052_checkpoint_segment_missing.pl', ], }, } diff --git a/src/test/recovery/t/052_checkpoint_segment_missing.pl b/src/test/recovery/t/052_checkpoint_segment_missing.pl new file mode 100644 index 0000000000000..da54d141f0dea --- /dev/null +++ b/src/test/recovery/t/052_checkpoint_segment_missing.pl @@ -0,0 +1,59 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group +# +# Verify crash recovery behavior when the WAL segment containing the +# checkpoint record referenced by pg_controldata is missing. This +# checks the code path where there is no backup_label file, where the +# startup process should fail with FATAL and log a message about the +# missing checkpoint record. 
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $node = PostgreSQL::Test::Cluster->new('testnode');
+$node->init;
+$node->append_conf('postgresql.conf', 'log_checkpoints = on');
+$node->start;
+
+# Force a checkpoint so that pg_controldata points to a checkpoint record we
+# can target.
+$node->safe_psql('postgres', 'CHECKPOINT;');
+
+# Retrieve the checkpoint LSN and derive the WAL segment name.
+my $checkpoint_walfile = $node->safe_psql('postgres',
+	"SELECT pg_walfile_name(checkpoint_lsn) FROM pg_control_checkpoint()");
+
+ok($checkpoint_walfile ne '',
+	"derived checkpoint WAL file name: $checkpoint_walfile");
+
+# Stop the node.
+$node->stop('immediate');
+
+# Remove the WAL segment containing the checkpoint record.
+my $walpath = $node->data_dir . "/pg_wal/$checkpoint_walfile";
+ok(-f $walpath, "checkpoint WAL file exists before deletion: $walpath");
+
+unlink $walpath
+  or die "could not remove WAL file $walpath: $!";
+
+ok(!-e $walpath, "checkpoint WAL file removed: $walpath");
+
+# Use run_log instead of node->start because this test expects that
+# the server ends with an error during recovery.
+run_log(
+	[
+		'pg_ctl',
+		'--pgdata' => $node->data_dir,
+		'--log' => $node->logfile,
+		'start',
+	]);
+
+# Confirm that recovery has failed as expected.
+my $logfile = slurp_file($node->logfile()); +ok( $logfile =~ + qr/FATAL: .* could not locate a valid checkpoint record at .*/, + "FATAL logged for missing checkpoint record (no backup_label path)"); + +done_testing(); diff --git a/src/test/regress/expected/cluster.out b/src/test/regress/expected/cluster.out index 4d40a6809ab46..24b0b1a8fce86 100644 --- a/src/test/regress/expected/cluster.out +++ b/src/test/regress/expected/cluster.out @@ -495,6 +495,46 @@ ALTER TABLE clstrpart SET WITHOUT CLUSTER; ERROR: cannot mark index clustered in partitioned table ALTER TABLE clstrpart CLUSTER ON clstrpart_idx; ERROR: cannot mark index clustered in partitioned table +-- and they cannot get an index-ordered REPACK without an explicit index name +REPACK clstrpart USING INDEX; +ERROR: cannot execute REPACK on partitioned table "clstrpart" USING INDEX with no index name +-- Check that REPACK sets new relfilenodes: it should process exactly the same +-- tables as CLUSTER did. +DROP TABLE old_cluster_info; +DROP TABLE new_cluster_info; +CREATE TEMP TABLE old_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +REPACK clstrpart USING INDEX clstrpart_idx; +CREATE TEMP TABLE new_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM old_cluster_info AS old JOIN new_cluster_info AS new USING (relname) ORDER BY relname COLLATE "C"; + relname | level | relkind | ?column? +-------------+-------+---------+---------- + clstrpart | 0 | p | t + clstrpart1 | 1 | p | t + clstrpart11 | 2 | r | f + clstrpart12 | 2 | p | t + clstrpart2 | 1 | r | f + clstrpart3 | 1 | p | t + clstrpart33 | 2 | r | f +(7 rows) + +-- And finally the same for REPACK w/o index. 
+DROP TABLE old_cluster_info; +DROP TABLE new_cluster_info; +CREATE TEMP TABLE old_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +REPACK clstrpart; +CREATE TEMP TABLE new_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM old_cluster_info AS old JOIN new_cluster_info AS new USING (relname) ORDER BY relname COLLATE "C"; + relname | level | relkind | ?column? +-------------+-------+---------+---------- + clstrpart | 0 | p | t + clstrpart1 | 1 | p | t + clstrpart11 | 2 | r | f + clstrpart12 | 2 | p | t + clstrpart2 | 1 | r | f + clstrpart3 | 1 | p | t + clstrpart33 | 2 | r | f +(7 rows) + DROP TABLE clstrpart; -- Ownership of partitions is checked CREATE TABLE ptnowner(i int unique) PARTITION BY LIST (i); @@ -513,7 +553,7 @@ CREATE TEMP TABLE ptnowner_oldnodes AS JOIN pg_class AS c ON c.oid=tree.relid; SET SESSION AUTHORIZATION regress_ptnowner; CLUSTER ptnowner USING ptnowner_i_idx; -WARNING: permission denied to cluster "ptnowner2", skipping it +WARNING: permission denied to execute CLUSTER on "ptnowner2", skipping it RESET SESSION AUTHORIZATION; SELECT a.relname, a.relfilenode=b.relfilenode FROM pg_class a JOIN ptnowner_oldnodes b USING (oid) ORDER BY a.relname COLLATE "C"; @@ -665,6 +705,101 @@ SELECT * FROM clstr_expression WHERE -a = -3 ORDER BY -a, b; (4 rows) COMMIT; +---------------------------------------------------------------------- +-- +-- REPACK +-- +---------------------------------------------------------------------- +-- REPACK handles individual tables identically to CLUSTER, but it's worth +-- checking if it handles table hierarchies identically as well. 
+REPACK clstr_tst USING INDEX clstr_tst_c; +-- Verify that inheritance link still works +INSERT INTO clstr_tst_inh VALUES (0, 100, 'in child table 2'); +SELECT a,b,c,substring(d for 30), length(d) from clstr_tst; + a | b | c | substring | length +----+-----+------------------+--------------------------------+-------- + 10 | 14 | catorce | | + 18 | 5 | cinco | | + 9 | 4 | cuatro | | + 26 | 19 | diecinueve | | + 12 | 18 | dieciocho | | + 30 | 16 | dieciseis | | + 24 | 17 | diecisiete | | + 2 | 10 | diez | | + 23 | 12 | doce | | + 11 | 2 | dos | | + 25 | 9 | nueve | | + 31 | 8 | ocho | | + 1 | 11 | once | | + 28 | 15 | quince | | + 32 | 6 | seis | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000 + 29 | 7 | siete | | + 15 | 13 | trece | | + 22 | 30 | treinta | | + 17 | 32 | treinta y dos | | + 3 | 31 | treinta y uno | | + 5 | 3 | tres | | + 20 | 1 | uno | | + 6 | 20 | veinte | | + 14 | 25 | veinticinco | | + 21 | 24 | veinticuatro | | + 4 | 22 | veintidos | | + 19 | 29 | veintinueve | | + 16 | 28 | veintiocho | | + 27 | 26 | veintiseis | | + 13 | 27 | veintisiete | | + 7 | 23 | veintitres | | + 8 | 21 | veintiuno | | + 0 | 100 | in child table | | + 0 | 100 | in child table 2 | | +(34 rows) + +-- Verify that foreign key link still works +INSERT INTO clstr_tst (b, c) VALUES (1111, 'this should fail'); +ERROR: insert or update on table "clstr_tst" violates foreign key constraint "clstr_tst_con" +DETAIL: Key (b)=(1111) is not present in table "clstr_tst_s". +SELECT conname FROM pg_constraint WHERE conrelid = 'clstr_tst'::regclass +ORDER BY 1; + conname +---------------------- + clstr_tst_a_not_null + clstr_tst_con + clstr_tst_pkey +(3 rows) + +-- Verify partial analyze works +REPACK (ANALYZE) clstr_tst (a); +REPACK (ANALYZE) clstr_tst; +REPACK (VERBOSE) clstr_tst (a); +ERROR: ANALYZE option must be specified when a column list is provided +-- REPACK w/o argument performs no ordering, so we can only check which tables +-- have the relfilenode changed. 
+RESET SESSION AUTHORIZATION; +CREATE TEMP TABLE relnodes_old AS +(SELECT relname, relfilenode +FROM pg_class +WHERE relname IN ('clstr_1', 'clstr_2', 'clstr_3')); +SET SESSION AUTHORIZATION regress_clstr_user; +SET client_min_messages = ERROR; -- order of "skipping" warnings may vary +REPACK; +RESET client_min_messages; +RESET SESSION AUTHORIZATION; +CREATE TEMP TABLE relnodes_new AS +(SELECT relname, relfilenode +FROM pg_class +WHERE relname IN ('clstr_1', 'clstr_2', 'clstr_3')); +-- Do the actual comparison. Unlike CLUSTER, clstr_3 should have been +-- processed because there is nothing like clustering index here. +SELECT o.relname FROM relnodes_old o +JOIN relnodes_new n ON o.relname = n.relname +WHERE o.relfilenode <> n.relfilenode +ORDER BY o.relname; + relname +--------- + clstr_1 + clstr_3 +(2 rows) + -- clean up DROP TABLE clustertest; DROP TABLE clstr_1; diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out index cce49e509abe4..6af54d9803f6d 100644 --- a/src/test/regress/expected/foreign_data.out +++ b/src/test/regress/expected/foreign_data.out @@ -828,10 +828,13 @@ COMMENT ON COLUMN ft1.c1 IS NULL; ALTER FOREIGN TABLE ft1 ADD COLUMN c4 integer; ALTER FOREIGN TABLE ft1 ADD COLUMN c5 integer DEFAULT 0; ALTER FOREIGN TABLE ft1 ADD COLUMN c6 integer; +ALTER FOREIGN TABLE ft1 ADD COLUMN IF NOT EXISTS c6 integer; +NOTICE: column "c6" of relation "ft1" already exists, skipping ALTER FOREIGN TABLE ft1 ADD COLUMN c7 integer NOT NULL; ALTER FOREIGN TABLE ft1 ADD COLUMN c8 integer; ALTER FOREIGN TABLE ft1 ADD COLUMN c9 integer; ALTER FOREIGN TABLE ft1 ADD COLUMN c10 integer OPTIONS (p1 'v1'); +ALTER FOREIGN TABLE ft1 ADD c11 integer; ALTER FOREIGN TABLE ft1 ALTER COLUMN c4 SET DEFAULT 0; ALTER FOREIGN TABLE ft1 ALTER COLUMN c5 DROP DEFAULT; ALTER FOREIGN TABLE ft1 ALTER COLUMN c6 SET NOT NULL; @@ -863,6 +866,7 @@ ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 SET STORAGE PLAIN; c8 | text | | | | (p2 'V2') | plain | | c9 | 
integer | | | | | plain | | c10 | integer | | | | (p1 'v1') | plain | | + c11 | integer | | | | | plain | | Check constraints: "ft1_c2_check" CHECK (c2 <> ''::text) "ft1_c3_check" CHECK (c3 >= '01-01-1994'::date AND c3 <= '01-31-1994'::date) @@ -897,6 +901,7 @@ ERROR: column "no_column" of relation "ft1" does not exist ALTER FOREIGN TABLE ft1 DROP COLUMN IF EXISTS no_column; NOTICE: column "no_column" of relation "ft1" does not exist, skipping ALTER FOREIGN TABLE ft1 DROP COLUMN c9; +ALTER FOREIGN TABLE ft1 DROP c11; ALTER FOREIGN TABLE ft1 ADD COLUMN c11 serial; ALTER FOREIGN TABLE ft1 SET SCHEMA foreign_schema; ALTER FOREIGN TABLE ft1 SET TABLESPACE ts; -- ERROR @@ -931,6 +936,8 @@ ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c4 integer; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c6 integer; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN IF NOT EXISTS c6 integer; +NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c7 integer NOT NULL; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c8 integer; @@ -939,6 +946,8 @@ ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c9 integer; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c10 integer OPTIONS (p1 'v1'); NOTICE: relation "doesnt_exist_ft1" does not exist, skipping +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD c11 integer; +NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ALTER COLUMN c6 SET NOT NULL; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ALTER COLUMN c7 DROP NOT NULL; @@ -960,10 +969,14 @@ ALTER FOREIGN TABLE 
IF EXISTS doesnt_exist_ft1 OWNER TO regress_test_role; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 OPTIONS (DROP delimiter, SET quote '~', ADD escape '@'); NOTICE: relation "doesnt_exist_ft1" does not exist, skipping +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP COLUMN no_column; +NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP COLUMN IF EXISTS no_column; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP COLUMN c9; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP c11; +NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 SET SCHEMA foreign_schema; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 RENAME c1 TO foreign_column_1; diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out index fdec5b9ba52a9..1e6e020fea836 100644 --- a/src/test/regress/expected/incremental_sort.out +++ b/src/test/regress/expected/incremental_sort.out @@ -1450,21 +1450,23 @@ explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1 set enable_incremental_sort = on; explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1; - QUERY PLAN ----------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------- Limit -> Incremental Sort Sort Key: a, b, (sum(c)) Presorted Key: a, b - -> GroupAggregate + -> Finalize GroupAggregate Group Key: a, b -> Gather Merge Workers Planned: 2 - -> Incremental Sort - Sort Key: a, b - Presorted Key: a - -> Parallel Index Scan using t_a_idx on t -(12 rows) + -> Partial GroupAggregate + Group Key: a, b + 
-> Incremental Sort + Sort Key: a, b + Presorted Key: a + -> Parallel Index Scan using t_a_idx on t +(14 rows) -- Incremental sort vs. set operations with varno 0 set enable_hashagg to off; @@ -1580,8 +1582,8 @@ from tenk1 t1 join tenk1 t2 on t1.unique1 = t2.unique2 join tenk1 t3 on t2.unique1 = t3.unique1 order by count(*); - QUERY PLAN ------------------------------------------------------------------------------------------------ + QUERY PLAN +---------------------------------------------------------------------------------------------------- Sort Sort Key: (count(*)) -> Finalize Aggregate @@ -1591,10 +1593,10 @@ order by count(*); -> Parallel Hash Join Hash Cond: (t2.unique1 = t3.unique1) -> Parallel Hash Join - Hash Cond: (t1.unique1 = t2.unique2) - -> Parallel Index Only Scan using tenk1_unique1 on tenk1 t1 + Hash Cond: (t2.unique2 = t1.unique1) + -> Parallel Index Scan using tenk1_unique2 on tenk1 t2 -> Parallel Hash - -> Parallel Index Scan using tenk1_unique2 on tenk1 t2 + -> Parallel Index Only Scan using tenk1_unique1 on tenk1 t1 -> Parallel Hash -> Parallel Index Only Scan using tenk1_unique1 on tenk1 t3 (15 rows) diff --git a/src/test/regress/expected/join_hash.out b/src/test/regress/expected/join_hash.out index 4749f6ed70d5f..bc7cc76467ffa 100644 --- a/src/test/regress/expected/join_hash.out +++ b/src/test/regress/expected/join_hash.out @@ -76,8 +76,8 @@ insert into extremely_skewed update pg_class set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192 where relname = 'extremely_skewed'; --- Make a relation with a couple of enormous tuples. -create table wide as select generate_series(1, 2) as id, rpad('', 320000, 'x') as t; +-- Make a relation with several enormous tuples. 
+create table wide as select generate_series(1, 3) as id, rpad('', 320000, 'x') as t; alter table wide set (parallel_workers = 2); -- The "optimal" case: the hash table fits in memory; we plan for 1 -- batch, we stick to that number, and peak memory usage stays within @@ -922,7 +922,7 @@ set work_mem = '128kB'; set hash_mem_multiplier = 1.0; explain (costs off) select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); QUERY PLAN ---------------------------------------------------------------- Finalize Aggregate @@ -934,10 +934,11 @@ explain (costs off) -> Parallel Seq Scan on wide -> Parallel Hash -> Parallel Seq Scan on wide wide_1 -(9 rows) + Filter: (id < 3) +(10 rows) select length(max(s.t)) -from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); length -------- 320000 @@ -947,7 +948,7 @@ select final > 1 as multibatch from hash_join_batches( $$ select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); $$); multibatch ------------ diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index deb6e2ad6a94b..f373ad704b690 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2002,34 +2002,23 @@ pg_stat_progress_basebackup| SELECT pid, ELSE NULL::text END AS backup_type FROM pg_stat_get_progress_info('BASEBACKUP'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20); -pg_stat_progress_cluster| SELECT s.pid, - s.datid, - d.datname, 
- s.relid, - CASE s.param1 - WHEN 1 THEN 'CLUSTER'::text - WHEN 2 THEN 'VACUUM FULL'::text - ELSE NULL::text +pg_stat_progress_cluster| SELECT pid, + datid, + datname, + relid, + CASE + WHEN (command = ANY (ARRAY['CLUSTER'::text, 'VACUUM FULL'::text])) THEN command + WHEN (repack_index_relid = (0)::oid) THEN 'VACUUM FULL'::text + ELSE 'CLUSTER'::text END AS command, - CASE s.param2 - WHEN 0 THEN 'initializing'::text - WHEN 1 THEN 'seq scanning heap'::text - WHEN 2 THEN 'index scanning heap'::text - WHEN 3 THEN 'sorting tuples'::text - WHEN 4 THEN 'writing new heap'::text - WHEN 5 THEN 'swapping relation files'::text - WHEN 6 THEN 'rebuilding index'::text - WHEN 7 THEN 'performing final cleanup'::text - ELSE NULL::text - END AS phase, - (s.param3)::oid AS cluster_index_relid, - s.param4 AS heap_tuples_scanned, - s.param5 AS heap_tuples_written, - s.param6 AS heap_blks_total, - s.param7 AS heap_blks_scanned, - s.param8 AS index_rebuild_count - FROM (pg_stat_get_progress_info('CLUSTER'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) - LEFT JOIN pg_database d ON ((s.datid = d.oid))); + phase, + repack_index_relid AS cluster_index_relid, + heap_tuples_scanned, + heap_tuples_written, + heap_blks_total, + heap_blks_scanned, + index_rebuild_count + FROM pg_stat_progress_repack; pg_stat_progress_copy| SELECT s.pid, s.datid, d.datname, @@ -2089,6 +2078,35 @@ pg_stat_progress_create_index| SELECT s.pid, s.param15 AS partitions_done FROM (pg_stat_get_progress_info('CREATE INDEX'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) LEFT JOIN pg_database d ON ((s.datid = d.oid))); +pg_stat_progress_repack| SELECT s.pid, + s.datid, + d.datname, + s.relid, + CASE s.param1 + WHEN 1 THEN 
'CLUSTER'::text + WHEN 2 THEN 'REPACK'::text + WHEN 3 THEN 'VACUUM FULL'::text + ELSE NULL::text + END AS command, + CASE s.param2 + WHEN 0 THEN 'initializing'::text + WHEN 1 THEN 'seq scanning heap'::text + WHEN 2 THEN 'index scanning heap'::text + WHEN 3 THEN 'sorting tuples'::text + WHEN 4 THEN 'writing new heap'::text + WHEN 5 THEN 'swapping relation files'::text + WHEN 6 THEN 'rebuilding index'::text + WHEN 7 THEN 'performing final cleanup'::text + ELSE NULL::text + END AS phase, + (s.param3)::oid AS repack_index_relid, + s.param4 AS heap_tuples_scanned, + s.param5 AS heap_tuples_written, + s.param6 AS heap_blks_total, + s.param7 AS heap_blks_scanned, + s.param8 AS index_rebuild_count + FROM (pg_stat_get_progress_info('REPACK'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) + LEFT JOIN pg_database d ON ((s.datid = d.oid))); pg_stat_progress_vacuum| SELECT s.pid, s.datid, d.datname, diff --git a/src/test/regress/expected/stats_import.out b/src/test/regress/expected/stats_import.out index 1f24e306f5b60..c7adb783da211 100644 --- a/src/test/regress/expected/stats_import.out +++ b/src/test/regress/expected/stats_import.out @@ -1,4 +1,158 @@ CREATE SCHEMA stats_import; +-- +-- Setup functions for set-difference convenience functions +-- +-- Test to detect any new columns added to pg_statistic. If any columns +-- are added, we may need to update pg_statistic_flat() and the facilities +-- we are testing. +SELECT COUNT(*) FROM pg_attribute + WHERE attrelid = 'pg_catalog.pg_statistic'::regclass AND + attnum > 0; + count +------- + 31 +(1 row) + +-- Create a view that is used purely for the type based on pg_statistic. 
+CREATE VIEW stats_import.pg_statistic_flat_t AS + SELECT + a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, + s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, + s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, + s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, + s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, + s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, + s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, + s.stavalues5::text AS sv5 + FROM pg_statistic s + JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum + WHERE FALSE; +-- Function to retrieve data used for diff comparisons between two +-- relations based on the contents of pg_statistic. +CREATE FUNCTION stats_import.pg_statistic_flat(p_relname text) +RETURNS SETOF stats_import.pg_statistic_flat_t +BEGIN ATOMIC + SELECT a.attname, s.stainherit, s.stanullfrac, s.stawidth, + s.stadistinct, s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, + s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, s.stacoll1, s.stacoll2, + s.stacoll3, s.stacoll4, s.stacoll5, s.stanumbers1, s.stanumbers2, + s.stanumbers3, s.stanumbers4, s.stanumbers5, s.stavalues1::text, + s.stavalues2::text, s.stavalues3::text, + s.stavalues4::text, s.stavalues5::text + FROM pg_statistic s + JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum + JOIN pg_class c ON c.oid = a.attrelid + WHERE c.relnamespace = 'stats_import'::regnamespace + AND c.relname = p_relname; +END; +-- Comparison function for pg_statistic. The two relations defined by +-- the function caller are compared. 
+CREATE FUNCTION stats_import.pg_statistic_get_difference(a text, b text) +RETURNS TABLE (relname text, stats stats_import.pg_statistic_flat_t) +BEGIN ATOMIC + WITH aset AS (SELECT * FROM stats_import.pg_statistic_flat(a)), + bset AS (SELECT * FROM stats_import.pg_statistic_flat(b)) + SELECT a AS relname, a_minus_b::stats_import.pg_statistic_flat_t + FROM (TABLE aset EXCEPT TABLE bset) AS a_minus_b + UNION ALL + SELECT b AS relname, b_minus_a::stats_import.pg_statistic_flat_t + FROM (TABLE bset EXCEPT TABLE aset) AS b_minus_a; +END; +-- Test to detect any new columns added to pg_stats_ext. If any columns +-- are added, we may need to update pg_stats_ext_flat() and the facilities +-- we are testing. +SELECT COUNT(*) FROM pg_attribute + WHERE attrelid = 'pg_catalog.pg_stats_ext'::regclass AND + attnum > 0; + count +------- + 15 +(1 row) + +-- Create a view that is used purely for the type based on pg_stats_ext. +CREATE VIEW stats_import.pg_stats_ext_flat_t AS + SELECT inherited, n_distinct, dependencies, most_common_vals, + most_common_freqs, most_common_base_freqs + FROM pg_stats_ext + WHERE FALSE; +-- Function to retrieve data used for diff comparisons between two +-- relations based on the contents of pg_stats_ext. +CREATE FUNCTION stats_import.pg_stats_ext_flat(p_statname text) +RETURNS SETOF stats_import.pg_stats_ext_flat_t +BEGIN ATOMIC + SELECT inherited, n_distinct, dependencies, most_common_vals, + most_common_freqs, most_common_base_freqs + FROM pg_stats_ext + WHERE statistics_schemaname = 'stats_import' + AND statistics_name = p_statname; +END; +-- Comparison function for pg_stats_ext. The two relations defined by +-- the function caller are compared. 
+CREATE FUNCTION stats_import.pg_stats_ext_get_difference(a text, b text) +RETURNS TABLE (statname text, stats stats_import.pg_stats_ext_flat_t) +BEGIN ATOMIC + WITH aset AS (SELECT * FROM stats_import.pg_stats_ext_flat(a)), + bset AS (SELECT * FROM stats_import.pg_stats_ext_flat(b)) + SELECT a AS relname, a_minus_b::stats_import.pg_stats_ext_flat_t + FROM (TABLE aset EXCEPT TABLE bset) AS a_minus_b + UNION ALL + SELECT b AS relname, b_minus_a::stats_import.pg_stats_ext_flat_t + FROM (TABLE bset EXCEPT TABLE aset) AS b_minus_a; +END; +-- Test to detect any new columns added to pg_stats_ext_exprs. If any columns +-- are added, we may need to update pg_stats_ext_exprs_flat() and the facilities +-- we are testing. +SELECT COUNT(*) FROM pg_attribute + WHERE attrelid = 'pg_catalog.pg_stats_ext_exprs'::regclass AND + attnum > 0; + count +------- + 20 +(1 row) + +-- Create a view that is used purely for the type based on pg_stats_ext_exprs. +CREATE VIEW stats_import.pg_stats_ext_exprs_flat_t AS + SELECT inherited, null_frac, avg_width, n_distinct, + most_common_vals::text AS most_common_vals, + most_common_freqs, histogram_bounds::text AS histogram_bounds, + correlation, most_common_elems::text AS most_common_elems, + most_common_elem_freqs, elem_count_histogram, + range_length_histogram::text AS range_length_histogram, + range_empty_frac, range_bounds_histogram::text AS range_bounds_histogram + FROM pg_stats_ext_exprs AS n + WHERE FALSE; +-- Function to retrieve data used for diff comparisons between two +-- relations based on the contents of pg_stats_ext_exprs. 
+CREATE FUNCTION stats_import.pg_stats_ext_exprs_flat(p_statname text) +RETURNS SETOF stats_import.pg_stats_ext_exprs_flat_t +BEGIN ATOMIC + SELECT inherited, null_frac, avg_width, n_distinct, + most_common_vals::text AS most_common_vals, + most_common_freqs, histogram_bounds::text AS histogram_bounds, + correlation, most_common_elems::text AS most_common_elems, + most_common_elem_freqs, elem_count_histogram, + range_length_histogram::text AS range_length_histogram, + range_empty_frac, range_bounds_histogram::text AS range_bounds_histogram + FROM pg_stats_ext_exprs AS n + WHERE n.statistics_schemaname = 'stats_import' AND + n.statistics_name = p_statname; +END; +-- Comparison function for pg_stats_ext_exprs. The two relations defined by +-- the function caller are compared. +CREATE FUNCTION stats_import.pg_stats_ext_exprs_get_difference(a text, b text) +RETURNS TABLE (statname text, stats stats_import.pg_stats_ext_exprs_flat_t) +BEGIN ATOMIC + WITH aset AS (SELECT * FROM stats_import.pg_stats_ext_exprs_flat(a)), + bset AS (SELECT * FROM stats_import.pg_stats_ext_exprs_flat(b)) + SELECT a AS relname, a_minus_b::stats_import.pg_stats_ext_exprs_flat_t + FROM (TABLE aset EXCEPT TABLE bset) AS a_minus_b + UNION ALL + SELECT b AS relname, b_minus_a::stats_import.pg_stats_ext_exprs_flat_t + FROM (TABLE bset EXCEPT TABLE aset) AS b_minus_a; +END; +-- +-- Schema setup. 
+-- CREATE TYPE stats_import.complex_type AS ( a integer, b real, @@ -1220,124 +1374,14 @@ ORDER BY c.relname; test_clone | 5 (4 rows) --- check test minus test_clone -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'test' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.test'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'test' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.test_clone'::regclass; - attname | stainherit | stanullfrac | stawidth | stadistinct | stakind1 | stakind2 | stakind3 | stakind4 | stakind5 | staop1 | staop2 | staop3 | staop4 | staop5 | stacoll1 | stacoll2 | stacoll3 | stacoll4 | stacoll5 | stanumbers1 | stanumbers2 | stanumbers3 | stanumbers4 | stanumbers5 | sv1 | sv2 | sv3 | sv4 | sv5 | direction 
----------+------------+-------------+----------+-------------+----------+----------+----------+----------+----------+--------+--------+--------+--------+--------+----------+----------+----------+----------+----------+-------------+-------------+-------------+-------------+-------------+-----+-----+-----+-----+-----+----------- -(0 rows) - --- check test_clone minus test -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'test_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.test_clone'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'test_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.test'::regclass; - attname | stainherit | stanullfrac | stawidth | stadistinct | stakind1 | stakind2 | stakind3 | stakind4 | stakind5 | staop1 | staop2 | staop3 | staop4 | staop5 | stacoll1 | stacoll2 | stacoll3 | stacoll4 | stacoll5 | stanumbers1 | stanumbers2 | stanumbers3 | stanumbers4 | stanumbers5 | sv1 | sv2 | sv3 | sv4 | sv5 | 
direction ----------+------------+-------------+----------+-------------+----------+----------+----------+----------+----------+--------+--------+--------+--------+--------+----------+----------+----------+----------+----------+-------------+-------------+-------------+-------------+-------------+-----+-----+-----+-----+-----+----------- -(0 rows) - --- check is_odd minus is_odd_clone -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'is_odd' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.is_odd'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'is_odd' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.is_odd_clone'::regclass; - attname | stainherit | stanullfrac | stawidth | stadistinct | stakind1 | stakind2 | stakind3 | stakind4 | stakind5 | staop1 | staop2 | staop3 | staop4 | staop5 | stacoll1 | stacoll2 | stacoll3 | stacoll4 | stacoll5 | stanumbers1 | stanumbers2 | stanumbers3 | stanumbers4 | stanumbers5 | sv1 | sv2 | sv3 | sv4 | 
sv5 | direction ----------+------------+-------------+----------+-------------+----------+----------+----------+----------+----------+--------+--------+--------+--------+--------+----------+----------+----------+----------+----------+-------------+-------------+-------------+-------------+-------------+-----+-----+-----+-----+-----+----------- +SELECT relname, (stats).* +FROM stats_import.pg_statistic_get_difference('test', 'test_clone') +\gx (0 rows) --- check is_odd_clone minus is_odd -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'is_odd_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.is_odd_clone'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'is_odd_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.is_odd'::regclass; - attname | stainherit | stanullfrac | stawidth | stadistinct | stakind1 | stakind2 | stakind3 | stakind4 | stakind5 | staop1 | staop2 | staop3 | staop4 | staop5 | stacoll1 | stacoll2 | stacoll3 | 
stacoll4 | stacoll5 | stanumbers1 | stanumbers2 | stanumbers3 | stanumbers4 | stanumbers5 | sv1 | sv2 | sv3 | sv4 | sv5 | direction ----------+------------+-------------+----------+-------------+----------+----------+----------+----------+----------+--------+--------+--------+--------+--------+----------+----------+----------+----------+----------+-------------+-------------+-------------+-------------+-------------+-----+-----+-----+-----+-----+----------- +SELECT relname, (stats).* +FROM stats_import.pg_statistic_get_difference('is_odd', 'is_odd_clone') +\gx (0 rows) -- attribute stats exist before a clear, but not after @@ -3169,108 +3213,14 @@ AND e.statistics_name = 'test_stat'; test_stat | t (1 row) --- Set difference old MINUS new. -SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat' -EXCEPT -SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone'; - inherited | n_distinct | dependencies | most_common_vals | most_common_freqs | most_common_base_freqs ------------+------------+--------------+------------------+-------------------+------------------------ -(0 rows) - --- Set difference new MINUS old. 
-SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone' -EXCEPT -SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat'; - inherited | n_distinct | dependencies | most_common_vals | most_common_freqs | most_common_base_freqs ------------+------------+--------------+------------------+-------------------+------------------------ -(0 rows) - --- Set difference for exprs: old MINUS new. -SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat' -EXCEPT -SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone'; - inherited | null_frac | avg_width | n_distinct | most_common_vals | most_common_freqs | histogram_bounds | 
correlation | most_common_elems | most_common_elem_freqs | elem_count_histogram | range_length_histogram | range_empty_frac | range_bounds_histogram ------------+-----------+-----------+------------+------------------+-------------------+------------------+-------------+-------------------+------------------------+----------------------+------------------------+------------------+------------------------ +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_get_difference('test_stat', 'test_stat_clone') +\gx (0 rows) --- Set difference for exprs: new MINUS old. -SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone' -EXCEPT -SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat'; - inherited | null_frac | avg_width | n_distinct | most_common_vals | most_common_freqs | histogram_bounds | correlation | most_common_elems | most_common_elem_freqs | elem_count_histogram | range_length_histogram | range_empty_frac | range_bounds_histogram 
------------+-----------+-----------+------------+------------------+-------------------+------------------+-------------+-------------------+------------------------+----------------------+------------------------+------------------+------------------------ +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_exprs_get_difference('test_stat', 'test_stat_clone') +\gx (0 rows) ANALYZE stats_import.test_mr; @@ -3316,112 +3266,18 @@ AND e.statistics_name = 'test_mr_stat'; test_mr_stat | t (1 row) --- Set difference old MINUS new. -SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat' -EXCEPT -SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone'; - inherited | n_distinct | dependencies | most_common_vals | most_common_freqs | most_common_base_freqs ------------+------------+--------------+------------------+-------------------+------------------------ -(0 rows) - --- Set difference new MINUS old. 
-SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone' -EXCEPT -SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat'; - inherited | n_distinct | dependencies | most_common_vals | most_common_freqs | most_common_base_freqs ------------+------------+--------------+------------------+-------------------+------------------------ -(0 rows) - --- Set difference for exprs: old MINUS new. -SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat' -EXCEPT -SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone'; - inherited | null_frac | avg_width | n_distinct | most_common_vals | most_common_freqs | 
histogram_bounds | correlation | most_common_elems | most_common_elem_freqs | elem_count_histogram | range_length_histogram | range_empty_frac | range_bounds_histogram ------------+-----------+-----------+------------+------------------+-------------------+------------------+-------------+-------------------+------------------------+----------------------+------------------------+------------------+------------------------ +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_get_difference('test_mr_stat', 'test_mr_stat_clone') +\gx (0 rows) --- Set difference for exprs: new MINUS old. -SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone' -EXCEPT -SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat'; - inherited | null_frac | avg_width | n_distinct | most_common_vals | most_common_freqs | histogram_bounds | correlation | most_common_elems | most_common_elem_freqs | elem_count_histogram | range_length_histogram | range_empty_frac | range_bounds_histogram 
------------+-----------+-----------+------------+------------------+-------------------+------------------+-------------+-------------------+------------------------+----------------------+------------------------+------------------+------------------------ +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_exprs_get_difference('test_mr_stat', 'test_mr_stat_clone') +\gx (0 rows) -- range_length_histogram, range_empty_frac, and range_bounds_histogram --- have been added to pg_stat_ext_exprs in PostgreSQL 19. When dumping +-- have been added to pg_stats_ext_exprs in PostgreSQL 19. When dumping -- expression statistics in a cluster with an older version, these fields -- are dumped as NULL, pg_restore_extended_stats() authorizing the partial -- restore state of the extended statistics data. This test emulates such @@ -3506,8 +3362,17 @@ SELECT COUNT(*) FROM stats_import.test_range_expr_null (1 row) DROP SCHEMA stats_import CASCADE; -NOTICE: drop cascades to 9 other objects -DETAIL: drop cascades to type stats_import.complex_type +NOTICE: drop cascades to 18 other objects +DETAIL: drop cascades to view stats_import.pg_statistic_flat_t +drop cascades to function stats_import.pg_statistic_flat(text) +drop cascades to function stats_import.pg_statistic_get_difference(text,text) +drop cascades to view stats_import.pg_stats_ext_flat_t +drop cascades to function stats_import.pg_stats_ext_flat(text) +drop cascades to function stats_import.pg_stats_ext_get_difference(text,text) +drop cascades to view stats_import.pg_stats_ext_exprs_flat_t +drop cascades to function stats_import.pg_stats_ext_exprs_flat(text) +drop cascades to function stats_import.pg_stats_ext_exprs_get_difference(text,text) +drop cascades to type stats_import.complex_type drop cascades to table stats_import.test drop cascades to table stats_import.test_mr drop cascades to table stats_import.part_parent diff --git a/src/test/regress/sql/cluster.sql b/src/test/regress/sql/cluster.sql index 
b7115f861044d..f90c6ec200b4a 100644 --- a/src/test/regress/sql/cluster.sql +++ b/src/test/regress/sql/cluster.sql @@ -76,7 +76,6 @@ INSERT INTO clstr_tst (b, c) VALUES (1111, 'this should fail'); SELECT conname FROM pg_constraint WHERE conrelid = 'clstr_tst'::regclass ORDER BY 1; - SELECT relname, relkind, EXISTS(SELECT 1 FROM pg_class WHERE oid = c.reltoastrelid) AS hastoast FROM pg_class c WHERE relname LIKE 'clstr_tst%' ORDER BY relname; @@ -229,6 +228,26 @@ SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM o CLUSTER clstrpart; ALTER TABLE clstrpart SET WITHOUT CLUSTER; ALTER TABLE clstrpart CLUSTER ON clstrpart_idx; +-- and they cannot get an index-ordered REPACK without an explicit index name +REPACK clstrpart USING INDEX; + +-- Check that REPACK sets new relfilenodes: it should process exactly the same +-- tables as CLUSTER did. +DROP TABLE old_cluster_info; +DROP TABLE new_cluster_info; +CREATE TEMP TABLE old_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +REPACK clstrpart USING INDEX clstrpart_idx; +CREATE TEMP TABLE new_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM old_cluster_info AS old JOIN new_cluster_info AS new USING (relname) ORDER BY relname COLLATE "C"; + +-- And finally the same for REPACK w/o index. 
+DROP TABLE old_cluster_info; +DROP TABLE new_cluster_info; +CREATE TEMP TABLE old_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +REPACK clstrpart; +CREATE TEMP TABLE new_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ; +SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM old_cluster_info AS old JOIN new_cluster_info AS new USING (relname) ORDER BY relname COLLATE "C"; + DROP TABLE clstrpart; -- Ownership of partitions is checked @@ -313,6 +332,57 @@ EXPLAIN (COSTS OFF) SELECT * FROM clstr_expression WHERE -a = -3 ORDER BY -a, b; SELECT * FROM clstr_expression WHERE -a = -3 ORDER BY -a, b; COMMIT; +---------------------------------------------------------------------- +-- +-- REPACK +-- +---------------------------------------------------------------------- + +-- REPACK handles individual tables identically to CLUSTER, but it's worth +-- checking if it handles table hierarchies identically as well. +REPACK clstr_tst USING INDEX clstr_tst_c; + +-- Verify that inheritance link still works +INSERT INTO clstr_tst_inh VALUES (0, 100, 'in child table 2'); +SELECT a,b,c,substring(d for 30), length(d) from clstr_tst; + +-- Verify that foreign key link still works +INSERT INTO clstr_tst (b, c) VALUES (1111, 'this should fail'); + +SELECT conname FROM pg_constraint WHERE conrelid = 'clstr_tst'::regclass +ORDER BY 1; + +-- Verify partial analyze works +REPACK (ANALYZE) clstr_tst (a); +REPACK (ANALYZE) clstr_tst; +REPACK (VERBOSE) clstr_tst (a); + +-- REPACK w/o argument performs no ordering, so we can only check which tables +-- have the relfilenode changed. 
+RESET SESSION AUTHORIZATION; +CREATE TEMP TABLE relnodes_old AS +(SELECT relname, relfilenode +FROM pg_class +WHERE relname IN ('clstr_1', 'clstr_2', 'clstr_3')); + +SET SESSION AUTHORIZATION regress_clstr_user; +SET client_min_messages = ERROR; -- order of "skipping" warnings may vary +REPACK; +RESET client_min_messages; + +RESET SESSION AUTHORIZATION; +CREATE TEMP TABLE relnodes_new AS +(SELECT relname, relfilenode +FROM pg_class +WHERE relname IN ('clstr_1', 'clstr_2', 'clstr_3')); + +-- Do the actual comparison. Unlike CLUSTER, clstr_3 should have been +-- processed because there is nothing like clustering index here. +SELECT o.relname FROM relnodes_old o +JOIN relnodes_new n ON o.relname = n.relname +WHERE o.relfilenode <> n.relfilenode +ORDER BY o.relname; + -- clean up DROP TABLE clustertest; DROP TABLE clstr_1; diff --git a/src/test/regress/sql/foreign_data.sql b/src/test/regress/sql/foreign_data.sql index aa147b14a90a0..084d5559e098e 100644 --- a/src/test/regress/sql/foreign_data.sql +++ b/src/test/regress/sql/foreign_data.sql @@ -383,10 +383,12 @@ COMMENT ON COLUMN ft1.c1 IS NULL; ALTER FOREIGN TABLE ft1 ADD COLUMN c4 integer; ALTER FOREIGN TABLE ft1 ADD COLUMN c5 integer DEFAULT 0; ALTER FOREIGN TABLE ft1 ADD COLUMN c6 integer; +ALTER FOREIGN TABLE ft1 ADD COLUMN IF NOT EXISTS c6 integer; ALTER FOREIGN TABLE ft1 ADD COLUMN c7 integer NOT NULL; ALTER FOREIGN TABLE ft1 ADD COLUMN c8 integer; ALTER FOREIGN TABLE ft1 ADD COLUMN c9 integer; ALTER FOREIGN TABLE ft1 ADD COLUMN c10 integer OPTIONS (p1 'v1'); +ALTER FOREIGN TABLE ft1 ADD c11 integer; ALTER FOREIGN TABLE ft1 ALTER COLUMN c4 SET DEFAULT 0; ALTER FOREIGN TABLE ft1 ALTER COLUMN c5 DROP DEFAULT; @@ -419,6 +421,7 @@ ALTER FOREIGN TABLE ft1 OPTIONS (DROP delimiter, SET quote '~', ADD escape '@'); ALTER FOREIGN TABLE ft1 DROP COLUMN no_column; -- ERROR ALTER FOREIGN TABLE ft1 DROP COLUMN IF EXISTS no_column; ALTER FOREIGN TABLE ft1 DROP COLUMN c9; +ALTER FOREIGN TABLE ft1 DROP c11; ALTER FOREIGN TABLE 
ft1 ADD COLUMN c11 serial; ALTER FOREIGN TABLE ft1 SET SCHEMA foreign_schema; ALTER FOREIGN TABLE ft1 SET TABLESPACE ts; -- ERROR @@ -430,10 +433,12 @@ ALTER FOREIGN TABLE foreign_schema.ft1 RENAME TO foreign_table_1; -- alter noexisting table ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c4 integer; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c6 integer; +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN IF NOT EXISTS c6 integer; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c7 integer NOT NULL; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c8 integer; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c9 integer; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c10 integer OPTIONS (p1 'v1'); +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD c11 integer; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ALTER COLUMN c6 SET NOT NULL; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ALTER COLUMN c7 DROP NOT NULL; @@ -447,8 +452,10 @@ ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP CONSTRAINT IF EXISTS no_cons ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP CONSTRAINT ft1_c1_check; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 OWNER TO regress_test_role; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 OPTIONS (DROP delimiter, SET quote '~', ADD escape '@'); +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP COLUMN no_column; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP COLUMN IF EXISTS no_column; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP COLUMN c9; +ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 DROP c11; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 SET SCHEMA foreign_schema; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 RENAME c1 TO foreign_column_1; ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 RENAME TO foreign_table_1; diff --git a/src/test/regress/sql/join_hash.sql b/src/test/regress/sql/join_hash.sql index 49d3fd6185629..53db1754bb261 100644 --- 
a/src/test/regress/sql/join_hash.sql +++ b/src/test/regress/sql/join_hash.sql @@ -83,8 +83,8 @@ update pg_class set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192 where relname = 'extremely_skewed'; --- Make a relation with a couple of enormous tuples. -create table wide as select generate_series(1, 2) as id, rpad('', 320000, 'x') as t; +-- Make a relation with several enormous tuples. +create table wide as select generate_series(1, 3) as id, rpad('', 320000, 'x') as t; alter table wide set (parallel_workers = 2); -- The "optimal" case: the hash table fits in memory; we plan for 1 @@ -496,14 +496,14 @@ set work_mem = '128kB'; set hash_mem_multiplier = 1.0; explain (costs off) select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); select length(max(s.t)) -from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); select final > 1 as multibatch from hash_join_batches( $$ select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id); $$); rollback to settings; diff --git a/src/test/regress/sql/stats_import.sql b/src/test/regress/sql/stats_import.sql index 61535a971dc1d..0518bbf6f4256 100644 --- a/src/test/regress/sql/stats_import.sql +++ b/src/test/regress/sql/stats_import.sql @@ -1,5 +1,157 @@ CREATE SCHEMA stats_import; +-- +-- Setup functions for set-difference convenience functions +-- + +-- Test to detect any new columns added to pg_statistic. If any columns +-- are added, we may need to update pg_statistic_flat() and the facilities +-- we are testing. 
+SELECT COUNT(*) FROM pg_attribute + WHERE attrelid = 'pg_catalog.pg_statistic'::regclass AND + attnum > 0; + +-- Create a view that is used purely for the type based on pg_statistic. +CREATE VIEW stats_import.pg_statistic_flat_t AS + SELECT + a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, + s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, + s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, + s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, + s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, + s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, + s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, + s.stavalues5::text AS sv5 + FROM pg_statistic s + JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum + WHERE FALSE; + +-- Function to retrieve data used for diff comparisons between two +-- relations based on the contents of pg_statistic. +CREATE FUNCTION stats_import.pg_statistic_flat(p_relname text) +RETURNS SETOF stats_import.pg_statistic_flat_t +BEGIN ATOMIC + SELECT a.attname, s.stainherit, s.stanullfrac, s.stawidth, + s.stadistinct, s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, + s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, s.stacoll1, s.stacoll2, + s.stacoll3, s.stacoll4, s.stacoll5, s.stanumbers1, s.stanumbers2, + s.stanumbers3, s.stanumbers4, s.stanumbers5, s.stavalues1::text, + s.stavalues2::text, s.stavalues3::text, + s.stavalues4::text, s.stavalues5::text + FROM pg_statistic s + JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum + JOIN pg_class c ON c.oid = a.attrelid + WHERE c.relnamespace = 'stats_import'::regnamespace + AND c.relname = p_relname; +END; + +-- Comparison function for pg_statistic. The two relations defined by +-- the function caller are compared. 
+CREATE FUNCTION stats_import.pg_statistic_get_difference(a text, b text) +RETURNS TABLE (relname text, stats stats_import.pg_statistic_flat_t) +BEGIN ATOMIC + WITH aset AS (SELECT * FROM stats_import.pg_statistic_flat(a)), + bset AS (SELECT * FROM stats_import.pg_statistic_flat(b)) + SELECT a AS relname, a_minus_b::stats_import.pg_statistic_flat_t + FROM (TABLE aset EXCEPT TABLE bset) AS a_minus_b + UNION ALL + SELECT b AS relname, b_minus_a::stats_import.pg_statistic_flat_t + FROM (TABLE bset EXCEPT TABLE aset) AS b_minus_a; +END; + +-- Test to detect any new columns added to pg_stats_ext. If any columns +-- are added, we may need to update pg_stats_ext_flat() and the facilities +-- we are testing. +SELECT COUNT(*) FROM pg_attribute + WHERE attrelid = 'pg_catalog.pg_stats_ext'::regclass AND + attnum > 0; + +-- Create a view that is used purely for the type based on pg_stats_ext. +CREATE VIEW stats_import.pg_stats_ext_flat_t AS + SELECT inherited, n_distinct, dependencies, most_common_vals, + most_common_freqs, most_common_base_freqs + FROM pg_stats_ext + WHERE FALSE; + +-- Function to retrieve data used for diff comparisons between two +-- relations based on the contents of pg_stats_ext. +CREATE FUNCTION stats_import.pg_stats_ext_flat(p_statname text) +RETURNS SETOF stats_import.pg_stats_ext_flat_t +BEGIN ATOMIC + SELECT inherited, n_distinct, dependencies, most_common_vals, + most_common_freqs, most_common_base_freqs + FROM pg_stats_ext + WHERE statistics_schemaname = 'stats_import' + AND statistics_name = p_statname; +END; + +-- Comparison function for pg_stats_ext. The two relations defined by +-- the function caller are compared. 
+CREATE FUNCTION stats_import.pg_stats_ext_get_difference(a text, b text) +RETURNS TABLE (statname text, stats stats_import.pg_stats_ext_flat_t) +BEGIN ATOMIC + WITH aset AS (SELECT * FROM stats_import.pg_stats_ext_flat(a)), + bset AS (SELECT * FROM stats_import.pg_stats_ext_flat(b)) + SELECT a AS relname, a_minus_b::stats_import.pg_stats_ext_flat_t + FROM (TABLE aset EXCEPT TABLE bset) AS a_minus_b + UNION ALL + SELECT b AS relname, b_minus_a::stats_import.pg_stats_ext_flat_t + FROM (TABLE bset EXCEPT TABLE aset) AS b_minus_a; +END; + +-- Test to detect any new columns added to pg_stats_ext_exprs. If any columns +-- are added, we may need to update pg_stats_ext_exprs_flat() and the facilities +-- we are testing. +SELECT COUNT(*) FROM pg_attribute + WHERE attrelid = 'pg_catalog.pg_stats_ext_exprs'::regclass AND + attnum > 0; + +-- Create a view that is used purely for the type based on pg_stats_ext_exprs. +CREATE VIEW stats_import.pg_stats_ext_exprs_flat_t AS + SELECT inherited, null_frac, avg_width, n_distinct, + most_common_vals::text AS most_common_vals, + most_common_freqs, histogram_bounds::text AS histogram_bounds, + correlation, most_common_elems::text AS most_common_elems, + most_common_elem_freqs, elem_count_histogram, + range_length_histogram::text AS range_length_histogram, + range_empty_frac, range_bounds_histogram::text AS range_bounds_histogram + FROM pg_stats_ext_exprs AS n + WHERE FALSE; + +-- Function to retrieve data used for diff comparisons between two +-- relations based on the contents of pg_stats_ext_exprs. 
+CREATE FUNCTION stats_import.pg_stats_ext_exprs_flat(p_statname text)
+RETURNS SETOF stats_import.pg_stats_ext_exprs_flat_t
+BEGIN ATOMIC
+  SELECT inherited, null_frac, avg_width, n_distinct,
+    most_common_vals::text AS most_common_vals,
+    most_common_freqs, histogram_bounds::text AS histogram_bounds,
+    correlation, most_common_elems::text AS most_common_elems,
+    most_common_elem_freqs, elem_count_histogram,
+    range_length_histogram::text AS range_length_histogram,
+    range_empty_frac, range_bounds_histogram::text AS range_bounds_histogram
+  FROM pg_stats_ext_exprs AS n
+  WHERE n.statistics_schemaname = 'stats_import' AND
+    n.statistics_name = p_statname;
+END;
+
+-- Comparison function for pg_stats_ext_exprs. The two extended statistics
+-- objects named by the function caller are compared.
+CREATE FUNCTION stats_import.pg_stats_ext_exprs_get_difference(a text, b text)
+RETURNS TABLE (statname text, stats stats_import.pg_stats_ext_exprs_flat_t)
+BEGIN ATOMIC
+  WITH aset AS (SELECT * FROM stats_import.pg_stats_ext_exprs_flat(a)),
+       bset AS (SELECT * FROM stats_import.pg_stats_ext_exprs_flat(b))
+  SELECT a AS statname, a_minus_b::stats_import.pg_stats_ext_exprs_flat_t
+    FROM (TABLE aset EXCEPT TABLE bset) AS a_minus_b
+  UNION ALL
+  SELECT b AS statname, b_minus_a::stats_import.pg_stats_ext_exprs_flat_t
+    FROM (TABLE bset EXCEPT TABLE aset) AS b_minus_a;
+END;
+
+--
+-- Schema setup.
+-- CREATE TYPE stats_import.complex_type AS ( a integer, b real, @@ -884,113 +1036,13 @@ AND c.relname IN ('test', 'test_clone', 'is_odd', 'is_odd_clone') GROUP BY c.relname ORDER BY c.relname; --- check test minus test_clone -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'test' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.test'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'test' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.test_clone'::regclass; - --- check test_clone minus test -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - 
s.stavalues5::text AS sv5, 'test_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.test_clone'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'test_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.test'::regclass; - --- check is_odd minus is_odd_clone -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'is_odd' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.is_odd'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - 
s.stavalues5::text AS sv5, 'is_odd' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.is_odd_clone'::regclass; - --- check is_odd_clone minus is_odd -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'is_odd_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.is_odd_clone'::regclass -EXCEPT -SELECT - a.attname, s.stainherit, s.stanullfrac, s.stawidth, s.stadistinct, - s.stakind1, s.stakind2, s.stakind3, s.stakind4, s.stakind5, - s.staop1, s.staop2, s.staop3, s.staop4, s.staop5, - s.stacoll1, s.stacoll2, s.stacoll3, s.stacoll4, s.stacoll5, - s.stanumbers1, s.stanumbers2, s.stanumbers3, s.stanumbers4, s.stanumbers5, - s.stavalues1::text AS sv1, s.stavalues2::text AS sv2, - s.stavalues3::text AS sv3, s.stavalues4::text AS sv4, - s.stavalues5::text AS sv5, 'is_odd_clone' AS direction -FROM pg_statistic s -JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum -WHERE s.starelid = 'stats_import.is_odd'::regclass; +SELECT relname, (stats).* +FROM stats_import.pg_statistic_get_difference('test', 'test_clone') +\gx + +SELECT relname, (stats).* +FROM stats_import.pg_statistic_get_difference('is_odd', 'is_odd_clone') +\gx -- attribute stats exist before a clear, but not after SELECT COUNT(*) @@ -2171,96 +2223,14 @@ CROSS JOIN LATERAL ( WHERE e.statistics_schemaname = 'stats_import' AND e.statistics_name = 'test_stat'; --- Set difference old MINUS new. 
-SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat' -EXCEPT -SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone'; --- Set difference new MINUS old. -SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone' -EXCEPT -SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat'; - --- Set difference for exprs: old MINUS new. 
-SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat' -EXCEPT -SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone'; - --- Set difference for exprs: new MINUS old. 
-SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_stat_clone' -EXCEPT -SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_stat'; +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_get_difference('test_stat', 'test_stat_clone') +\gx + +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_exprs_get_difference('test_stat', 'test_stat_clone') +\gx + ANALYZE stats_import.test_mr; @@ -2302,99 +2272,16 @@ CROSS JOIN LATERAL ( WHERE e.statistics_schemaname = 'stats_import' AND e.statistics_name = 'test_mr_stat'; --- Set difference old MINUS new. 
-SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat' -EXCEPT -SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone'; --- Set difference new MINUS old. -SELECT n.inherited, - n.n_distinct, n.dependencies, n.most_common_vals, - n.most_common_freqs, n.most_common_base_freqs - FROM pg_stats_ext AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone' -EXCEPT -SELECT o.inherited, - o.n_distinct, o.dependencies, o.most_common_vals, - o.most_common_freqs, o.most_common_base_freqs - FROM pg_stats_ext AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat'; - --- Set difference for exprs: old MINUS new. 
-SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat' -EXCEPT -SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone'; - --- Set difference for exprs: new MINUS old. 
-SELECT n.inherited, - n.null_frac, n.avg_width, n.n_distinct, - n.most_common_vals::text AS most_common_vals, - n.most_common_freqs, - n.histogram_bounds::text AS histogram_bounds, - n.correlation, - n.most_common_elems::text AS most_common_elems, - n.most_common_elem_freqs, n.elem_count_histogram, - n.range_length_histogram::text AS range_length_histogram, - n.range_empty_frac, - n.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS n - WHERE n.statistics_schemaname = 'stats_import' AND - n.statistics_name = 'test_mr_stat_clone' -EXCEPT -SELECT o.inherited, - o.null_frac, o.avg_width, o.n_distinct, - o.most_common_vals::text AS most_common_vals, - o.most_common_freqs, - o.histogram_bounds::text AS histogram_bounds, - o.correlation, - o.most_common_elems::text AS most_common_elems, - o.most_common_elem_freqs, o.elem_count_histogram, - o.range_length_histogram::text AS range_length_histogram, - o.range_empty_frac, - o.range_bounds_histogram::text AS range_bounds_histogram - FROM pg_stats_ext_exprs AS o - WHERE o.statistics_schemaname = 'stats_import' AND - o.statistics_name = 'test_mr_stat'; +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_get_difference('test_mr_stat', 'test_mr_stat_clone') +\gx + +SELECT statname, (stats).* +FROM stats_import.pg_stats_ext_exprs_get_difference('test_mr_stat', 'test_mr_stat_clone') +\gx -- range_length_histogram, range_empty_frac, and range_bounds_histogram --- have been added to pg_stat_ext_exprs in PostgreSQL 19. When dumping +-- have been added to pg_stats_ext_exprs in PostgreSQL 19. When dumping -- expression statistics in a cluster with an older version, these fields -- are dumped as NULL, pg_restore_extended_stats() authorizing the partial -- restore state of the extended statistics data. 
This test emulates such diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 3250564d4ff67..a3b76886caa61 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -549,6 +549,7 @@ CostSelector Counters CoverExt CoverPos +CPUIDResult CreateAmStmt CreateCastStmt CreateConversionStmt @@ -2581,6 +2582,8 @@ ReorderBufferTupleCidEnt ReorderBufferTupleCidKey ReorderBufferUpdateProgressTxnCB ReorderTuple +RepackCommand +RepackStmt ReparameterizeForeignPathByChild_function ReplOriginId ReplOriginXactState @@ -2763,6 +2766,7 @@ SetConstraintStateData SetConstraintTriggerData SetExprState SetFunctionReturnMode +SetHintBitsState SetOp SetOpCmd SetOpPath @@ -3114,6 +3118,7 @@ TimeoutId TimeoutType Timestamp TimestampTz +TimingClockSourceType TmFromChar TmToChar ToastAttrInfo