diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index d3fea738ca33c..a57761c6facd3 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2533,6 +2533,60 @@ include_dir 'conf.d' + + Timing + + + + timing_clock_source (enum) + + timing_clock_source configuration parameter + + + + + Selects the method for making timing measurements using the OS or specialized CPU + instructions. Possible values are: + + + + auto (automatically chooses TSC clock source for modern CPUs, + otherwise uses the OS system clock) + + + + + system (measures timing using the OS system clock) + + + + + tsc (measures timing using the x86-64 Time-Stamp Counter (TSC) + by directly executing RDTSC/RDTSCP instructions, see below) + + + + The default is auto. + + + If enabled, the TSC clock source will use the RDTSC instruction for the x86-64 + Time-Stamp Counter (TSC) to perform certain time measurements, for example during + EXPLAIN ANALYZE. The RDTSC instruction has less overhead than going through the OS + clock source, which for an EXPLAIN ANALYZE statement will show timing closer to the + actual runtime when timing is off. For timings that require higher precision the + RDTSCP instruction is used, which avoids inaccuracies due to CPU instruction re-ordering. + Use of RDTSC/RDTSCP is not supported on older CPUs or hypervisors that don't pass the TSC + frequency to guest VMs, and is not advised on systems that utilize an emulated TSC. + + + To help decide which clock source to use on an x86-64 system you can run the + pg_test_timing utility to check TSC availability, and + perform timing measurements.
+ + + + + Background Writer diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index a40610bc2522f..f217a72461d62 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -72,7 +72,7 @@ InstrStartNode(Instrumentation *instr) if (!INSTR_TIME_IS_ZERO(instr->starttime)) elog(ERROR, "InstrStartNode called twice in a row"); else - INSTR_TIME_SET_CURRENT(instr->starttime); + INSTR_TIME_SET_CURRENT_FAST(instr->starttime); } /* save buffer usage totals at node entry, if needed */ @@ -99,7 +99,7 @@ InstrStopNode(Instrumentation *instr, double nTuples) if (INSTR_TIME_IS_ZERO(instr->starttime)) elog(ERROR, "InstrStopNode called without start"); - INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_SET_CURRENT_FAST(endtime); INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime); INSTR_TIME_SET_ZERO(instr->starttime); @@ -294,3 +294,78 @@ WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub) dst->wal_fpi_bytes += add->wal_fpi_bytes - sub->wal_fpi_bytes; dst->wal_buffers_full += add->wal_buffers_full - sub->wal_buffers_full; } + +/* GUC hooks for timing_clock_source */ + +#include "portability/instr_time.h" +#include "utils/guc_hooks.h" + +bool +check_timing_clock_source(int *newval, void **extra, GucSource source) +{ + /* + * Do nothing if timing is not initialized. This is only expected on child + * processes in EXEC_BACKEND builds, as GUC hooks can be called during + * InitializeGUCOptions() before InitProcessGlobals() has had a chance to + * run pg_initialize_timing(). Instead, TSC will be initialized via + * restore_backend_variables. 
+ */ +#ifdef EXEC_BACKEND + if (!timing_initialized) + return true; +#else + Assert(timing_initialized); +#endif + +#if PG_INSTR_TSC_CLOCK + pg_initialize_timing_tsc(); + + if (*newval == TIMING_CLOCK_SOURCE_TSC && timing_tsc_frequency_khz <= 0) + { + GUC_check_errdetail("TSC is not supported as timing clock source"); + return false; + } +#endif + + return true; +} + +void +assign_timing_clock_source(int newval, void *extra) +{ +#ifdef EXEC_BACKEND + if (!timing_initialized) + return; +#else + Assert(timing_initialized); +#endif + + /* + * Ignore the return code since the check hook already verified TSC is + * usable if its explicitly requested. + */ + pg_set_timing_clock_source(newval); +} + +const char * +show_timing_clock_source(void) +{ + switch (timing_clock_source) + { + case TIMING_CLOCK_SOURCE_AUTO: +#if PG_INSTR_TSC_CLOCK + if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC) + return "auto (tsc)"; +#endif + return "auto (system)"; + case TIMING_CLOCK_SOURCE_SYSTEM: + return "system"; +#if PG_INSTR_TSC_CLOCK + case TIMING_CLOCK_SOURCE_TSC: + return "tsc"; +#endif + } + + /* unreachable */ + return "?"; +} diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c index 434e06430220e..4883f297a1c4e 100644 --- a/src/backend/postmaster/launch_backend.c +++ b/src/backend/postmaster/launch_backend.c @@ -55,6 +55,7 @@ #ifdef EXEC_BACKEND #include "nodes/queryjumble.h" +#include "portability/instr_time.h" #include "storage/pg_shmem.h" #include "storage/spin.h" #endif @@ -127,6 +128,10 @@ typedef struct int MyPMChildSlot; +#if PG_INSTR_TSC_CLOCK + int32 timing_tsc_frequency_khz; +#endif + /* * These are only used by backend processes, but are here because passing * a socket needs some special handling on Windows. 
'client_sock' is an @@ -743,6 +748,10 @@ save_backend_variables(BackendParameters *param, param->MaxBackends = MaxBackends; param->num_pmchild_slots = num_pmchild_slots; +#if PG_INSTR_TSC_CLOCK + param->timing_tsc_frequency_khz = timing_tsc_frequency_khz; +#endif + #ifdef WIN32 param->PostmasterHandle = PostmasterHandle; if (!write_duplicated_handle(&param->initial_signal_pipe, @@ -997,6 +1006,14 @@ restore_backend_variables(BackendParameters *param) MaxBackends = param->MaxBackends; num_pmchild_slots = param->num_pmchild_slots; +#if PG_INSTR_TSC_CLOCK + timing_tsc_frequency_khz = param->timing_tsc_frequency_khz; + + /* Re-run logic usually done by assign_timing_clock_source */ + pg_initialize_timing(); + pg_set_timing_clock_source(timing_clock_source); +#endif + #ifdef WIN32 PostmasterHandle = param->PostmasterHandle; pgwin32_initial_signal_pipe = param->initial_signal_pipe; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index eb4f3eb72d456..aa6b750d28ce5 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -1937,6 +1937,11 @@ InitProcessGlobals(void) MyStartTimestamp = GetCurrentTimestamp(); MyStartTime = timestamptz_to_time_t(MyStartTimestamp); + /* + * Initialize timing infrastructure + */ + pg_initialize_timing(); + /* * Set a different global seed in every process.
We want something * unpredictable, so if possible, use high-quality random bits for the diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index a315c4ab8aba2..233b8216677fe 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -3044,6 +3044,17 @@ assign_hook => 'assign_timezone_abbreviations', }, +{ name => 'timing_clock_source', type => 'enum', context => 'PGC_USERSET', group => 'RESOURCES_TIME', + short_desc => 'Controls the clock source used for collecting timing measurements.', + long_desc => 'This enables the use of specialized clock sources, specifically the RDTSC clock source on x86-64 systems (if available), to support timing measurements with lower overhead during EXPLAIN and other instrumentation.', + variable => 'timing_clock_source', + boot_val => 'TIMING_CLOCK_SOURCE_AUTO', + options => 'timing_clock_source_options', + check_hook => 'check_timing_clock_source', + assign_hook => 'assign_timing_clock_source', + show_hook => 'show_timing_clock_source', +}, + { name => 'trace_connection_negotiation', type => 'bool', context => 'PGC_POSTMASTER', group => 'DEVELOPER_OPTIONS', short_desc => 'Logs details of pre-authentication connection handshake.', flags => 'GUC_NOT_IN_SAMPLE', diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index d9ca13baff97d..9f9d8d17be917 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -92,6 +92,7 @@ #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" #include "utils/builtins.h" +#include "portability/instr_time.h" #include "utils/bytea.h" #include "utils/float.h" #include "utils/guc_hooks.h" @@ -373,6 +374,15 @@ static const struct config_enum_entry huge_pages_options[] = { {NULL, 0, false} }; +static const struct config_enum_entry timing_clock_source_options[] = { + {"auto", TIMING_CLOCK_SOURCE_AUTO, false}, + {"system", TIMING_CLOCK_SOURCE_SYSTEM, 
false}, +#if PG_INSTR_TSC_CLOCK + {"tsc", TIMING_CLOCK_SOURCE_TSC, false}, +#endif + {NULL, 0, false} +}; + static const struct config_enum_entry huge_pages_status_options[] = { {"off", HUGE_PAGES_OFF, false}, {"on", HUGE_PAGES_ON, false}, @@ -731,6 +741,7 @@ const char *const config_group_names[] = [CONN_AUTH_TCP] = gettext_noop("Connections and Authentication / TCP Settings"), [CONN_AUTH_AUTH] = gettext_noop("Connections and Authentication / Authentication"), [CONN_AUTH_SSL] = gettext_noop("Connections and Authentication / SSL"), + [RESOURCES_TIME] = gettext_noop("Resource Usage / Time"), [RESOURCES_MEM] = gettext_noop("Resource Usage / Memory"), [RESOURCES_DISK] = gettext_noop("Resource Usage / Disk"), [RESOURCES_KERNEL] = gettext_noop("Resource Usage / Kernel Resources"), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 6d0337853e01b..ae027b2f2ae30 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -196,6 +196,10 @@ #max_files_per_process = 1000 # min 64 # (change requires restart) +# - Time - + +#timing_clock_source = auto # auto, system, tsc (if supported) + # - Background Writer - #bgwriter_delay = 200ms # 10-10000ms between rounds diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index aee41dbe3f9b7..9be5b09652ac3 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -30,22 +30,29 @@ static long long int largest_diff_count; static void handle_args(int argc, char *argv[]); -static uint64 test_timing(unsigned int duration); +static void test_system_timing(void); +#if PG_INSTR_TSC_CLOCK +static void test_tsc_timing(void); +#endif +static uint64 test_timing(unsigned int duration, TimingClockSourceType source, bool fast_timing); static void output(uint64 loop_count); int main(int argc, char *argv[]) { - uint64 loop_count; - 
set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_test_timing")); progname = get_progname(argv[0]); handle_args(argc, argv); - loop_count = test_timing(test_duration); + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); - output(loop_count); + test_system_timing(); + +#if PG_INSTR_TSC_CLOCK + test_tsc_timing(); +#endif return 0; } @@ -143,20 +150,92 @@ handle_args(int argc, char *argv[]) exit(1); } - printf(ngettext("Testing timing overhead for %u second.\n", - "Testing timing overhead for %u seconds.\n", + printf(ngettext("Testing timing overhead for %u second.\n\n", + "Testing timing overhead for %u seconds.\n\n", test_duration), test_duration); } +/* + * This tests default (non-fast) timing code. A clock source for that is + * always available. Hence, we can unconditionally output the result. + */ +static void +test_system_timing(void) +{ + uint64 loop_count; + + loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_SYSTEM, false); + output(loop_count); +} + +/* + * If on a supported architecture, test the TSC clock source. This clock + * source is not always available. In that case we print an informational + * message indicating as such. + * + * We first emit "slow" timings (RDTSCP on x86), which are used for higher + * precision measurements when the TSC clock source is enabled. We emit + * "fast" timings second (RDTSC on x86), which is used for faster timing + * measurements with lower precision. 
+ */ +#if PG_INSTR_TSC_CLOCK +static void +test_tsc_timing(void) +{ + uint64 loop_count; + + printf("\n"); + loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_TSC, false); + if (loop_count > 0) + { + output(loop_count); + printf("\n"); + + /* Now, emit fast timing measurements */ + loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_TSC, true); + output(loop_count); + printf("\n"); + + /* timing_tsc_frequency_khz is a signed int32, so print it with %d */ + printf(_("TSC frequency: %d kHz\n"), timing_tsc_frequency_khz); + + pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_AUTO); + if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC) + printf(_("TSC clock source will be used by default, unless timing_clock_source is set to 'system'.\n")); + else + printf(_("TSC clock source will not be used by default, unless timing_clock_source is set to 'tsc'.\n")); + } + else + printf(_("TSC clock source is not usable. Likely unable to determine TSC frequency. Are you running in an unsupported virtualized environment?\n")); +} +#endif + static uint64 -test_timing(unsigned int duration) +test_timing(unsigned int duration, TimingClockSourceType source, bool fast_timing) { uint64 loop_count = 0; instr_time start_time, end_time, prev, cur; + const char *time_source = NULL; + + /* A zero return tells the caller the requested clock source is unusable */ + if (!pg_set_timing_clock_source(source)) + return 0; + + time_source = PG_INSTR_SYSTEM_CLOCK_NAME; + +#if PG_INSTR_TSC_CLOCK + if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC) + time_source = fast_timing ? PG_INSTR_TSC_CLOCK_NAME_FAST : PG_INSTR_TSC_CLOCK_NAME; +#endif + + if (fast_timing) + printf(_("Fast clock source: %s\n"), time_source); + else if (source == TIMING_CLOCK_SOURCE_SYSTEM) + printf(_("System clock source: %s\n"), time_source); + else + printf(_("Clock source: %s\n"), time_source); /* * Pre-zero the statistics data structures.
They're already zero by @@ -181,7 +260,11 @@ test_timing(unsigned int duration) instr_time diff_time; prev = cur; - INSTR_TIME_SET_CURRENT(cur); + + if (fast_timing) + INSTR_TIME_SET_CURRENT_FAST(cur); + else + INSTR_TIME_SET_CURRENT(cur); diff_time = cur; INSTR_TIME_SUBTRACT(diff_time, prev); diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 1dae918cc09d2..c969afab3a595 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -6820,6 +6820,9 @@ main(int argc, char **argv) int exit_code = 0; struct timeval tv; + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); + /* * Record difference between Unix time and instr_time time. We'll use * this for logging and aggregation. diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c index 9a397ec87b736..69d044d405d5b 100644 --- a/src/bin/psql/startup.c +++ b/src/bin/psql/startup.c @@ -24,6 +24,7 @@ #include "help.h" #include "input.h" #include "mainloop.h" +#include "portability/instr_time.h" #include "settings.h" /* @@ -327,6 +328,9 @@ main(int argc, char *argv[]) PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL); + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); + SyncVariables(); if (options.list_dbs) diff --git a/src/common/Makefile b/src/common/Makefile index 2c720caa50972..1a2fbbe887f22 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -59,6 +59,7 @@ OBJS_COMMON = \ file_perm.o \ file_utils.o \ hashfn.o \ + instr_time.o \ ip.o \ jsonapi.o \ keywords.o \ diff --git a/src/common/instr_time.c b/src/common/instr_time.c new file mode 100644 index 0000000000000..8fcf49023bd6d --- /dev/null +++ b/src/common/instr_time.c @@ -0,0 +1,421 @@ +/*------------------------------------------------------------------------- + * + * instr_time.c + * Non-inline parts of the portable high-precision interval timing + * implementation + * + * Portions Copyright (c) 2026, PostgreSQL Global 
Development Group + * + * + * IDENTIFICATION + * src/common/instr_time.c + * + *------------------------------------------------------------------------- + */ +#ifndef FRONTEND +#include "postgres.h" +#else +#include "postgres_fe.h" +#endif + +#include <math.h> + +#if defined(__APPLE__) +#include <sys/sysctl.h> +#endif + +#include "port/pg_cpu.h" +#include "portability/instr_time.h" + +/* + * Stores what the number of ticks needs to be multiplied with to end up + * with nanoseconds using integer math. + * + * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows) + * the ticks to nanoseconds conversion requires floating point math because: + * + * sec = ticks / frequency_hz + * ns = ticks / frequency_hz * 1,000,000,000 + * ns = ticks * (1,000,000,000 / frequency_hz) + * ns = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz + * + * Here, the scaling factor (1,000,000 / frequency_khz) is usually not an + * integer. For example for a 2.5 GHz CPU the scaling factor becomes + * 1,000,000 / 2,500,000 = 0.4. + * + * To be able to use integer math we work around the lack of precision. We + * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the + * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by + * the same amount. + * + * We remember the maximum number of ticks that can be multiplied by the scale + * factor without overflowing so we can check via a * b > max <=> a > max / b. + * + * However, as this is meant for interval measurements, it is unlikely that the + * overflow path is actually taken in typical scenarios, since overflows would + * only occur for intervals longer than 6.5 days. + * + * Note we utilize unsigned integers even though ticks are stored as a signed + * value to encourage compilers to generate better assembly, since we can be + * sure these values are not negative. + * + * In all other cases we are using clock_gettime(), which uses nanoseconds + * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns + * to return the original value.
+ */ +uint64 ticks_per_ns_scaled = 0; +uint64 max_ticks_no_overflow = 0; +bool timing_initialized = false; +int timing_clock_source = TIMING_CLOCK_SOURCE_AUTO; + +static void set_ticks_per_ns(void); +static void set_ticks_per_ns_system(void); + +#if PG_INSTR_TSC_CLOCK +static bool tsc_use_by_default(void); +static void set_ticks_per_ns_for_tsc(void); +#endif + +/* + * Initializes timing infrastructure. Must be called before making any use + * of INSTR* macros. Calling it again after the first call is a no-op. + * + * This only sets up the ticks-to-nanoseconds conversion. TSC frequency + * detection, which may fall back to a calibration loop bounded by + * TSC_CALIBRATION_MAX_NS, is deferred to pg_set_timing_clock_source(). + */ +void +pg_initialize_timing(void) +{ + if (timing_initialized) + return; + + set_ticks_per_ns(); + timing_initialized = true; +} + +/* + * Selects the clock source used for timing measurements and updates the + * ticks-to-nanoseconds conversion accordingly. + * + * Returns false, leaving the current clock source untouched, if TSC is + * requested explicitly but is unusable or not compiled into this build. + */ +bool +pg_set_timing_clock_source(TimingClockSourceType source) +{ + Assert(timing_initialized); + +#if PG_INSTR_TSC_CLOCK + pg_initialize_timing_tsc(); + + switch (source) + { + case TIMING_CLOCK_SOURCE_AUTO: + timing_tsc_enabled = (timing_tsc_frequency_khz > 0) && tsc_use_by_default(); + break; + case TIMING_CLOCK_SOURCE_SYSTEM: + timing_tsc_enabled = false; + break; + case TIMING_CLOCK_SOURCE_TSC: + /* Tell caller TSC is not usable */ + if (timing_tsc_frequency_khz <= 0) + return false; + timing_tsc_enabled = true; + break; + } +#else + /* No TSC support in this build, so an explicit request cannot be honored */ + if (source == TIMING_CLOCK_SOURCE_TSC) + return false; +#endif + + set_ticks_per_ns(); + timing_clock_source = source; + return true; +} + +/* Recompute the conversion factors for whichever clock source is enabled */ +static void +set_ticks_per_ns(void) +{ +#if PG_INSTR_TSC_CLOCK + if (timing_tsc_enabled) + set_ticks_per_ns_for_tsc(); + else + set_ticks_per_ns_system(); +#else + set_ticks_per_ns_system(); +#endif +} + +#ifndef WIN32 + +/* The system clock already ticks in nanoseconds here, so no scaling is used */ +static void +set_ticks_per_ns_system(void) +{ + ticks_per_ns_scaled = 0; + max_ticks_no_overflow = 0; +} + +#else /* WIN32 */ + +/* GetTimerFrequency returns counts per second */ +static inline double +GetTimerFrequency(void) +{ + LARGE_INTEGER f; + + QueryPerformanceFrequency(&f); + return (double)
f.QuadPart; +} + +static void +set_ticks_per_ns_system(void) +{ + ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency(); + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; +} + +#endif /* WIN32 */ + +/* Hardware clock specific logic (x86 TSC / AArch64 CNTVCT) */ + +#if PG_INSTR_TSC_CLOCK + +bool timing_tsc_enabled = false; + +int32 timing_tsc_frequency_khz = -1; + +static void tsc_detect_frequency(void); + +/* + * Initialize the TSC clock source by determining its usability and frequency. + * + * This can be called multiple times, as timing_tsc_frequency_khz will be set to 0 + * if a prior call determined the TSC is not usable. On EXEC_BACKEND (Windows), + * the TSC frequency may also be set by restore_backend_variables. + */ +void +pg_initialize_timing_tsc(void) +{ + if (timing_tsc_frequency_khz < 0) + { + tsc_detect_frequency(); + +#ifndef FRONTEND + elog(DEBUG1, "detected TSC frequency: %d kHz", timing_tsc_frequency_khz); +#endif + } +} + +static void +set_ticks_per_ns_for_tsc(void) +{ + ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / timing_tsc_frequency_khz; + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; +} + +#if defined(__x86_64__) || defined(_M_X64) + +/* + * x86-64 TSC specific logic + */ + +static uint32 tsc_calibrate(void); + +/* + * Detect the TSC frequency and whether RDTSCP is available on x86-64. + * + * This can't be reliably determined at compile time, since the + * availability of an "invariant" TSC (that is not affected by CPU + * frequency changes) is dependent on the CPU architecture. Additionally, + * there are cases where TSC availability is impacted by virtualization, + * where a simple cpuid feature check would not be enough. 
+ */ +static void +tsc_detect_frequency(void) +{ + timing_tsc_frequency_khz = 0; + + /* We require RDTSCP support and an invariant TSC, bail if not available */ + if (!x86_feature_available(PG_RDTSCP) || !x86_feature_available(PG_TSC_INVARIANT)) + return; + + /* Determine speed at which the TSC advances */ + timing_tsc_frequency_khz = x86_tsc_frequency_khz(); + if (timing_tsc_frequency_khz > 0) + return; + + /* + * CPUID did not give us the TSC frequency. We can instead measure the + * frequency by comparing ticks against walltime in a calibration loop. + */ + timing_tsc_frequency_khz = tsc_calibrate(); +} + +/* + * Decides whether to use the TSC clock source if the user did not specify it + * one way or the other, and it is available (checked separately). + * + * Mirrors the Linux kernel's clocksource watchdog disable logic as updated in + * 2021 to reflect the reliability of the TSC on Intel platforms, see + * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion + * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/ + * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/ + * for reference. + * + * When the CPU has an invariant TSC (which tsc_detect_frequency() requires) + * and the TSC_ADJUST bit set (Intel-only), we consider the TSC trustworthy by + * default, matching the Linux kernel. + * + * On other CPU platforms (e.g. AMD) or in a virtual machine, we don't have an + * easy way to determine the TSC's reliability. If on Linux, we + * can check if TSC is the active clocksource, based on it having run the + * watchdog logic to monitor TSC correctness. For other platforms the user must + * explicitly enable it via GUC instead.
+ */ +static bool +tsc_use_by_default(void) +{ + if (x86_feature_available(PG_TSC_ADJUST)) + return true; + +#if defined(__linux__) + { + FILE *fp; + char buf[128]; + + fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); + if (fp) + { + bool is_tsc = (fgets(buf, sizeof(buf), fp) != NULL && + strcmp(buf, "tsc\n") == 0); + + fclose(fp); + if (is_tsc) + return true; + } + } +#endif + + return false; +} + +/* + * Calibrate the TSC frequency by comparing TSC ticks against walltime. + * + * Takes initial TSC and system clock snapshots, then loops, recomputing the + * frequency each TSC_CALIBRATION_SKIPS iterations from cumulative TSC + * ticks divided by elapsed time. + * + * Once the frequency estimate stabilizes (consecutive iterations agree), we + * consider it converged and the frequency in KHz is returned. If either too + * many iterations or a time limit passes without convergence, 0 is returned. + */ +#define TSC_CALIBRATION_MAX_NS (50 * NS_PER_MS) +#define TSC_CALIBRATION_ITERATIONS 1000000 +#define TSC_CALIBRATION_SKIPS 100 +#define TSC_CALIBRATION_STABLE_CYCLES 10 + +static uint32 +tsc_calibrate(void) +{ + instr_time initial_wall; + int64 initial_tsc; + double freq_khz = 0; + double prev_freq_khz = 0; + int stable_count = 0; + int64 prev_tsc; + + /* Ensure INSTR_* time below work on system time */ + set_ticks_per_ns_system(); + + INSTR_TIME_SET_CURRENT(initial_wall); + + initial_tsc = pg_rdtscp(); + prev_tsc = initial_tsc; + + for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++) + { + instr_time now_wall; + int64 now_tsc; + int64 elapsed_ns; + int64 elapsed_ticks; + + INSTR_TIME_SET_CURRENT(now_wall); + + now_tsc = pg_rdtscp(); + + INSTR_TIME_SUBTRACT(now_wall, initial_wall); + elapsed_ns = INSTR_TIME_GET_NANOSEC(now_wall); + + /* Safety: bail out if we've taken too long */ + if (elapsed_ns >= TSC_CALIBRATION_MAX_NS) + break; + + elapsed_ticks = now_tsc - initial_tsc; + + /* + * Skip if this is not the Nth cycle where we 
measure, if TSC hasn't + * advanced, or we walked backwards for some reason. + */ + if (i % TSC_CALIBRATION_SKIPS != 0 || now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0) + continue; + + freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000; + + /* + * Once freq_khz / prev_freq_khz is small, check if it stays that way. + * If it does for long enough, we've got a winner frequency. + */ + if (prev_freq_khz != 0 && fabs(1 - freq_khz / prev_freq_khz) < 0.0001) + { + stable_count++; + if (stable_count >= TSC_CALIBRATION_STABLE_CYCLES) + return (uint32) freq_khz; + } + else + stable_count = 0; + + prev_tsc = now_tsc; + prev_freq_khz = freq_khz; + } + + /* did not converge */ + return 0; +} + +#elif defined(__aarch64__) + +/* + * Check whether this is a heterogeneous Apple Silicon P+E core system + * where CNTVCT_EL0 may tick at different rates on different core types. + */ +static bool +aarch64_has_heterogeneous_cores(void) +{ +#if defined(__APPLE__) + int nperflevels = 0; + size_t len = sizeof(nperflevels); + + if (sysctlbyname("hw.nperflevels", &nperflevels, &len, NULL, 0) == 0) + return nperflevels > 1; +#endif + + return false; +} + +/* + * Detect the generic timer frequency on AArch64. + */ +static void +tsc_detect_frequency(void) +{ + if (aarch64_has_heterogeneous_cores()) + { + timing_tsc_frequency_khz = 0; + return; + } + + timing_tsc_frequency_khz = aarch64_cntvct_frequency_khz(); +} + +/* + * The ARM generic timer is architecturally guaranteed to be monotonic and + * synchronized across cores of the same type, so we always use it by default + * when available and cores are homogenous. 
+ */ +static bool +tsc_use_by_default(void) +{ + return true; +} + +#endif /* defined(__aarch64__) */ + +#endif /* PG_INSTR_TSC_CLOCK */ diff --git a/src/common/meson.build b/src/common/meson.build index 4f9b8b8263d55..9bd55cda95b10 100644 --- a/src/common/meson.build +++ b/src/common/meson.build @@ -13,6 +13,7 @@ common_sources = files( 'file_perm.c', 'file_utils.c', 'hashfn.c', + 'instr_time.c', 'ip.c', 'jsonapi.c', 'keywords.c', diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index c5d96bb4f479f..aee501a4ecdc4 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -32,8 +32,16 @@ typedef enum X86FeatureId PG_AVX512_VL, PG_AVX512_VPCLMULQDQ, PG_AVX512_VPOPCNTDQ, + + /* identification */ + PG_HYPERVISOR, + + /* Time-Stamp Counter (TSC) flags */ + PG_RDTSCP, + PG_TSC_INVARIANT, + PG_TSC_ADJUST, } X86FeatureId; -#define X86FeaturesSize (PG_AVX512_VPOPCNTDQ + 1) +#define X86FeaturesSize (PG_TSC_ADJUST + 1) extern PGDLLIMPORT bool X86Features[]; @@ -48,6 +56,14 @@ x86_feature_available(X86FeatureId feature) return X86Features[feature]; } +extern uint32 x86_tsc_frequency_khz(void); + #endif /* defined(USE_SSE2) || defined(__i386__) */ +#if defined(__aarch64__) + +extern uint32 aarch64_cntvct_frequency_khz(void); + +#endif /* defined(__aarch64__) */ + #endif /* PG_CPU_H */ diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 0a1fff7c487ae..3c9815a3a25eb 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -4,9 +4,11 @@ * portable high-precision interval timing * * This file provides an abstraction layer to hide portability issues in - * interval timing. On Unix we use clock_gettime(), and on Windows we use - * QueryPerformanceCounter(). These macros also give some breathing room to - * use other high-precision-timing APIs. + * interval timing. 
On x86 we use the RDTSC/RDTSCP instruction, and on + * AArch64 the CNTVCT_EL0 generic timer, directly in certain cases, or + * alternatively clock_gettime() on Unix-like systems and + * QueryPerformanceCounter() on Windows. These macros also give some breathing + * room to use other high-precision-timing APIs. * * The basic data type is instr_time, which all callers should treat as an * opaque typedef. instr_time can store either an absolute time (of @@ -17,7 +19,11 @@ * * INSTR_TIME_SET_ZERO(t) set t to zero (memset is acceptable too) * - * INSTR_TIME_SET_CURRENT(t) set t to current time + * INSTR_TIME_SET_CURRENT_FAST(t) set t to current time without waiting + * for instructions in out-of-order window + * + * INSTR_TIME_SET_CURRENT(t) set t to current time while waiting for + * instructions in OOO to retire * * * INSTR_TIME_ADD(x, y) x += y @@ -80,11 +86,103 @@ typedef struct instr_time #define NS_PER_MS INT64CONST(1000000) #define NS_PER_US INT64CONST(1000) +/* Shift amount for fixed-point ticks-to-nanoseconds conversion. */ +#define TICKS_TO_NS_SHIFT 14 -#ifndef WIN32 +/* + * Variables used to translate ticks to nanoseconds, initialized by + * pg_initialize_timing and adjusted by pg_set_timing_clock_source calls or + * changes of the "timing_clock_source" GUC. + * + * Note that changing these values after setting an instr_time and before + * reading/converting it will lead to incorrect results. This is technically + * possible because the GUC can be changed at runtime, but unlikely, and we + * allow changing this at runtime to simplify testing of different sources.
+ */ +extern PGDLLIMPORT uint64 ticks_per_ns_scaled; +extern PGDLLIMPORT uint64 max_ticks_no_overflow; +extern PGDLLIMPORT bool timing_initialized; + +typedef enum +{ + TIMING_CLOCK_SOURCE_AUTO, + TIMING_CLOCK_SOURCE_SYSTEM, + TIMING_CLOCK_SOURCE_TSC +} TimingClockSourceType; + +/* Value of the "timing_clock_source" GUC, a TimingClockSourceType */ +extern PGDLLIMPORT int timing_clock_source; + +/* + * Initialize timing infrastructure + * + * This must be called at least once before using INSTR_TIME_SET_CURRENT* + * macros. + * + * If you want to use the TSC clock source in a client program you must also + * call pg_set_timing_clock_source afterwards. + */ +extern void pg_initialize_timing(void); + +/* + * Sets the time source to be used. Mainly intended for frontend programs, + * the backend should set it via the timing_clock_source GUC instead. + * + * Returns false if the clock source could not be set, for example when TSC + * is not available despite being explicitly set. + */ +extern bool pg_set_timing_clock_source(TimingClockSourceType source); + +#if defined(__x86_64__) || defined(_M_X64) +#define PG_INSTR_TSC_CLOCK 1 +#define PG_INSTR_TSC_CLOCK_NAME_FAST "RDTSC" +#define PG_INSTR_TSC_CLOCK_NAME "RDTSCP" +#define PG_INSTR_TICKS_TO_NS 1 +#elif defined(__aarch64__) && !defined(WIN32) +#define PG_INSTR_TSC_CLOCK 1 +#define PG_INSTR_TSC_CLOCK_NAME_FAST "CNTVCT_EL0" +#define PG_INSTR_TSC_CLOCK_NAME "CNTVCT_EL0 (ISB)" +#define PG_INSTR_TICKS_TO_NS 1 +#elif defined(WIN32) +#define PG_INSTR_TSC_CLOCK 0 +#define PG_INSTR_TICKS_TO_NS 1 +#else +#define PG_INSTR_TSC_CLOCK 0 +#define PG_INSTR_TICKS_TO_NS 0 +#endif + +#if PG_INSTR_TSC_CLOCK +/* Whether to actually use TSC based on availability and GUC settings. */ +extern PGDLLIMPORT bool timing_tsc_enabled; + +/* + * TSC frequency in kHz, set during initialization. + * + * -1 = not yet initialized, 0 = TSC not usable, >0 = frequency in kHz.
+ */ +extern PGDLLIMPORT int32 timing_tsc_frequency_khz; + +extern void pg_initialize_timing_tsc(void); + +#endif /* PG_INSTR_TSC_CLOCK */ + +/* + * Returns the current timing clock source effectively in use, resolving + * TIMING_CLOCK_SOURCE_AUTO to either TIMING_CLOCK_SOURCE_SYSTEM or + * TIMING_CLOCK_SOURCE_TSC. + */ +static inline TimingClockSourceType +pg_current_timing_clock_source(void) +{ +#if PG_INSTR_TSC_CLOCK + return timing_tsc_enabled ? TIMING_CLOCK_SOURCE_TSC : TIMING_CLOCK_SOURCE_SYSTEM; +#else + return TIMING_CLOCK_SOURCE_SYSTEM; +#endif +} +#ifndef WIN32 -/* Use clock_gettime() */ +/* On POSIX, use clock_gettime() for system clock source */ #include @@ -99,76 +197,252 @@ typedef struct instr_time * than CLOCK_MONOTONIC. In particular, as of macOS 10.12, Apple provides * CLOCK_MONOTONIC_RAW which is both faster to read and higher resolution than * their version of CLOCK_MONOTONIC. + * + * Note this does not get used in case the TSC clock source logic is used, + * which directly calls architecture specific timing instructions (e.g. RDTSC). 
*/ #if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW) -#define PG_INSTR_CLOCK CLOCK_MONOTONIC_RAW +#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC_RAW +#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC_RAW)" #elif defined(CLOCK_MONOTONIC) -#define PG_INSTR_CLOCK CLOCK_MONOTONIC +#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC +#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC)" #else -#define PG_INSTR_CLOCK CLOCK_REALTIME +#define PG_INSTR_SYSTEM_CLOCK CLOCK_REALTIME +#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_REALTIME)" #endif -/* helper for INSTR_TIME_SET_CURRENT */ static inline instr_time -pg_clock_gettime_ns(void) +pg_get_ticks_system(void) { instr_time now; struct timespec tmp; - clock_gettime(PG_INSTR_CLOCK, &tmp); + Assert(timing_initialized); + + clock_gettime(PG_INSTR_SYSTEM_CLOCK, &tmp); now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec; return now; } -#define INSTR_TIME_SET_CURRENT(t) \ - ((t) = pg_clock_gettime_ns()) - -#define INSTR_TIME_GET_NANOSEC(t) \ - ((int64) (t).ticks) - -#define INSTR_TIME_ADD_NANOSEC(t, n) \ - ((t).ticks += (n)) - - #else /* WIN32 */ +/* On Windows, use QueryPerformanceCounter() for system clock source */ -/* Use QueryPerformanceCounter() */ - -/* helper for INSTR_TIME_SET_CURRENT */ +#define PG_INSTR_SYSTEM_CLOCK_NAME "QueryPerformanceCounter" static inline instr_time -pg_query_performance_counter(void) +pg_get_ticks_system(void) { instr_time now; LARGE_INTEGER tmp; + Assert(timing_initialized); + QueryPerformanceCounter(&tmp); now.ticks = tmp.QuadPart; return now; } -static inline double -GetTimerFrequency(void) +#endif /* WIN32 */ + +static inline int64 +pg_ticks_to_ns(int64 ticks) { - LARGE_INTEGER f; +#if PG_INSTR_TICKS_TO_NS + int64 ns = 0; + + Assert(timing_initialized); + + /* + * Avoid doing work if we don't use scaled ticks, e.g. system clock on + * Unix + */ + if (ticks_per_ns_scaled == 0) + return ticks; + + /* + * Would multiplication overflow? 
If so perform computation in two parts. + */ + if (unlikely(ticks > (int64) max_ticks_no_overflow)) + { + /* + * To avoid overflow, first scale total ticks down by the fixed + * factor, and *afterwards* multiply them by the frequency-based scale + * factor. + * + * The remaining ticks can follow the regular formula, since they + * won't overflow. + */ + int64 count = ticks >> TICKS_TO_NS_SHIFT; + + ns = count * ticks_per_ns_scaled; + ticks -= (count << TICKS_TO_NS_SHIFT); + } + + ns += (ticks * ticks_per_ns_scaled) >> TICKS_TO_NS_SHIFT; + + return ns; +#else + Assert(timing_initialized); - QueryPerformanceFrequency(&f); - return (double) f.QuadPart; + return ticks; +#endif /* PG_INSTR_TICKS_TO_NS */ } -#define INSTR_TIME_SET_CURRENT(t) \ - ((t) = pg_query_performance_counter()) +static inline int64 +pg_ns_to_ticks(int64 ns) +{ +#if PG_INSTR_TICKS_TO_NS + int64 ticks = 0; -#define INSTR_TIME_GET_NANOSEC(t) \ - ((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency()))) + Assert(timing_initialized); -#define INSTR_TIME_ADD_NANOSEC(t, n) \ - ((t).ticks += ((n) / ((double) NS_PER_S / GetTimerFrequency()))) + /* + * If ticks_per_ns_scaled is zero, ticks are already in nanoseconds (e.g. + * system clock on Unix). + */ + if (ticks_per_ns_scaled == 0) + return ns; -#endif /* WIN32 */ + /* + * The reverse of pg_ticks_to_ns to avoid a similar overflow problem. + */ + if (unlikely(ns > (INT64_MAX >> TICKS_TO_NS_SHIFT))) + { + int64 count = ns / ticks_per_ns_scaled; + + ticks = count << TICKS_TO_NS_SHIFT; + ns -= count * ticks_per_ns_scaled; + } + + ticks += (ns << TICKS_TO_NS_SHIFT) / ticks_per_ns_scaled; + + return ticks; +#else + Assert(timing_initialized); + + return ns; +#endif /* PG_INSTR_TICKS_TO_NS */ +} + +#if PG_INSTR_TSC_CLOCK + +#if defined(__x86_64__) || defined(_M_X64) + +#ifdef _MSC_VER +#include +#endif /* defined(_MSC_VER) */ + +/* Helpers to abstract compiler differences for reading the x86 TSC. 
*/ +static inline int64 +pg_rdtsc(void) +{ +#ifdef _MSC_VER + return __rdtsc(); +#else + return __builtin_ia32_rdtsc(); +#endif /* defined(_MSC_VER) */ +} + +static inline int64 +pg_rdtscp(void) +{ + uint32 unused; + +#ifdef _MSC_VER + return __rdtscp(&unused); +#else + return __builtin_ia32_rdtscp(&unused); +#endif /* defined(_MSC_VER) */ +} + +static inline instr_time +pg_get_ticks_fast(void) +{ + if (likely(timing_tsc_enabled)) + { + instr_time now; + + now.ticks = pg_rdtsc(); + return now; + } + + return pg_get_ticks_system(); +} + +static inline instr_time +pg_get_ticks(void) +{ + if (likely(timing_tsc_enabled)) + { + instr_time now; + + now.ticks = pg_rdtscp(); + return now; + } + + return pg_get_ticks_system(); +} + +#elif defined(__aarch64__) && !defined(WIN32) + +/* + * Read the ARM generic timer counter (CNTVCT_EL0). + * + * The "fast" variant reads the counter without a barrier, analogous to RDTSC + * on x86. The regular variant issues an ISB (Instruction Synchronization + * Barrier) first, which acts as a serializing instruction analogous to RDTSCP, + * ensuring all preceding instructions have completed before reading the + * counter. 
+ */ +static inline instr_time +pg_get_ticks_fast(void) +{ + if (likely(timing_tsc_enabled)) + { + instr_time now; + + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + return pg_get_ticks_system(); +} + +static inline instr_time +pg_get_ticks(void) +{ + if (likely(timing_tsc_enabled)) + { + instr_time now; + + __builtin_arm_isb(0xf); + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + + return pg_get_ticks_system(); +} + +#endif /* defined(__aarch64__) */ + +#else /* !PG_INSTR_TSC_CLOCK */ + +static inline instr_time +pg_get_ticks_fast(void) +{ + return pg_get_ticks_system(); +} + +static inline instr_time +pg_get_ticks(void) +{ + return pg_get_ticks_system(); +} + +#endif /* PG_INSTR_TSC_CLOCK */ /* * Common macros @@ -178,10 +452,19 @@ GetTimerFrequency(void) #define INSTR_TIME_SET_ZERO(t) ((t).ticks = 0) +#define INSTR_TIME_SET_CURRENT_FAST(t) \ + ((t) = pg_get_ticks_fast()) + +#define INSTR_TIME_SET_CURRENT(t) \ + ((t) = pg_get_ticks()) + #define INSTR_TIME_ADD(x,y) \ ((x).ticks += (y).ticks) +#define INSTR_TIME_ADD_NANOSEC(t, n) \ + ((t).ticks += pg_ns_to_ticks(n)) + #define INSTR_TIME_SUBTRACT(x,y) \ ((x).ticks -= (y).ticks) @@ -191,6 +474,9 @@ GetTimerFrequency(void) #define INSTR_TIME_GT(x,y) \ ((x).ticks > (y).ticks) +#define INSTR_TIME_GET_NANOSEC(t) \ + (pg_ticks_to_ns((t).ticks)) + #define INSTR_TIME_GET_DOUBLE(t) \ ((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S) diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index b01697c1f606d..307f4fbaefe08 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -163,6 +163,9 @@ extern const char *show_timezone(void); extern bool check_timezone_abbreviations(char **newval, void **extra, GucSource source); extern void assign_timezone_abbreviations(const char *newval, void *extra); +extern void assign_timing_clock_source(int newval, void *extra); +extern bool check_timing_clock_source(int *newval, void **extra, GucSource source); 
+extern const char *show_timing_clock_source(void); extern bool check_transaction_buffers(int *newval, void **extra, GucSource source); extern bool check_transaction_deferrable(bool *newval, void **extra, GucSource source); extern bool check_transaction_isolation(int *newval, void **extra, GucSource source); diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 71a8016196138..63440b8e36c83 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -60,6 +60,7 @@ enum config_group CONN_AUTH_TCP, CONN_AUTH_AUTH, CONN_AUTH_SSL, + RESOURCES_TIME, RESOURCES_MEM, RESOURCES_DISK, RESOURCES_KERNEL, diff --git a/src/port/meson.build b/src/port/meson.build index 922b3f646768d..d695f92b769e1 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_cpu_arm.c', 'pg_cpu_x86.c', 'pg_getopt_ctx.c', 'pg_localeconv_r.c', diff --git a/src/port/pg_cpu_arm.c b/src/port/pg_cpu_arm.c new file mode 100644 index 0000000000000..6fd9dd892ec98 --- /dev/null +++ b/src/port/pg_cpu_arm.c @@ -0,0 +1,45 @@ +/*------------------------------------------------------------------------- + * + * pg_cpu_arm.c + * Runtime CPU feature detection for AArch64 + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_cpu_arm.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" + +#if defined(__aarch64__) && !defined(WIN32) + +#include "port/pg_cpu.h" + +/* + * Return the frequency of the ARM generic timer (CNTVCT_EL0) in kHz. + * + * The CNTFRQ_EL0 system register is architecturally guaranteed to be readable + * from EL0 (userspace) and holds the timer frequency in Hz. The firmware sets + * this at boot and it does not change. 
+ * + * Returns 0 if the frequency is not available (should not happen on conforming + * implementations). + */ +uint32 +aarch64_cntvct_frequency_khz(void) +{ + uint64 freq; + + freq = __builtin_arm_rsr64("cntfrq_el0"); + + if (freq == 0) + return 0; + + return (uint32) (freq / 1000); +} + +#endif /* defined(__aarch64__) */ diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index 40ff78633ca3f..f076db0ff1c38 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -101,19 +101,24 @@ void set_x86_features(void) { unsigned int reg[4] = {0}; + bool have_osxsave; pg_cpuid(0x01, reg); X86Features[PG_SSE4_2] = reg[ECX] >> 20 & 1; X86Features[PG_POPCNT] = reg[ECX] >> 23 & 1; + X86Features[PG_HYPERVISOR] = reg[ECX] >> 31 & 1; + have_osxsave = reg[ECX] >> 27 & 1; + + pg_cpuid_subleaf(0x07, 0, reg); + + X86Features[PG_TSC_ADJUST] = reg[EBX] >> 1 & 1; /* leaf 7 features that depend on OSXSAVE */ - if (reg[ECX] & (1 << 27)) + if (have_osxsave) { uint32 xcr0_val = 0; - pg_cpuid_subleaf(0x07, 0, reg); - #ifdef HAVE_XSAVE_INTRINSICS /* get value of Extended Control Register */ xcr0_val = _xgetbv(0); @@ -135,7 +140,121 @@ set_x86_features(void) } } + /* Check for other TSC related flags */ + pg_cpuid(0x80000001, reg); + X86Features[PG_RDTSCP] = reg[EDX] >> 27 & 1; + + pg_cpuid(0x80000007, reg); + X86Features[PG_TSC_INVARIANT] = reg[EDX] >> 8 & 1; + X86Features[INIT_PG_X86] = true; } +/* TSC (Time-stamp Counter) handling code */ + +static uint32 x86_hypervisor_tsc_frequency_khz(void); + +/* + * Determine the TSC frequency of the CPU through CPUID, where supported. + * + * Needed to interpret the tick value returned by RDTSC/RDTSCP. Return value of + * 0 indicates the frequency information was not accessible via CPUID. 
+ */ +uint32 +x86_tsc_frequency_khz(void) +{ + unsigned int reg[4] = {0}; + + if (x86_feature_available(PG_HYPERVISOR)) + return x86_hypervisor_tsc_frequency_khz(); + + /* + * On modern Intel CPUs, the TSC is implemented by invariant timekeeping + * hardware, also called "Always Running Timer", or ART. The ART stays + * consistent even if the CPU changes frequency due to changing power + * levels. + * + * As documented in "Determining the Processor Base Frequency" in the + * "Intel® 64 and IA-32 Architectures Software Developer's Manual", + * February 2026 Edition, we can get the TSC frequency as follows: + * + * Nominal TSC frequency = ( CPUID.15H:ECX[31:0] * CPUID.15H:EBX[31:0] ) / + * CPUID.15H:EAX[31:0] + * + * With CPUID.15H:ECX representing the nominal core crystal clock + * frequency, and EAX/EBX representing values used to translate the TSC + * value to that frequency, see Chapter 20.17 "Time-Stamp Counter" of + * that manual. + * + * Older Intel CPUs, and other vendors do not set CPUID.15H:ECX, and as + * such we fall back to alternate approaches. + */ + pg_cpuid(0x15, reg); + if (reg[ECX] > 0) + { + /* + * EBX not being set indicates invariant TSC is not available. Require + * EAX being non-zero too, to avoid a theoretical divide by zero. + */ + if (reg[EAX] == 0 || reg[EBX] == 0) + return 0; + + return reg[ECX] / 1000 * reg[EBX] / reg[EAX]; + } + + /* + * When CPUID.15H is not available/incomplete, we can instead try to get + * the processor base frequency in MHz from CPUID.16H:EAX, the "Processor + * Frequency Information Leaf". + */ + pg_cpuid(0x16, reg); + if (reg[EAX] > 0) + return reg[EAX] * 1000; + + return 0; +} + +/* + * Support for reading TSC frequency for hypervisors passing it to a guest VM. + * + * Two Hypervisors (VMware and KVM) are known to make TSC frequency in kHz + * available at the vendor-specific 0x40000010 leaf in the EAX register. + * + * For some other Hypervisors that have an invariant TSC, e.g. 
HyperV, we would + * need to access a model-specific register (MSR) to get the frequency. MSRs are + * separate from CPUID and typically not available for unprivileged processes, + * so we can't get the frequency this way. + */ +#define CPUID_HYPERVISOR_VMWARE(r) (r[EBX] == 0x61774d56 && r[ECX] == 0x4d566572 && r[EDX] == 0x65726177) /* VMwareVMware */ +#define CPUID_HYPERVISOR_KVM(r) (r[EBX] == 0x4b4d564b && r[ECX] == 0x564b4d56 && r[EDX] == 0x0000004d) /* KVMKVMKVM */ +static uint32 +x86_hypervisor_tsc_frequency_khz(void) +{ + unsigned int reg[4] = {0}; + +#if defined(HAVE__CPUIDEX) + + /* + * The hypervisor is determined using the 0x40000000 Hypervisor + * information leaf, which requires use of __cpuidex to set ECX to 0 to + * access it. + * + * The similar __get_cpuid_count function does not work as expected since + * it contains a check for __get_cpuid_max, which has been observed to be + * lower than the special Hypervisor leaf, despite it being available. + */ + __cpuidex((int *) reg, 0x40000000, 0); + + if (reg[EAX] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(reg) || CPUID_HYPERVISOR_KVM(reg))) + { + __cpuidex((int *) reg, 0x40000010, 0); + if (reg[EAX] > 0) + return reg[EAX]; + } +#endif /* HAVE__CPUIDEX */ + + return 0; +} + + #endif /* defined(USE_SSE2) || defined(__i386__) */ diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out index cf55cdf3688d9..c3261bff209fb 100644 --- a/src/test/regress/expected/misc_functions.out +++ b/src/test/regress/expected/misc_functions.out @@ -850,3 +850,14 @@ SELECT oldest_multixact IS NULL AS null_result FROM pg_get_multixact_stats(); RESET ROLE; DROP ROLE regress_multixact_funcs; +-- test instr_time nanosecond<->ticks conversion +CREATE FUNCTION test_instr_time() + RETURNS bool + AS :'regresslib' + LANGUAGE C; +SELECT test_instr_time(); + test_instr_time +----------------- + t +(1 row) + diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index 
9a918156437b2..0c0620569829b 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -2181,6 +2181,8 @@ regression_main(int argc, char *argv[], progname = get_progname(argv[0]); set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_regress")); + pg_initialize_timing(); + get_restricted_token(); atexit(stop_postmaster); diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index 68a01a1dde014..c2eaa96f08605 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -38,6 +38,7 @@ #include "optimizer/plancat.h" #include "parser/parse_coerce.h" #include "port/atomics.h" +#include "portability/instr_time.h" #include "postmaster/postmaster.h" /* for MAX_BACKENDS */ #include "storage/spin.h" #include "tcop/tcopprot.h" @@ -1384,3 +1385,38 @@ test_translation(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* Verify that pg_ticks_to_ns behaves correctly, including overflow */ +PG_FUNCTION_INFO_V1(test_instr_time); +Datum +test_instr_time(PG_FUNCTION_ARGS) +{ + instr_time t; + int64 test_ns[] = {0, 1000, INT64CONST(1000000000000000)}; + int64 max_err; + + /* + * The ns-to-ticks-to-ns roundtrip may lose precision due to integer + * truncation in the fixed-point conversion. The maximum error depends on + * ticks_per_ns_scaled relative to the shift factor. 
+ */ + max_err = (ticks_per_ns_scaled >> TICKS_TO_NS_SHIFT) + 1; + + for (int i = 0; i < lengthof(test_ns); i++) + { + int64 result; + + INSTR_TIME_SET_ZERO(t); + INSTR_TIME_ADD_NANOSEC(t, test_ns[i]); + result = INSTR_TIME_GET_NANOSEC(t); + + if (result < test_ns[i] - max_err || result > test_ns[i]) + elog(ERROR, + "INSTR_TIME_GET_NANOSEC(t) yielded " INT64_FORMAT + ", expected " INT64_FORMAT " (max_err " INT64_FORMAT + ") in file \"%s\" line %u", + result, test_ns[i], max_err, __FILE__, __LINE__); + } + + PG_RETURN_BOOL(true); +} diff --git a/src/test/regress/sql/misc_functions.sql b/src/test/regress/sql/misc_functions.sql index c8226652f2c94..946ee5726cdd7 100644 --- a/src/test/regress/sql/misc_functions.sql +++ b/src/test/regress/sql/misc_functions.sql @@ -349,3 +349,10 @@ SET ROLE regress_multixact_funcs; SELECT oldest_multixact IS NULL AS null_result FROM pg_get_multixact_stats(); RESET ROLE; DROP ROLE regress_multixact_funcs; + +-- test instr_time nanosecond<->ticks conversion +CREATE FUNCTION test_instr_time() + RETURNS bool + AS :'regresslib' + LANGUAGE C; +SELECT test_instr_time(); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index c72f6c595730a..d609599f419a7 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3170,6 +3170,7 @@ TimeoutId TimeoutType Timestamp TimestampTz +TimingClockSourceType TmFromChar TmToChar ToastAttrInfo