From 55e607671c69cc37e732aa5b33049cd9a9e5f819 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 10 Mar 2026 23:38:41 -0700 Subject: [PATCH 1/7] Refactor handling of x86 CPUID instructions Introduce two helpers for CPUID, pg_cpuid and pg_cpuid_subleaf that wrap the platform specific __get_cpuid/__cpuid and __get_cpuid_count/__cpuidex functions. Additionally, introduce the CPUIDResult struct to make code working with CPUID easier to read by referencing the register name (e.g. ECX) instead of a numeric index. Author: Lukas Fittl Suggested-By: John Naylor Reviewed-by: Discussion: --- src/port/pg_cpu_x86.c | 90 ++++++++++++++++++++++++-------- src/tools/pgindent/typedefs.list | 1 + 2 files changed, 69 insertions(+), 22 deletions(-) diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index 7575838245cd7..751f52f1e4214 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -53,6 +53,64 @@ mask_available(uint32 value, uint32 mask) return (value & mask) == mask; } +/* General purpose registers used by CPUID */ +typedef struct CPUIDResult +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; +} CPUIDResult; + +StaticAssertDecl(sizeof(CPUIDResult) == 4 * sizeof(unsigned int), + "CPUIDResult must have no padding"); + +/* + * Request CPUID information for the specified leaf. + */ +static inline void +pg_cpuid(int leaf, CPUIDResult *r) +{ +#if defined(HAVE__GET_CPUID) + __get_cpuid(leaf, &r->eax, &r->ebx, &r->ecx, &r->edx); +#elif defined(HAVE__CPUID) + int exx[4] = {0}; + + __cpuid(exx, leaf); + r->eax = exx[0]; + r->ebx = exx[1]; + r->ecx = exx[2]; + r->edx = exx[3]; +#else +#error cpuid instruction not available +#endif +} + +/* + * Request CPUID information for the specified leaf and subleaf. + * + * Returns false if the CPUID leaf/subleaf is not supported. 
+ */ +static inline bool +pg_cpuid_subleaf(int leaf, int subleaf, CPUIDResult *r) +{ +#if defined(HAVE__GET_CPUID_COUNT) + return __get_cpuid_count(leaf, subleaf, &r->eax, &r->ebx, &r->ecx, &r->edx) == 1; +#elif defined(HAVE__CPUIDEX) + int exx[4] = {0}; + + __cpuidex(exx, leaf, subleaf); + r->eax = exx[0]; + r->ebx = exx[1]; + r->ecx = exx[2]; + r->edx = exx[3]; + return true; +#else + memset(r, 0, sizeof(CPUIDResult)); + return false; +#endif +} + /* * Parse the CPU ID info for runtime checks. */ @@ -62,33 +120,21 @@ pg_attribute_target("xsave") void set_x86_features(void) { - unsigned int exx[4] = {0, 0, 0, 0}; + CPUIDResult r = {0}; -#if defined(HAVE__GET_CPUID) - __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 1); -#else -#error cpuid instruction not available -#endif + pg_cpuid(0x01, &r); - X86Features[PG_SSE4_2] = exx[2] >> 20 & 1; - X86Features[PG_POPCNT] = exx[2] >> 23 & 1; + X86Features[PG_SSE4_2] = r.ecx >> 20 & 1; + X86Features[PG_POPCNT] = r.ecx >> 23 & 1; /* All these features depend on OSXSAVE */ - if (exx[2] & (1 << 27)) + if (r.ecx & (1 << 27)) { uint32 xcr0_val = 0; /* second cpuid call on leaf 7 to check extended AVX-512 support */ - memset(exx, 0, 4 * sizeof(exx[0])); - -#if defined(HAVE__GET_CPUID_COUNT) - __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUIDEX) - __cpuidex(exx, 7, 0); -#endif + pg_cpuid_subleaf(0x07, 0, &r); #ifdef HAVE_XSAVE_INTRINSICS /* get value of Extended Control Register */ @@ -99,11 +145,11 @@ set_x86_features(void) if (mask_available(xcr0_val, XMM | YMM | OPMASK | ZMM0_15 | ZMM16_31)) { - X86Features[PG_AVX512_BW] = exx[1] >> 30 & 1; - X86Features[PG_AVX512_VL] = exx[1] >> 31 & 1; + X86Features[PG_AVX512_BW] = r.ebx >> 30 & 1; + X86Features[PG_AVX512_VL] = r.ebx >> 31 & 1; - X86Features[PG_AVX512_VPCLMULQDQ] = exx[2] >> 10 & 1; - X86Features[PG_AVX512_VPOPCNTDQ] = exx[2] >> 14 & 1; + X86Features[PG_AVX512_VPCLMULQDQ] = r.ecx >> 10 & 1; + 
X86Features[PG_AVX512_VPOPCNTDQ] = r.ecx >> 14 & 1; } } diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 0042c33fa662f..7546babac1e48 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -555,6 +555,7 @@ CostSelector Counters CoverExt CoverPos +CPUIDResult CreateAmStmt CreateCastStmt CreateConversionStmt From 46219b20c12ad15a90100b4cf80b50c8cafb9548 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sat, 31 Jan 2026 08:49:46 -0800 Subject: [PATCH 2/7] Check for HAVE__CPUIDEX and HAVE__GET_CPUID_COUNT separately Previously we would only check for the availability of __cpuidex if the related __get_cpuid_count was not available on a platform. But there are cases where we want to be able to call __cpuidex as the only viable option, specifically, when accessing a high leaf like VM Hypervisor information (0x40000000), which __get_cpuid_count does not allow. This will be used in a future commit to access Hypervisor information about the TSC frequency of x86 CPUs, where available. Note that __cpuidex is defined in cpuid.h for GCC/clang, but in intrin.h for MSVC. Because we now set HAVE__CPUIDEX for GCC/clang when available, adjust existing code to check for _MSC_VER when including intrin.h. Author: Lukas Fittl Reviewed-by: Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de --- configure | 22 +++++++++++++--------- configure.ac | 30 +++++++++++++++++------------- meson.build | 12 +++++++++--- src/port/pg_cpu_x86.c | 10 +++++----- 4 files changed, 44 insertions(+), 30 deletions(-) diff --git a/configure b/configure index 0d3f634abec28..407f773e771b2 100755 --- a/configure +++ b/configure @@ -17746,7 +17746,8 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h fi fi -# Check for __get_cpuid_count() and __cpuidex() in a similar fashion.
+# Check for __get_cpuid_count() and __cpuidex() separately, since we sometimes +# need __cpuidex() even if __get_cpuid_count() is available. { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5 $as_echo_n "checking for __get_cpuid_count... " >&6; } if ${pgac_cv__get_cpuid_count+:} false; then : @@ -17779,21 +17780,25 @@ if test x"$pgac_cv__get_cpuid_count" = x"yes"; then $as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h -else - # __cpuidex() - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5 +fi +# __cpuidex() +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5 $as_echo_n "checking for __cpuidex... " >&6; } if ${pgac_cv__cpuidex+:} false; then : $as_echo_n "(cached) " >&6 else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ -#include +#ifdef _MSC_VER + #include + #else + #include + #endif int main () { -unsigned int exx[4] = {0, 0, 0, 0}; - __cpuidex(exx, 7, 0); +int exx[4] = {0, 0, 0, 0}; + __cpuidex(exx, 7, 0); ; return 0; @@ -17809,11 +17814,10 @@ rm -f core conftest.err conftest.$ac_objext \ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5 $as_echo "$pgac_cv__cpuidex" >&6; } - if test x"$pgac_cv__cpuidex" = x"yes"; then +if test x"$pgac_cv__cpuidex" = x"yes"; then $as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h - fi fi # Check for XSAVE intrinsics diff --git a/configure.ac b/configure.ac index f8327a7020a03..96674e8005b66 100644 --- a/configure.ac +++ b/configure.ac @@ -2107,7 +2107,8 @@ else fi fi -# Check for __get_cpuid_count() and __cpuidex() in a similar fashion. +# Check for __get_cpuid_count() and __cpuidex() separately, since we sometimes +# need __cpuidex() even if __get_cpuid_count() is available. 
AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count], [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], [[unsigned int exx[4] = {0, 0, 0, 0}; @@ -2117,18 +2118,21 @@ AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count], [pgac_cv__get_cpuid_count="no"])]) if test x"$pgac_cv__get_cpuid_count" = x"yes"; then AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid_count.]) -else - # __cpuidex() - AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex], - [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], - [[unsigned int exx[4] = {0, 0, 0, 0}; - __cpuidex(exx, 7, 0); - ]])], - [pgac_cv__cpuidex="yes"], - [pgac_cv__cpuidex="no"])]) - if test x"$pgac_cv__cpuidex" = x"yes"; then - AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) - fi +fi +# __cpuidex() +AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#ifdef _MSC_VER + #include + #else + #include + #endif], + [[int exx[4] = {0, 0, 0, 0}; + __cpuidex(exx, 7, 0); + ]])], + [pgac_cv__cpuidex="yes"], + [pgac_cv__cpuidex="no"])]) +if test x"$pgac_cv__cpuidex" = x"yes"; then + AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) fi # Check for XSAVE intrinsics diff --git a/meson.build b/meson.build index 0a181909fab15..d1208c747e250 100644 --- a/meson.build +++ b/meson.build @@ -2132,7 +2132,8 @@ elif cc.links(''' endif -# Check for __get_cpuid_count() and __cpuidex() in a similar fashion. +# Check for __get_cpuid_count() and __cpuidex() separately, since we sometimes +# need __cpuidex() even if __get_cpuid_count() is available. 
if cc.links(''' #include int main(int arg, char **argv) @@ -2143,11 +2144,16 @@ if cc.links(''' ''', name: '__get_cpuid_count', args: test_c_args) cdata.set('HAVE__GET_CPUID_COUNT', 1) -elif cc.links(''' +endif +if cc.links(''' + #ifdef _MSC_VER #include + #else + #include + #endif int main(int arg, char **argv) { - unsigned int exx[4] = {0, 0, 0, 0}; + int exx[4] = {0, 0, 0, 0}; __cpuidex(exx, 7, 0); } ''', name: '__cpuidex', diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index 751f52f1e4214..b1aa71b888739 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -17,12 +17,12 @@ #if defined(USE_SSE2) || defined(__i386__) -#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) -#include -#endif - -#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX) +#if defined(HAVE__CPUID) || defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) || defined(HAVE__CPUIDEX) +#if defined(_MSC_VER) #include +#else +#include +#endif /* defined(_MSC_VER) */ #endif #ifdef HAVE_XSAVE_INTRINSICS From ba1d96e64f0eae8833ce17149f9c78c549abc073 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 24 Feb 2026 23:44:28 -0800 Subject: [PATCH 3/7] pg_test_timing: Reduce per-loop overhead The pg_test_timing program was previously using INSTR_TIME_GET_NANOSEC on an absolute instr_time value in order to do a diff, which goes against the spirit of how the GET_* macros are supposed to be used, and will cause overhead in a future change that assumes these macros are typically used on intervals only. Additionally the program was doing unnecessary work in the test loop by measuring the time elapsed, instead of checking the existing current time measurement against a target end time. To support that, introduce a new INSTR_TIME_ADD_NANOSEC macro that allows adding user-defined nanoseconds to an instr_time variable. 
Author: Lukas Fittl Reviewed-by: Discussion: --- src/bin/pg_test_timing/pg_test_timing.c | 28 ++++++++++++------------- src/include/portability/instr_time.h | 8 +++++++ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index a5621251afcee..264903ebbf621 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -152,14 +152,11 @@ handle_args(int argc, char *argv[]) static uint64 test_timing(unsigned int duration) { - uint64 total_time; - int64 time_elapsed = 0; uint64 loop_count = 0; - uint64 prev, - cur; instr_time start_time, end_time, - temp; + prev, + cur; /* * Pre-zero the statistics data structures. They're already zero by @@ -171,20 +168,24 @@ test_timing(unsigned int duration) largest_diff = 0; largest_diff_count = 0; - total_time = duration > 0 ? duration * INT64CONST(1000000000) : 0; - INSTR_TIME_SET_CURRENT(start_time); - cur = INSTR_TIME_GET_NANOSEC(start_time); + cur = start_time; - while (time_elapsed < total_time) + end_time = start_time; + INSTR_TIME_ADD_NANOSEC(end_time, duration > 0 ? duration * NS_PER_S : 0); + + while (INSTR_TIME_GT(end_time, cur)) { int32 diff, bits; + instr_time diff_time; prev = cur; - INSTR_TIME_SET_CURRENT(temp); - cur = INSTR_TIME_GET_NANOSEC(temp); - diff = cur - prev; + INSTR_TIME_SET_CURRENT(cur); + + diff_time = cur; + INSTR_TIME_SUBTRACT(diff_time, prev); + diff = INSTR_TIME_GET_NANOSEC(diff_time); /* Did time go backwards? 
*/ if (unlikely(diff < 0)) @@ -217,10 +218,9 @@ test_timing(unsigned int duration) largest_diff_count++; loop_count++; - INSTR_TIME_SUBTRACT(temp, start_time); - time_elapsed = INSTR_TIME_GET_NANOSEC(temp); } + /* Refresh end time to be the actual time spent (vs the target end time) */ INSTR_TIME_SET_CURRENT(end_time); INSTR_TIME_SUBTRACT(end_time, start_time); diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 8b6baeffd3e46..0a1fff7c487ae 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -22,6 +22,8 @@ * * INSTR_TIME_ADD(x, y) x += y * + * INSTR_TIME_ADD_NANOSEC(t, n) t += n in nanoseconds (converts to ticks) + * * INSTR_TIME_SUBTRACT(x, y) x -= y * * INSTR_TIME_ACCUM_DIFF(x, y, z) x += (y - z) @@ -125,6 +127,9 @@ pg_clock_gettime_ns(void) #define INSTR_TIME_GET_NANOSEC(t) \ ((int64) (t).ticks) +#define INSTR_TIME_ADD_NANOSEC(t, n) \ + ((t).ticks += (n)) + #else /* WIN32 */ @@ -159,6 +164,9 @@ GetTimerFrequency(void) #define INSTR_TIME_GET_NANOSEC(t) \ ((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency()))) +#define INSTR_TIME_ADD_NANOSEC(t, n) \ + ((t).ticks += ((n) / ((double) NS_PER_S / GetTimerFrequency()))) + #endif /* WIN32 */ From 8a863f7b31cf30bbb7c9771e6cc84063a259a597 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Fri, 25 Jul 2025 17:57:20 -0700 Subject: [PATCH 4/7] instrumentation: Streamline ticks to nanosecond conversion across platforms The timing infrastructure (INSTR_* macros) measures time elapsed using clock_gettime() on POSIX systems, which returns the time as nanoseconds, and QueryPerformanceCounter() on Windows, which is a specialized timing clock source that returns a tick counter that needs to be converted to nanoseconds using the result of QueryPerformanceFrequency(). This conversion currently happens ad-hoc on Windows, e.g.
when calling INSTR_TIME_GET_NANOSEC, which calls QueryPerformanceFrequency() on every invocation, despite the frequency being stable after program start, incurring unnecessary overhead. It also causes a fractured implementation where macros are defined differently between platforms. To ease code readability, and prepare for a future change that intends to use a ticks-to-nanosecond conversion on x86-64 for TSC use, introduce a new pg_ticks_to_ns() function that gets called on all platforms. This function relies on a separately initialized ticks_per_ns_scaled value, that represents the conversion ratio. This value is initialized from QueryPerformanceFrequency() on Windows, and set to zero on x86-64 POSIX systems, which results in the ticks being treated as nanoseconds. Other architectures always directly return the original ticks. To support this, pg_initialize_timing() is introduced, and is now mandatory for both the backend and any frontend programs to call before utilizing INSTR_* macros. 
Author: Lukas Fittl Author: Andres Freund Author: David Geier Reviewed-by: Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de --- src/backend/postmaster/postmaster.c | 5 + src/bin/pg_test_timing/pg_test_timing.c | 3 + src/bin/pgbench/pgbench.c | 3 + src/bin/psql/startup.c | 4 + src/common/Makefile | 1 + src/common/instr_time.c | 91 +++++++++++++++ src/common/meson.build | 1 + src/include/portability/instr_time.h | 143 +++++++++++++++++++----- src/test/regress/pg_regress.c | 2 + 9 files changed, 223 insertions(+), 30 deletions(-) create mode 100644 src/common/instr_time.c diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 3fac46c402b20..6c5ba723e78ce 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -1945,6 +1945,11 @@ InitProcessGlobals(void) MyStartTimestamp = GetCurrentTimestamp(); MyStartTime = timestamptz_to_time_t(MyStartTimestamp); + /* + * Initialize timing infrastructure + */ + pg_initialize_timing(); + /* * Set a different global seed in every process. 
We want something * unpredictable, so if possible, use high-quality random bits for the diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index 264903ebbf621..1d9ee4fb5882c 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -43,6 +43,9 @@ main(int argc, char *argv[]) handle_args(argc, argv); + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); + loop_count = test_timing(test_duration); output(loop_count); diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 1dae918cc09d2..c969afab3a595 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -6820,6 +6820,9 @@ main(int argc, char **argv) int exit_code = 0; struct timeval tv; + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); + /* * Record difference between Unix time and instr_time time. We'll use * this for logging and aggregation. 
diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c index 9a397ec87b736..69d044d405d5b 100644 --- a/src/bin/psql/startup.c +++ b/src/bin/psql/startup.c @@ -24,6 +24,7 @@ #include "help.h" #include "input.h" #include "mainloop.h" +#include "portability/instr_time.h" #include "settings.h" /* @@ -327,6 +328,9 @@ main(int argc, char *argv[]) PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL); + /* initialize timing infrastructure (required for INSTR_* calls) */ + pg_initialize_timing(); + SyncVariables(); if (options.list_dbs) diff --git a/src/common/Makefile b/src/common/Makefile index 2c720caa50972..1a2fbbe887f22 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -59,6 +59,7 @@ OBJS_COMMON = \ file_perm.o \ file_utils.o \ hashfn.o \ + instr_time.o \ ip.o \ jsonapi.o \ keywords.o \ diff --git a/src/common/instr_time.c b/src/common/instr_time.c new file mode 100644 index 0000000000000..48e8283d166a1 --- /dev/null +++ b/src/common/instr_time.c @@ -0,0 +1,91 @@ +/*------------------------------------------------------------------------- + * + * instr_time.c + * Non-inline parts of the portable high-precision interval timing + * implementation + * + * Portions Copyright (c) 2026, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/common/instr_time.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "portability/instr_time.h" + +/* + * Stores what the number of ticks needs to be multiplied with to end up + * with nanoseconds using integer math. + * + * On certain platforms (currently Windows) the ticks to nanoseconds conversion + * requires floating point math because: + * + * sec = ticks / frequency_hz + * ns = ticks / frequency_hz * 1,000,000,000 + * ns = ticks * (1,000,000,000 / frequency_hz) + * ns = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz + * + * Here, 'ns' is usually a floating number.
For example for a 2.5 GHz CPU + * the scaling factor becomes 1,000,000 / 2,500,000 = 1.2. + * + * To be able to use integer math we work around the lack of precision. We + * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the + * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by + * the same amount. We utilize unsigned integers even though ticks are stored + * as a signed value to encourage compilers to generate better assembly. + * + * We remember the maximum number of ticks that can be multiplied by the scale + * factor without overflowing so we can check via a * b > max <=> a > max / b. + * + * On all other platforms we are using clock_gettime(), which uses nanoseconds + * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns + * to return the original value. + */ +uint64 ticks_per_ns_scaled = 0; +uint64 max_ticks_no_overflow = 0; +bool timing_initialized = false; + +static void set_ticks_per_ns(void); + +void +pg_initialize_timing(void) +{ + if (timing_initialized) + return; + + set_ticks_per_ns(); + timing_initialized = true; +} + +#ifndef WIN32 + +static void +set_ticks_per_ns(void) +{ + ticks_per_ns_scaled = 0; + max_ticks_no_overflow = 0; +} + +#else /* WIN32 */ + +/* GetTimerFrequency returns counts per second */ +static inline double +GetTimerFrequency(void) +{ + LARGE_INTEGER f; + + QueryPerformanceFrequency(&f); + return (double) f.QuadPart; +} + +static void +set_ticks_per_ns(void) +{ + ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency(); + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; +} + +#endif /* WIN32 */ diff --git a/src/common/meson.build b/src/common/meson.build index 4f9b8b8263d55..9bd55cda95b10 100644 --- a/src/common/meson.build +++ b/src/common/meson.build @@ -13,6 +13,7 @@ common_sources = files( 'file_perm.c', 'file_utils.c', 'hashfn.c', + 'instr_time.c', 'ip.c', 'jsonapi.c', 'keywords.c', diff --git 
a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 0a1fff7c487ae..e1584695520a3 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -80,11 +80,33 @@ typedef struct instr_time #define NS_PER_MS INT64CONST(1000000) #define NS_PER_US INT64CONST(1000) +/* Shift amount for fixed-point ticks-to-nanoseconds conversion. */ +#define TICKS_TO_NS_SHIFT 14 -#ifndef WIN32 +#ifdef WIN32 +#define PG_INSTR_TICKS_TO_NS 1 +#else +#define PG_INSTR_TICKS_TO_NS 0 +#endif + +/* + * Variables used to translate ticks to nanoseconds, initialized by + * pg_initialize_timing. + */ +extern PGDLLIMPORT uint64 ticks_per_ns_scaled; +extern PGDLLIMPORT uint64 max_ticks_no_overflow; +extern PGDLLIMPORT bool timing_initialized; + +/* + * Initialize timing infrastructure + * + * This must be called at least once before using INSTR_TIME_SET_CURRENT* macros. + */ +extern void pg_initialize_timing(void); +#ifndef WIN32 -/* Use clock_gettime() */ +/* On POSIX, use clock_gettime() for system clock source */ #include @@ -108,67 +130,119 @@ typedef struct instr_time #define PG_INSTR_CLOCK CLOCK_REALTIME #endif -/* helper for INSTR_TIME_SET_CURRENT */ static inline instr_time -pg_clock_gettime_ns(void) +pg_get_ticks(void) { instr_time now; struct timespec tmp; + Assert(timing_initialized); + clock_gettime(PG_INSTR_CLOCK, &tmp); now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec; return now; } -#define INSTR_TIME_SET_CURRENT(t) \ - ((t) = pg_clock_gettime_ns()) - -#define INSTR_TIME_GET_NANOSEC(t) \ - ((int64) (t).ticks) - -#define INSTR_TIME_ADD_NANOSEC(t, n) \ - ((t).ticks += (n)) - - #else /* WIN32 */ +/* On Windows, use QueryPerformanceCounter() for system clock source */ -/* Use QueryPerformanceCounter() */ - -/* helper for INSTR_TIME_SET_CURRENT */ static inline instr_time -pg_query_performance_counter(void) +pg_get_ticks(void) { instr_time now; LARGE_INTEGER tmp; + Assert(timing_initialized); + QueryPerformanceCounter(&tmp); 
now.ticks = tmp.QuadPart; return now; } -static inline double -GetTimerFrequency(void) +#endif /* WIN32 */ + +static inline int64 +pg_ticks_to_ns(int64 ticks) { - LARGE_INTEGER f; +#if PG_INSTR_TICKS_TO_NS + int64 ns = 0; + + Assert(timing_initialized); + + /* + * Avoid doing work if we don't use scaled ticks, e.g. system clock on + * Unix + */ + if (ticks_per_ns_scaled == 0) + return ticks; + + /* + * Would multiplication overflow? If so perform computation in two parts. + */ + if (unlikely(ticks > (int64) max_ticks_no_overflow)) + { + /* + * To avoid overflow, first scale total ticks down by the fixed + * factor, and *afterwards* multiply them by the frequency-based scale + * factor. + * + * The remaining ticks can follow the regular formula, since they + * won't overflow. + */ + int64 count = ticks >> TICKS_TO_NS_SHIFT; + + ns = count * ticks_per_ns_scaled; + ticks -= (count << TICKS_TO_NS_SHIFT); + } + + ns += (ticks * ticks_per_ns_scaled) >> TICKS_TO_NS_SHIFT; + + return ns; +#else + Assert(timing_initialized); - QueryPerformanceFrequency(&f); - return (double) f.QuadPart; + return ticks; +#endif /* PG_INSTR_TICKS_TO_NS */ } -#define INSTR_TIME_SET_CURRENT(t) \ - ((t) = pg_query_performance_counter()) +static inline int64 +pg_ns_to_ticks(int64 ns) +{ +#if PG_INSTR_TICKS_TO_NS + int64 ticks = 0; -#define INSTR_TIME_GET_NANOSEC(t) \ - ((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency()))) + Assert(timing_initialized); -#define INSTR_TIME_ADD_NANOSEC(t, n) \ - ((t).ticks += ((n) / ((double) NS_PER_S / GetTimerFrequency()))) + /* + * If ticks_per_ns_scaled is zero, ticks are already in nanoseconds (e.g. + * system clock on Unix). + */ + if (ticks_per_ns_scaled == 0) + return ns; -#endif /* WIN32 */ + /* + * The reverse of pg_ticks_to_ns to avoid a similar overflow problem. 
+ */ + if (unlikely(ns > (INT64_MAX >> TICKS_TO_NS_SHIFT))) + { + int64 count = ns / ticks_per_ns_scaled; + + ticks = count << TICKS_TO_NS_SHIFT; + ns -= count * ticks_per_ns_scaled; + } + ticks += (ns << TICKS_TO_NS_SHIFT) / ticks_per_ns_scaled; + + return ticks; +#else + Assert(timing_initialized); + + return ns; +#endif /* PG_INSTR_TICKS_TO_NS */ +} /* * Common macros @@ -178,10 +252,16 @@ GetTimerFrequency(void) #define INSTR_TIME_SET_ZERO(t) ((t).ticks = 0) +#define INSTR_TIME_SET_CURRENT(t) \ + ((t) = pg_get_ticks()) + #define INSTR_TIME_ADD(x,y) \ ((x).ticks += (y).ticks) +#define INSTR_TIME_ADD_NANOSEC(t, n) \ + ((t).ticks += pg_ns_to_ticks(n)) + #define INSTR_TIME_SUBTRACT(x,y) \ ((x).ticks -= (y).ticks) @@ -191,6 +271,9 @@ GetTimerFrequency(void) #define INSTR_TIME_GT(x,y) \ ((x).ticks > (y).ticks) +#define INSTR_TIME_GET_NANOSEC(t) \ + (pg_ticks_to_ns((t).ticks)) + #define INSTR_TIME_GET_DOUBLE(t) \ ((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S) diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index b8b6a91198763..5cd1c9195d4d9 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -2110,6 +2110,8 @@ regression_main(int argc, char *argv[], progname = get_progname(argv[0]); set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_regress")); + pg_initialize_timing(); + get_restricted_token(); atexit(stop_postmaster); From 0d53aa6cdcdc34c293cccc4c97a3a191087de6ef Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Wed, 11 Mar 2026 00:55:03 -0700 Subject: [PATCH 5/7] instrumentation: Use Time-Stamp Counter (TSC) on x86-64 for faster measurements This allows the direct use of the Time-Stamp Counter (TSC) value retrieved from the CPU using RDTSC/RDTSCP instructions, instead of APIs like clock_gettime() on POSIX systems. This reduces the overhead of EXPLAIN with ANALYZE and TIMING ON. Tests showed that runtime when instrumented can be reduced by up to 10% for queries moving lots of rows through the plan.
To control use of the TSC, the new "timing_clock_source" GUC is introduced, whose default ("auto") automatically uses the TSC when running on Linux/x86-64, in case the system clocksource is reported as "tsc". The use of the system APIs can be enforced by setting "system", or on x86-64 architectures the use of TSC can be enforced by explicitly setting "tsc". In order to use the TSC the frequency is first determined by use of CPUID, and if not available, by running a short calibration loop at program start, falling back to the system time if TSC values are not stable. Note, that we split TSC usage into the RDTSC CPU instruction which does not wait for out-of-order execution (faster, less precise) and the RDTSCP instruction, which waits for outstanding instructions to retire. RDTSCP is deemed to have little benefit in the typical InstrStartNode() / InstrStopNode() use case of EXPLAIN, and can be up to twice as slow. To separate these use cases, the new macro INSTR_TIME_SET_CURRENT_FAST() is introduced, which uses RDTSC. The original macro INSTR_TIME_SET_CURRENT() uses RDTSCP and is supposed to be used when precision is more important than performance. When the system timing clock source is used both of these macros instead utilize the system APIs (clock_gettime / QueryPerformanceCounter) like before. 
Author: David Geier Author: Andres Freund Author: Lukas Fittl Reviewed-by: Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de --- doc/src/sgml/config.sgml | 54 ++++ src/backend/executor/instrument.c | 65 +++- src/backend/postmaster/launch_backend.c | 13 + src/backend/utils/misc/guc_parameters.dat | 11 + src/backend/utils/misc/guc_tables.c | 11 + src/backend/utils/misc/postgresql.conf.sample | 4 + src/bin/pg_test_timing/pg_test_timing.c | 4 +- src/common/instr_time.c | 303 +++++++++++++++++- src/include/port/pg_cpu.h | 9 + src/include/portability/instr_time.h | 171 ++++++++-- src/include/utils/guc_hooks.h | 3 + src/include/utils/guc_tables.h | 1 + src/port/pg_cpu_x86.c | 200 +++++++++++- src/tools/pgindent/typedefs.list | 1 + 14 files changed, 810 insertions(+), 40 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 8cdd826fbd37a..99a6593d9ac59 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2533,6 +2533,60 @@ include_dir 'conf.d' + + Timing + + + + timing_clock_source (enum) + + timing_clock_source configuration parameter + + + + + Selects the method for making timing measurements using the OS or specialized CPU + instructions. Possible values are: + + + + auto (automatically chooses TSC clock source for modern CPUs, + otherwise uses the OS system clock) + + + + + system (measures timing using the OS system clock) + + + + + tsc (measures timing using the x86-64 Time-Stamp Counter (TSC) + by directly executing RDTSC/RDTSCP instructions, see below) + + + + The default is auto. + + + If enabled, the TSC clock source will use the RDTSC instruction for the x86-64 + Time-Stamp Counter (TSC) to perform certain time measurements, for example during + EXPLAIN ANALYZE. The RDTSC instruction has less overhead than going through the OS + clock source, which for an EXPLAIN ANALYZE statement will show timing closer to the + actual runtime when timing is off. 
For timings that require higher precision the + RDTSCP instruction is used, which avoids inaccuracies due to CPU instruction re-ordering. + Use of RDTSC/RDTSCP is not supported on older CPUs or hypervisors that don't pass the TSC + frequency to guest VMs, and is not advised on systems that utilize an emulated TSC. + + + To help decide which clock source to use on an x86-64 system you can run the + pg_test_timing utility to check TSC availability, and + perform timing measurements. + + + + + Background Writer diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index a40610bc2522f..2a144d18113f6 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -72,7 +72,7 @@ InstrStartNode(Instrumentation *instr) if (!INSTR_TIME_IS_ZERO(instr->starttime)) elog(ERROR, "InstrStartNode called twice in a row"); else - INSTR_TIME_SET_CURRENT(instr->starttime); + INSTR_TIME_SET_CURRENT_FAST(instr->starttime); } /* save buffer usage totals at node entry, if needed */ @@ -99,7 +99,7 @@ InstrStopNode(Instrumentation *instr, double nTuples) if (INSTR_TIME_IS_ZERO(instr->starttime)) elog(ERROR, "InstrStopNode called without start"); - INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_SET_CURRENT_FAST(endtime); INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime); INSTR_TIME_SET_ZERO(instr->starttime); @@ -294,3 +294,64 @@ WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub) dst->wal_fpi_bytes += add->wal_fpi_bytes - sub->wal_fpi_bytes; dst->wal_buffers_full += add->wal_buffers_full - sub->wal_buffers_full; } + +/* GUC hooks for timing_clock_source */ + +#include "portability/instr_time.h" +#include "utils/guc_hooks.h" + +bool +check_timing_clock_source(int *newval, void **extra, GucSource source) +{ + /* + * Ensure timing is initialized.
On Windows (EXEC_BACKEND), GUC hooks can + * be called during InitializeGUCOptions() before InitProcessGlobals() has + * had a chance to run pg_initialize_timing(). + */ + pg_initialize_timing(); + +#if PG_INSTR_TSC_CLOCK + pg_initialize_timing_tsc(); + + if (*newval == TIMING_CLOCK_SOURCE_TSC && tsc_frequency_khz <= 0) + { + GUC_check_errdetail("TSC is not supported as timing clock source"); + return false; + } +#endif + + return true; +} + +void +assign_timing_clock_source(int newval, void *extra) +{ + /* + * Ignore the return code since the check hook already verified TSC is + * usable if it's explicitly requested + */ + pg_set_timing_clock_source(newval); +} + +const char * +show_timing_clock_source(void) +{ + switch (timing_clock_source) + { + case TIMING_CLOCK_SOURCE_AUTO: +#if PG_INSTR_TSC_CLOCK + if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC) + return "auto (tsc)"; +#endif + return "auto (system)"; + case TIMING_CLOCK_SOURCE_SYSTEM: + return "system"; +#if PG_INSTR_TSC_CLOCK + case TIMING_CLOCK_SOURCE_TSC: + return "tsc"; +#endif + } + + /* unreachable */ + return "?"; +} diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c index 30357845729ac..b75a4f6ec9af3 100644 --- a/src/backend/postmaster/launch_backend.c +++ b/src/backend/postmaster/launch_backend.c @@ -55,6 +55,7 @@ #ifdef EXEC_BACKEND #include "nodes/queryjumble.h" +#include "portability/instr_time.h" #include "storage/pg_shmem.h" #include "storage/spin.h" #endif @@ -132,6 +133,10 @@ typedef struct int MyPMChildSlot; +#if PG_INSTR_TSC_CLOCK + int32 tsc_frequency_khz; +#endif + + /* * These are only used by backend processes, but are here because passing * a socket needs some special handling on Windows. 
'client_sock' is an @@ -753,6 +758,10 @@ save_backend_variables(BackendParameters *param, param->MaxBackends = MaxBackends; param->num_pmchild_slots = num_pmchild_slots; +#if PG_INSTR_TSC_CLOCK + param->tsc_frequency_khz = tsc_frequency_khz; +#endif + #ifdef WIN32 param->PostmasterHandle = PostmasterHandle; if (!write_duplicated_handle(¶m->initial_signal_pipe, @@ -1012,6 +1021,10 @@ restore_backend_variables(BackendParameters *param) MaxBackends = param->MaxBackends; num_pmchild_slots = param->num_pmchild_slots; +#if PG_INSTR_TSC_CLOCK + tsc_frequency_khz = param->tsc_frequency_khz; +#endif + #ifdef WIN32 PostmasterHandle = param->PostmasterHandle; pgwin32_initial_signal_pipe = param->initial_signal_pipe; diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 0c9854ad8fc05..246c582eeed18 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -3003,6 +3003,17 @@ assign_hook => 'assign_timezone_abbreviations', }, +{ name => 'timing_clock_source', type => 'enum', context => 'PGC_USERSET', group => 'RESOURCES_TIME', + short_desc => 'Controls the clock source used for collecting timing measurements.', + long_desc => 'This enables the use of specialized clock sources, specifically the RDTSC clock source on x86-64 systems (if available), to support timing measurements with lower overhead during EXPLAIN and other instrumentation.', + variable => 'timing_clock_source', + boot_val => 'TIMING_CLOCK_SOURCE_AUTO', + options => 'timing_clock_source_options', + check_hook => 'check_timing_clock_source', + assign_hook => 'assign_timing_clock_source', + show_hook => 'show_timing_clock_source', +}, + { name => 'trace_connection_negotiation', type => 'bool', context => 'PGC_POSTMASTER', group => 'DEVELOPER_OPTIONS', short_desc => 'Logs details of pre-authentication connection handshake.', flags => 'GUC_NOT_IN_SAMPLE', diff --git a/src/backend/utils/misc/guc_tables.c 
b/src/backend/utils/misc/guc_tables.c index 1e14b7b4af060..c998e020e3e8f 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -92,6 +92,7 @@ #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" #include "utils/builtins.h" +#include "portability/instr_time.h" #include "utils/bytea.h" #include "utils/float.h" #include "utils/guc_hooks.h" @@ -373,6 +374,15 @@ static const struct config_enum_entry huge_pages_options[] = { {NULL, 0, false} }; +static const struct config_enum_entry timing_clock_source_options[] = { + {"auto", TIMING_CLOCK_SOURCE_AUTO, false}, + {"system", TIMING_CLOCK_SOURCE_SYSTEM, false}, +#if PG_INSTR_TSC_CLOCK + {"tsc", TIMING_CLOCK_SOURCE_TSC, false}, +#endif + {NULL, 0, false} +}; + static const struct config_enum_entry huge_pages_status_options[] = { {"off", HUGE_PAGES_OFF, false}, {"on", HUGE_PAGES_ON, false}, @@ -724,6 +734,7 @@ const char *const config_group_names[] = [CONN_AUTH_TCP] = gettext_noop("Connections and Authentication / TCP Settings"), [CONN_AUTH_AUTH] = gettext_noop("Connections and Authentication / Authentication"), [CONN_AUTH_SSL] = gettext_noop("Connections and Authentication / SSL"), + [RESOURCES_TIME] = gettext_noop("Resource Usage / Time"), [RESOURCES_MEM] = gettext_noop("Resource Usage / Memory"), [RESOURCES_DISK] = gettext_noop("Resource Usage / Disk"), [RESOURCES_KERNEL] = gettext_noop("Resource Usage / Kernel Resources"), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index e4abe6c007776..99d57ec9e211f 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -196,6 +196,10 @@ #max_files_per_process = 1000 # min 64 # (change requires restart) +# - Time - + +#timing_clock_source = auto # auto, system, tsc (if supported) + # - Background Writer - #bgwriter_delay = 200ms # 10-10000ms between rounds diff --git a/src/bin/pg_test_timing/pg_test_timing.c 
b/src/bin/pg_test_timing/pg_test_timing.c index 1d9ee4fb5882c..ee0e3a3b0abde 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -43,7 +43,9 @@ main(int argc, char *argv[]) handle_args(argc, argv); - /* initialize timing infrastructure (required for INSTR_* calls) */ + /* + * Initialize timing infrastructure (required for INSTR_* calls) + */ pg_initialize_timing(); loop_count = test_timing(test_duration); diff --git a/src/common/instr_time.c b/src/common/instr_time.c index 48e8283d166a1..c49f01350ecb0 100644 --- a/src/common/instr_time.c +++ b/src/common/instr_time.c @@ -14,14 +14,21 @@ */ #include "postgres.h" +#include + +#ifndef WIN32 +#include +#endif + +#include "port/pg_cpu.h" #include "portability/instr_time.h" /* * Stores what the number of ticks needs to be multiplied with to end up * with nanoseconds using integer math. * - * On certain platforms (currently Windows) the ticks to nanoseconds conversion - * requires floating point math because: + * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows) + * the ticks to nanoseconds conversion requires floating point math because: * * sec = ticks / frequency_hz * ns = ticks / frequency_hz * 1,000,000,000 @@ -40,16 +47,32 @@ * We remember the maximum number of ticks that can be multiplied by the scale * factor without overflowing so we can check via a * b > max <=> a > max / b. * - * On all other platforms we are using clock_gettime(), which uses nanoseconds + * In all other cases we are using clock_gettime(), which uses nanoseconds * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns * to return the original value. 
*/ uint64 ticks_per_ns_scaled = 0; uint64 max_ticks_no_overflow = 0; bool timing_initialized = false; +int timing_clock_source = TIMING_CLOCK_SOURCE_AUTO; static void set_ticks_per_ns(void); +#if PG_INSTR_TSC_CLOCK +static bool tsc_use_by_default(void); +static void set_ticks_per_ns_system(void); +static void set_ticks_per_ns_for_tsc(void); +#endif + +/* + * Initializes timing infrastructure. Must be called before making any use + * of INSTR* macros. + * + * If TSC support is compiled in, TSC detection (and, when the frequency + * cannot be read from CPUID, calibration) is done by pg_initialize_timing_tsc(). + * + * Calibration may take up to TSC_CALIBRATION_MAX_NS and delays program start. + */ void pg_initialize_timing(void) { @@ -60,10 +83,41 @@ pg_initialize_timing(void) timing_initialized = true; } +bool +pg_set_timing_clock_source(TimingClockSourceType source) +{ + Assert(timing_initialized); + +#if PG_INSTR_TSC_CLOCK + pg_initialize_timing_tsc(); +#endif + +#if PG_INSTR_TSC_CLOCK + switch (source) + { + case TIMING_CLOCK_SOURCE_AUTO: + use_tsc = (tsc_frequency_khz > 0) && tsc_use_by_default(); + break; + case TIMING_CLOCK_SOURCE_SYSTEM: + use_tsc = false; + break; + case TIMING_CLOCK_SOURCE_TSC: + if (tsc_frequency_khz <= 0) /* Tell caller TSC is not usable */ + return false; + use_tsc = true; + break; + } +#endif + + set_ticks_per_ns(); + timing_clock_source = source; + return true; +} + #ifndef WIN32 static void -set_ticks_per_ns(void) +set_ticks_per_ns_system(void) { ticks_per_ns_scaled = 0; max_ticks_no_overflow = 0; @@ -82,10 +136,249 @@ GetTimerFrequency(void) } static void -set_ticks_per_ns(void) +set_ticks_per_ns_system(void) { ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency(); max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; } #endif /* WIN32 */ + +static void +set_ticks_per_ns(void) +{ +#if PG_INSTR_TSC_CLOCK + if (use_tsc) + set_ticks_per_ns_for_tsc(); + else + set_ticks_per_ns_system(); +#else + 
set_ticks_per_ns_system(); +#endif +} + +/* TSC specific logic */ + +#if PG_INSTR_TSC_CLOCK + +bool use_tsc = false; + +int32 tsc_frequency_khz = -1; + +static uint32 tsc_calibrate(void); + +/* + * Detect the TSC frequency and whether RDTSCP is available on x86-64. + * + * This can't be reliably determined at compile time, since the + * availability of an "invariant" TSC (that is not affected by CPU + * frequency changes) is dependent on the CPU architecture. Additionally, + * there are cases where TSC availability is impacted by virtualization, + * where a simple cpuid feature check would not be enough. + */ +static void +tsc_detect_frequency(void) +{ + tsc_frequency_khz = 0; + + /* We require RDTSCP support, bail if not available */ + if (!x86_feature_available(PG_RDTSCP)) + return; + + /* Determine speed at which the TSC advances */ + tsc_frequency_khz = x86_tsc_frequency_khz(); + if (tsc_frequency_khz > 0) + return; + + /* + * CPUID did not give us the TSC frequency. If TSC is invariant and RDTSCP + * is available, we can measure the frequency by comparing TSC ticks + * against walltime using a short calibration loop. + */ + if (x86_feature_available(PG_TSC_INVARIANT)) + tsc_frequency_khz = tsc_calibrate(); +} + +/* + * Decides whether to use the TSC clock source if the user did not specify it + * one way or the other, and it is available (checked separately). + * + * Mirrors the Linux kernel's clocksource watchdog disable logic as updated in + * 2021 to reflect the reliability of the TSC on Intel platforms, see + * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion + * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/ + * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/ + * for reference. 
+ * + * When the CPU has an invariant TSC (which we require in x86_tsc_frequency_khz), + * TSC_ADJUST bit set (Intel-only), and the system has at most 4 physical + * packages (sockets), we consider the TSC trustworthy by default, matching the + * Linux kernel. + * + * On other CPU platforms (e.g. AMD), in a virtual machine, or on 8+ socket + * systems we don't have an easy way to determine the TSC's reliability. If on + * Linux, we can check if TSC is the active clocksource, based on it having run + * the watchdog logic to monitor TSC correctness. For other platforms the user + * must explicitly enable it via GUC instead. + */ +static bool +tsc_use_by_default(void) +{ + if (x86_feature_available(PG_TSC_ADJUST)) + { + int cpus_per_package = x86_logical_processors_per_package(); + long total_cpus; + +#ifdef _SC_NPROCESSORS_CONF + total_cpus = sysconf(_SC_NPROCESSORS_CONF); +#elif defined(WIN32) + { + SYSTEM_INFO si; + + GetSystemInfo(&si); + total_cpus = si.dwNumberOfProcessors; + } +#else + total_cpus = -1; +#endif /* _SC_NPROCESSORS_CONF / WIN32 */ + + if (total_cpus > 0 && cpus_per_package > 0 && (total_cpus / cpus_per_package) <= 4) + return true; + } + +#if defined(__linux__) + { + FILE *fp; + char buf[128]; + + fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); + if (fp) + { + bool is_tsc = (fgets(buf, sizeof(buf), fp) != NULL && + strcmp(buf, "tsc\n") == 0); + + fclose(fp); + if (is_tsc) + return true; + } + } +#endif + + return false; +} + +/* + * Calibrate the TSC frequency by comparing TSC ticks against walltime. + * + * Takes initial TSC and system clock snapshots, then loops, recomputing the + * frequency each TSC_CALIBRATION_SKIPS iterations from cumulative TSC + * ticks divided by elapsed time. + * + * Once the frequency estimate stabilizes (consecutive iterations agree), we + * consider it converged and the frequency in KHz is returned. 
If either too + * many iterations or a time limit passes without convergence, 0 is returned. + */ +#define TSC_CALIBRATION_MAX_NS (50 * NS_PER_MS) +#define TSC_CALIBRATION_ITERATIONS 1000000 +#define TSC_CALIBRATION_SKIPS 100 +#define TSC_CALIBRATION_STABLE_CYCLES 10 + +static uint32 +tsc_calibrate(void) +{ + instr_time initial_wall; + int64 initial_tsc; + double freq_khz = 0; + double prev_freq_khz = 0; + int stable_count = 0; + int64 prev_tsc; + uint32 unused; + + /* Ensure INSTR_* time below work on system time */ + set_ticks_per_ns_system(); + + INSTR_TIME_SET_CURRENT(initial_wall); + +#ifdef _MSC_VER + initial_tsc = __rdtscp(&unused); +#else + initial_tsc = __builtin_ia32_rdtscp(&unused); +#endif + prev_tsc = initial_tsc; + + for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++) + { + instr_time now_wall; + int64 now_tsc; + int64 elapsed_ns; + int64 elapsed_ticks; + + INSTR_TIME_SET_CURRENT(now_wall); + +#ifdef _MSC_VER + now_tsc = __rdtscp(&unused); +#else + now_tsc = __builtin_ia32_rdtscp(&unused); +#endif + + INSTR_TIME_SUBTRACT(now_wall, initial_wall); + elapsed_ns = INSTR_TIME_GET_NANOSEC(now_wall); + + /* Safety: bail out if we've taken too long */ + if (elapsed_ns >= TSC_CALIBRATION_MAX_NS) + break; + + elapsed_ticks = now_tsc - initial_tsc; + + /* + * Skip if this is not the Nth cycle where we measure, if TSC hasn't + * advanced, or we walked backwards for some reason. + */ + if (i % TSC_CALIBRATION_SKIPS != 0 || now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0) + continue; + + freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000; + + /* + * Once freq_khz / prev_freq_khz is small, check if it stays that way. + * If it does for long enough, we've got a winner frequency. 
+ */ + if (prev_freq_khz != 0 && fabs(1 - freq_khz / prev_freq_khz) < 0.0001) + { + stable_count++; + if (stable_count >= TSC_CALIBRATION_STABLE_CYCLES) + return (uint32) freq_khz; + } + else + stable_count = 0; + + prev_tsc = now_tsc; + prev_freq_khz = freq_khz; + } + + /* did not converge */ + return 0; +} + +static void +set_ticks_per_ns_for_tsc(void) +{ + ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / tsc_frequency_khz; + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; +} + +/* + * Initialize the TSC clock source by determining its usability and frequency. + * + * This can be called multiple times, as tsc_frequency_khz will be set to 0 + * if a prior call determined the TSC is not usable. On EXEC_BACKEND (Windows), + * the TSC frequency may also be set by restore_backend_variables. + */ +void +pg_initialize_timing_tsc(void) +{ + if (tsc_frequency_khz < 0) + tsc_detect_frequency(); +} + +#endif /* PG_INSTR_TSC_CLOCK */ diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index b93b828d3ac27..a32e67487f834 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -23,6 +23,12 @@ typedef enum X86FeatureId /* scalar registers and 128-bit XMM registers */ PG_SSE4_2, PG_POPCNT, + PG_HYPERVISOR, + + /* TSC flags */ + PG_RDTSCP, + PG_TSC_INVARIANT, + PG_TSC_ADJUST, /* 512-bit ZMM registers */ PG_AVX512_BW, @@ -45,6 +51,9 @@ x86_feature_available(X86FeatureId feature) return X86Features[feature]; } +extern int x86_logical_processors_per_package(void); +extern uint32 x86_tsc_frequency_khz(void); + #endif /* defined(USE_SSE2) || defined(__i386__) */ #endif /* PG_CPU_H */ diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index e1584695520a3..ce6794c0ada83 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -4,9 +4,10 @@ * portable high-precision interval timing * * This file provides an abstraction layer to hide portability issues 
in - * interval timing. On Unix we use clock_gettime(), and on Windows we use - * QueryPerformanceCounter(). These macros also give some breathing room to - * use other high-precision-timing APIs. + * interval timing. On x86 we use the RDTSC/RDTSCP instruction directly in + * certain cases, or alternatively clock_gettime() on Unix-like systems and + * QueryPerformanceCounter() on Windows. These macros also give some breathing + * room to use other high-precision-timing APIs. * * The basic data type is instr_time, which all callers should treat as an * opaque typedef. instr_time can store either an absolute time (of @@ -17,7 +18,11 @@ * * INSTR_TIME_SET_ZERO(t) set t to zero (memset is acceptable too) * - * INSTR_TIME_SET_CURRENT(t) set t to current time + * INSTR_TIME_SET_CURRENT_FAST(t) set t to current time without waiting + * for instructions in out-of-order window + * + * INSTR_TIME_SET_CURRENT(t) set t to current time while waiting for + * instructions in OOO to retire * * * INSTR_TIME_ADD(x, y) x += y @@ -83,27 +88,91 @@ typedef struct instr_time /* Shift amount for fixed-point ticks-to-nanoseconds conversion. */ #define TICKS_TO_NS_SHIFT 14 -#ifdef WIN32 -#define PG_INSTR_TICKS_TO_NS 1 -#else -#define PG_INSTR_TICKS_TO_NS 0 -#endif - /* * Variables used to translate ticks to nanoseconds, initialized by - * pg_initialize_timing. + * pg_initialize_timing and adjusted by pg_set_timing_clock_source calls or + * changes of the "timing_clock_source" GUC. + * + * Note that changing these values after setting an instr_time and before + * reading/converting it will lead to incorrect results. This is technically + * possible because the GUC can be changed at runtime, but unlikely, and we + * allow changing this at runtime to simplify testing of different sources. 
*/ extern PGDLLIMPORT uint64 ticks_per_ns_scaled; extern PGDLLIMPORT uint64 max_ticks_no_overflow; extern PGDLLIMPORT bool timing_initialized; +typedef enum +{ + TIMING_CLOCK_SOURCE_AUTO, + TIMING_CLOCK_SOURCE_SYSTEM, + TIMING_CLOCK_SOURCE_TSC +} TimingClockSourceType; + +extern int timing_clock_source; + /* * Initialize timing infrastructure * - * This must be called at least once before using INSTR_TIME_SET_CURRENT* macros. + * This must be called at least once before using INSTR_TIME_SET_CURRENT* + * macros. + * + * If you want to use the TSC clock source in a client program you must also + * call pg_set_timing_clock_source afterwards. */ extern void pg_initialize_timing(void); +/* + * Sets the time source to be used. Mainly intended for frontend programs, + * the backend should set it via the timing_clock_source GUC instead. + * + * Returns false if the clock source could not be set, for example when TSC + * is not available despite being explicitly set. + */ +extern bool pg_set_timing_clock_source(TimingClockSourceType source); + +#if defined(__x86_64__) || defined(_M_X64) +#define PG_INSTR_TSC_CLOCK 1 +#define PG_INSTR_TICKS_TO_NS 1 +#elif defined(WIN32) +#define PG_INSTR_TSC_CLOCK 0 +#define PG_INSTR_TICKS_TO_NS 1 +#else +#define PG_INSTR_TSC_CLOCK 0 +#define PG_INSTR_TICKS_TO_NS 0 +#endif + + +#if PG_INSTR_TSC_CLOCK +/* Whether to actually use TSC based on availability and GUC settings. */ +extern PGDLLIMPORT bool use_tsc; + +/* + * TSC frequency in kHz, set during initialization. + * + * -1 = not yet initialized, 0 = TSC not usable, >0 = frequency in kHz. + */ +extern PGDLLIMPORT int32 tsc_frequency_khz; + +extern void pg_initialize_timing_tsc(void); + +#endif /* PG_INSTR_TSC_CLOCK */ + +/* + * Returns the current timing clock source effectively in use, resolving + * TIMING_CLOCK_SOURCE_AUTO to either TIMING_CLOCK_SOURCE_SYSTEM or + * TIMING_CLOCK_SOURCE_TSC. 
+ */ +static inline TimingClockSourceType +pg_current_timing_clock_source(void) +{ +#if PG_INSTR_TSC_CLOCK + return use_tsc ? TIMING_CLOCK_SOURCE_TSC : TIMING_CLOCK_SOURCE_SYSTEM; +#else + return TIMING_CLOCK_SOURCE_SYSTEM; +#endif +} + #ifndef WIN32 /* On POSIX, use clock_gettime() for system clock source */ @@ -121,24 +190,27 @@ extern void pg_initialize_timing(void); * than CLOCK_MONOTONIC. In particular, as of macOS 10.12, Apple provides * CLOCK_MONOTONIC_RAW which is both faster to read and higher resolution than * their version of CLOCK_MONOTONIC. + * + * Note this does not get used in case the TSC clock source logic is used, + * which directly calls architecture specific timing instructions (e.g. RDTSC). */ #if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW) -#define PG_INSTR_CLOCK CLOCK_MONOTONIC_RAW +#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC_RAW #elif defined(CLOCK_MONOTONIC) -#define PG_INSTR_CLOCK CLOCK_MONOTONIC +#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC #else -#define PG_INSTR_CLOCK CLOCK_REALTIME +#define PG_INSTR_SYSTEM_CLOCK CLOCK_REALTIME #endif static inline instr_time -pg_get_ticks(void) +pg_get_ticks_system(void) { instr_time now; struct timespec tmp; Assert(timing_initialized); - clock_gettime(PG_INSTR_CLOCK, &tmp); + clock_gettime(PG_INSTR_SYSTEM_CLOCK, &tmp); now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec; return now; @@ -149,7 +221,7 @@ pg_get_ticks(void) /* On Windows, use QueryPerformanceCounter() for system clock source */ static inline instr_time -pg_get_ticks(void) +pg_get_ticks_system(void) { instr_time now; LARGE_INTEGER tmp; @@ -244,6 +316,66 @@ pg_ns_to_ticks(int64 ns) #endif /* PG_INSTR_TICKS_TO_NS */ } +#if PG_INSTR_TSC_CLOCK + +#ifdef _MSC_VER +#include +#endif /* defined(_MSC_VER) */ + +static inline instr_time +pg_get_ticks_fast(void) +{ + if (likely(use_tsc)) + { + instr_time now; + +#ifdef _MSC_VER + now.ticks = __rdtsc(); +#else + /* Avoid complex includes on clang/GCC that raise compile times */ + now.ticks = 
__builtin_ia32_rdtsc(); +#endif /* defined(_MSC_VER) */ + return now; + } + + return pg_get_ticks_system(); +} + +static inline instr_time +pg_get_ticks(void) +{ + if (likely(use_tsc)) + { + instr_time now; + uint32 unused; + +#ifdef _MSC_VER + now.ticks = __rdtscp(&unused); +#else + now.ticks = __builtin_ia32_rdtscp(&unused); +#endif /* defined(_MSC_VER) */ + return now; + } + + return pg_get_ticks_system(); +} + +#else + +static inline instr_time +pg_get_ticks_fast(void) +{ + return pg_get_ticks_system(); +} + +static inline instr_time +pg_get_ticks(void) +{ + return pg_get_ticks_system(); +} + +#endif /* PG_INSTR_TSC_CLOCK */ + /* * Common macros */ @@ -252,6 +384,9 @@ pg_ns_to_ticks(int64 ns) #define INSTR_TIME_SET_ZERO(t) ((t).ticks = 0) +#define INSTR_TIME_SET_CURRENT_FAST(t) \ + ((t) = pg_get_ticks_fast()) + #define INSTR_TIME_SET_CURRENT(t) \ ((t) = pg_get_ticks()) diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index b01697c1f606d..307f4fbaefe08 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -163,6 +163,9 @@ extern const char *show_timezone(void); extern bool check_timezone_abbreviations(char **newval, void **extra, GucSource source); extern void assign_timezone_abbreviations(const char *newval, void *extra); +extern void assign_timing_clock_source(int newval, void *extra); +extern bool check_timing_clock_source(int *newval, void **extra, GucSource source); +extern const char *show_timing_clock_source(void); extern bool check_transaction_buffers(int *newval, void **extra, GucSource source); extern bool check_transaction_deferrable(bool *newval, void **extra, GucSource source); extern bool check_transaction_isolation(int *newval, void **extra, GucSource source); diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 71a8016196138..63440b8e36c83 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -60,6 +60,7 @@ enum config_group 
CONN_AUTH_TCP, CONN_AUTH_AUTH, CONN_AUTH_SSL, + RESOURCES_TIME, RESOURCES_MEM, RESOURCES_DISK, RESOURCES_KERNEL, diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index b1aa71b888739..364b0bad2e081 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -25,6 +25,11 @@ #endif /* defined(_MSC_VER) */ #endif +#ifdef __linux__ +#include +#include +#endif + #ifdef HAVE_XSAVE_INTRINSICS #include #endif @@ -86,6 +91,21 @@ pg_cpuid(int leaf, CPUIDResult *r) #endif } +#if defined(HAVE__CPUIDEX) +static inline bool +pg_cpuidex(int leaf, int subleaf, CPUIDResult *r) +{ + int exx[4] = {0}; + + __cpuidex(exx, leaf, subleaf); + r->eax = exx[0]; + r->ebx = exx[1]; + r->ecx = exx[2]; + r->edx = exx[3]; + return true; +} +#endif + /* * Request CPUID information for the specified leaf and subleaf. * @@ -97,14 +117,7 @@ pg_cpuid_subleaf(int leaf, int subleaf, CPUIDResult *r) #if defined(HAVE__GET_CPUID_COUNT) return __get_cpuid_count(leaf, subleaf, &r->eax, &r->ebx, &r->ecx, &r->edx) == 1; #elif defined(HAVE__CPUIDEX) - int exx[4] = {0}; - - __cpuidex(exx, leaf, subleaf); - r->eax = exx[0]; - r->ebx = exx[1]; - r->ecx = exx[2]; - r->edx = exx[3]; - return true; + return pg_cpuidex(leaf, subleaf, r); #else memset(r, 0, sizeof(CPUIDResult)); return false; @@ -121,20 +134,23 @@ void set_x86_features(void) { CPUIDResult r = {0}; + bool have_osxsave; pg_cpuid(0x01, &r); X86Features[PG_SSE4_2] = r.ecx >> 20 & 1; X86Features[PG_POPCNT] = r.ecx >> 23 & 1; + X86Features[PG_HYPERVISOR] = r.ecx >> 31 & 1; + have_osxsave = r.ecx & (1 << 27); - /* All these features depend on OSXSAVE */ - if (r.ecx & (1 << 27)) - { - uint32 xcr0_val = 0; + pg_cpuid_subleaf(0x07, 0, &r); - /* second cpuid call on leaf 7 to check extended AVX-512 support */ + X86Features[PG_TSC_ADJUST] = (r.ebx & (1 << 1)) != 0; - pg_cpuid_subleaf(0x07, 0, &r); + /* leaf 7 features that depend on OSXSAVE */ + if (have_osxsave) + { + uint32 xcr0_val = 0; #ifdef HAVE_XSAVE_INTRINSICS /* get value of Extended 
Control Register */ @@ -153,7 +169,163 @@ set_x86_features(void) } } + /* Check for other TSC related flags */ + pg_cpuid(0x80000001, &r); + X86Features[PG_RDTSCP] = r.edx >> 27 & 1; + + pg_cpuid(0x80000007, &r); + X86Features[PG_TSC_INVARIANT] = r.edx >> 8 & 1; + X86Features[INIT_PG_X86] = true; } +/* + * Return the number of logical processors per physical CPU package (socket). + * + * This uses CPUID.0B (Extended Topology Enumeration) to enumerate topology + * levels. Each sub-leaf reports a level type in ECX[15:8] (1 = SMT, 2 = Core) + * and the number of logical processors at that level and below in EBX[15:0]. + * The value at the highest level gives us logical processors per package. + * + * Vendor-specific leaves (0x1F for Intel, 0x80000026 for AMD) provide + * finer-grained sub-package topology but are assumed to report the same + * per-package totals on current hardware. + * + * Returns 0 if topology information is not available. + */ +int +x86_logical_processors_per_package(void) +{ + int logical_per_package = 0; + + for (int subleaf = 0; subleaf < 8; subleaf++) + { + CPUIDResult r = {0}; + uint32 level_type; + + if (!pg_cpuid_subleaf(0x0B, subleaf, &r)) + return 0; + + level_type = (r.ecx >> 8) & 0xff; + + /* level_type 0 means end of enumeration */ + if (level_type == 0) + break; + + logical_per_package = r.ebx & 0xffff; + } + + return logical_per_package; +} + +/* TSC (Time-stamp Counter) handling code */ + +static uint32 x86_hypervisor_tsc_frequency_khz(void); + +/* + * Determine the TSC frequency of the CPU, where supported. + * + * Needed to interpret the tick value returned by RDTSC/RDTSCP. Return value of + * 0 indicates TSC is not invariant, or the frequency information was not + * accessible and the instructions should not be used. 
+ */ +uint32 +x86_tsc_frequency_khz(void) +{ + CPUIDResult r = {0}; + + if (!x86_feature_available(PG_TSC_INVARIANT)) + return 0; + + if (x86_feature_available(PG_HYPERVISOR)) + return x86_hypervisor_tsc_frequency_khz(); + + /* + * On modern Intel CPUs, the TSC is implemented by invariant timekeeping + * hardware, also called "Always Running Timer", or ART. The ART stays + * consistent even if the CPU changes frequency due to changing power + * levels. + * + * As documented in "Determining the Processor Base Frequency" in the + * "Intel® 64 and IA-32 Architectures Software Developer’s Manual", + * February 2026 Edition, we can get the TSC frequency as follows: + * + * Nominal TSC frequency = ( CPUID.15H:ECX[31:0] * CPUID.15H:EBX[31:0] ) / + * CPUID.15H:EAX[31:0] + * + * With CPUID.15H:ECX representing the nominal core crystal clock + * frequency, and EAX/EBX representing values used to translate the TSC + * value to that frequency, see "Chapter 20.17 "Time-Stamp Counter" of + * that manual. + * + * Older Intel CPUs, and other vendors do not set CPUID.15H:ECX, and as + * such we fall back to alternate approaches. + */ + pg_cpuid(0x15, &r); + if (r.ecx > 0) + { + /* + * EBX not being set indicates invariant TSC is not available. Require + * EAX being non-zero too, to avoid a theoretical divide by zero. + */ + if (r.eax == 0 || r.ebx == 0) + return 0; + + return r.ecx / 1000 * r.ebx / r.eax; + } + + /* + * When CPUID.15H is not available/incomplete, but we have verified an + * invariant TSC is used, we can instead get the processor base frequency + * in MHz from CPUID.16H:EAX, the "Processor Frequency Information Leaf". + */ + pg_cpuid(0x16, &r); + if (r.eax > 0) + return r.eax * 1000; + + return 0; +} + +/* + * Support for reading TSC frequency for hypervisors passing it to a guest VM. + * + * Two Hypervisors (VMware and KVM) are known to make TSC frequency in KHz + * available at the vendor-specific 0x40000010 leaf in the EAX register. 
+ * + * For some other Hypervisors that have an invariant TSC, e.g. HyperV, we would + * need to access an MSR to get the frequency (which is typically not available + * for unprivileged processes), so we instead rely on the TSC calibration logic. + */ +#define CPUID_HYPERVISOR_VMWARE(r) (r.ebx == 0x61774d56 && r.ecx == 0x4d566572 && r.edx == 0x65726177) /* VMwareVMware */ +#define CPUID_HYPERVISOR_KVM(r) (r.ebx == 0x4b4d564b && r.ecx == 0x564b4d56 && r.edx == 0x0000004d) /* KVMKVMKVM */ +static uint32 +x86_hypervisor_tsc_frequency_khz(void) +{ + CPUIDResult r = {0}; + +#if defined(HAVE__CPUIDEX) + + /* + * The hypervisor is determined using the 0x40000000 Hypervisor + * information leaf, which requires use of __cpuidex to set ECX to 0 to + * access it. + * + * The similar __get_cpuid_count function does not work as expected since + * it contains a check for __get_cpuid_max, which has been observed to be + * lower than the special Hypervisor leaf, despite it being available. + */ + pg_cpuidex(0x40000000, 0, &r); + + if (r.eax >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(r) || CPUID_HYPERVISOR_KVM(r))) + { + pg_cpuidex(0x40000010, 0, &r); + if (r.eax > 0) + return r.eax; + } +#endif /* HAVE__CPUIDEX */ + + return 0; +} + + #endif /* defined(USE_SSE2) || defined(__i386__) */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 7546babac1e48..3949752c38d03 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3149,6 +3149,7 @@ TimeoutId TimeoutType Timestamp TimestampTz +TimingClockSourceType TmFromChar TmToChar ToastAttrInfo From 62f4abdf8786c2064070580e65fb82726ccfeadb Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Thu, 12 Feb 2026 01:12:19 -0800 Subject: [PATCH 6/7] pg_test_timing: Also test RDTSC/RDTSCP timing and report time source Author: David Geier Author: Lukas Fittl Reviewed-by: Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de --- 
src/bin/pg_test_timing/pg_test_timing.c | 98 ++++++++++++++++++++++--- src/include/portability/instr_time.h | 6 ++ 2 files changed, 95 insertions(+), 9 deletions(-) diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index ee0e3a3b0abde..1e943fc8d2d8d 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -30,14 +30,16 @@ static long long int largest_diff_count; static void handle_args(int argc, char *argv[]); -static uint64 test_timing(unsigned int duration); +static void test_system_timing(void); +#if PG_INSTR_TSC_CLOCK +static void test_tsc_timing(void); +#endif +static uint64 test_timing(unsigned int duration, TimingClockSourceType source, bool fast_timing); static void output(uint64 loop_count); int main(int argc, char *argv[]) { - uint64 loop_count; - set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_test_timing")); progname = get_progname(argv[0]); @@ -48,9 +50,11 @@ main(int argc, char *argv[]) */ pg_initialize_timing(); - loop_count = test_timing(test_duration); + test_system_timing(); - output(loop_count); +#if PG_INSTR_TSC_CLOCK + test_tsc_timing(); +#endif return 0; } @@ -148,20 +152,92 @@ handle_args(int argc, char *argv[]) exit(1); } - printf(ngettext("Testing timing overhead for %u second.\n", - "Testing timing overhead for %u seconds.\n", + printf(ngettext("Testing timing overhead for %u second.\n\n", + "Testing timing overhead for %u seconds.\n\n", test_duration), test_duration); } +/* + * This tests default (non-fast) timing code. A clock source for that is + * always available. Hence, we can unconditionally output the result. + */ +static void +test_system_timing(void) +{ + uint64 loop_count; + + loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_SYSTEM, false); + output(loop_count); +} + +/* + * If on a supported architecture, test the TSC clock source. This clock + * source is not always available. 
In that case we print an informational + * message saying so. + * + * We first emit "slow" timings (RDTSCP on x86), which are used for higher + * precision measurements when the TSC clock source is enabled. We emit + * "fast" timings second (RDTSC on x86), which are used for faster timing + * measurements with lower precision. + */ +#if PG_INSTR_TSC_CLOCK +static void +test_tsc_timing(void) +{ + uint64 loop_count; + + printf("\n"); + loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_TSC, false); + if (loop_count > 0) + { + output(loop_count); + printf("\n"); + + /* Now, emit fast timing measurements */ + loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_TSC, true); + output(loop_count); + printf("\n"); + + printf(_("TSC frequency: %u kHz\n"), tsc_frequency_khz); + + pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_AUTO); + if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC) + printf(_("TSC clock source will be used by default, unless timing_clock_source is set to 'system'.\n")); + else + printf(_("TSC clock source will not be used by default, unless timing_clock_source is set to 'tsc'.\n")); + } + else + printf(_("TSC clock source is not usable, likely because the TSC frequency could not be determined. Are you running in an unsupported virtualized environment?\n")); +} +#endif + static uint64 -test_timing(unsigned int duration) +test_timing(unsigned int duration, TimingClockSourceType source, bool fast_timing) { uint64 loop_count = 0; instr_time start_time, end_time, prev, cur; + char *time_source = NULL; + + if (!pg_set_timing_clock_source(source)) + return 0; + + time_source = PG_INSTR_SYSTEM_CLOCK_NAME; + +#if PG_INSTR_TSC_CLOCK + if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC) + time_source = fast_timing ?
PG_INSTR_TSC_CLOCK_NAME_FAST : PG_INSTR_TSC_CLOCK_NAME; +#endif + + if (fast_timing) + printf(_("Fast clock source: %s\n"), time_source); + else if (source == TIMING_CLOCK_SOURCE_SYSTEM) + printf(_("System clock source: %s\n"), time_source); + else + printf(_("Clock source: %s\n"), time_source); /* * Pre-zero the statistics data structures. They're already zero by @@ -186,7 +262,11 @@ test_timing(unsigned int duration) instr_time diff_time; prev = cur; - INSTR_TIME_SET_CURRENT(cur); + + if (fast_timing) + INSTR_TIME_SET_CURRENT_FAST(cur); + else + INSTR_TIME_SET_CURRENT(cur); diff_time = cur; INSTR_TIME_SUBTRACT(diff_time, prev); diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index ce6794c0ada83..f0d1118fc4d3d 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -133,6 +133,8 @@ extern bool pg_set_timing_clock_source(TimingClockSourceType source); #if defined(__x86_64__) || defined(_M_X64) #define PG_INSTR_TSC_CLOCK 1 +#define PG_INSTR_TSC_CLOCK_NAME_FAST "RDTSC" +#define PG_INSTR_TSC_CLOCK_NAME "RDTSCP" #define PG_INSTR_TICKS_TO_NS 1 #elif defined(WIN32) #define PG_INSTR_TSC_CLOCK 0 @@ -196,10 +198,13 @@ pg_current_timing_clock_source(void) */ #if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW) #define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC_RAW +#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC_RAW)" #elif defined(CLOCK_MONOTONIC) #define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC +#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC)" #else #define PG_INSTR_SYSTEM_CLOCK CLOCK_REALTIME +#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_REALTIME)" #endif static inline instr_time @@ -220,6 +225,7 @@ pg_get_ticks_system(void) /* On Windows, use QueryPerformanceCounter() for system clock source */ +#define PG_INSTR_SYSTEM_CLOCK_NAME "QueryPerformanceCounter" static inline instr_time pg_get_ticks_system(void) { From 
2d48b0428a1b5ed45f47e2d59acba273d56eb55f Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 10 Mar 2026 01:38:14 -0700 Subject: [PATCH 7/7] instrumentation: ARM support for fast time measurements Similar to the RDTSC/RDTSCP instructions on x86-64, this introduces use of the cntvct_el0 instruction on ARM systems to access the generic timer that provides a synchronized ticks value across CPUs. Note this adds an exception for Apple Silicon CPUs, due to the observed fact that M3 and newer have different timer frequencies for the Efficiency and the Performance cores, and we can't be sure where we get scheduled. To simplify the implementation this does not support Windows on ARM, since it's quite rare and hard to test. Relies on the existing timing_clock_source GUC to control whether the TSC-like timer gets used, instead of the system timer. Author: Lukas Fittl Reviewed-by: Discussion: --- src/common/instr_time.c | 66 ++++++++++++++++++++++++++-- src/include/port/pg_cpu.h | 6 +++ src/include/portability/instr_time.h | 57 ++++++++++++++++++++++-- src/port/meson.build | 1 + src/port/pg_cpu_arm.c | 45 +++++++++++++++++++ 5 files changed, 167 insertions(+), 8 deletions(-) create mode 100644 src/port/pg_cpu_arm.c diff --git a/src/common/instr_time.c b/src/common/instr_time.c index c49f01350ecb0..ebd155e186acc 100644 --- a/src/common/instr_time.c +++ b/src/common/instr_time.c @@ -20,6 +20,10 @@ #include #endif +#if defined(__APPLE__) +#include +#endif + #include "port/pg_cpu.h" #include "portability/instr_time.h" @@ -157,7 +161,7 @@ set_ticks_per_ns(void) #endif } -/* TSC specific logic */ +/* Hardware clock specific logic (x86 TSC / AArch64 CNTVCT) */ #if PG_INSTR_TSC_CLOCK @@ -165,6 +169,19 @@ bool use_tsc = false; int32 tsc_frequency_khz = -1; +static void +set_ticks_per_ns_for_tsc(void) +{ + ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / tsc_frequency_khz; + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; +} + +#if defined(__x86_64__) || 
defined(_M_X64) + +/* + * x86-64 TSC specific logic + */ + static uint32 tsc_calibrate(void); /* @@ -360,13 +377,54 @@ tsc_calibrate(void) return 0; } +#elif defined(__aarch64__) + +/* + * Check whether this is a heterogeneous Apple Silicon P+E core system + * where CNTVCT_EL0 may tick at different rates on different core types. + */ +static bool +aarch64_has_heterogeneous_cores(void) +{ +#if defined(__APPLE__) + int nperflevels = 0; + size_t len = sizeof(nperflevels); + + if (sysctlbyname("hw.nperflevels", &nperflevels, &len, NULL, 0) == 0) + return nperflevels > 1; +#endif + + return false; +} + +/* + * Detect the generic timer frequency on AArch64. + */ static void -set_ticks_per_ns_for_tsc(void) +tsc_detect_frequency(void) { - ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / tsc_frequency_khz; - max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; + if (aarch64_has_heterogeneous_cores()) + { + tsc_frequency_khz = 0; + return; + } + + tsc_frequency_khz = aarch64_cntvct_frequency_khz(); +} + +/* + * The ARM generic timer is architecturally guaranteed to be monotonic and + * synchronized across cores of the same type, so we always use it by default + * when available and cores are homogeneous. + */ +static bool +tsc_use_by_default(void) +{ + return true; } +#endif /* defined(__aarch64__) */ + /* * Initialize the TSC clock source by determining its usability and frequency. 
* diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index a32e67487f834..82df66f381e03 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -56,4 +56,10 @@ extern uint32 x86_tsc_frequency_khz(void); #endif /* defined(USE_SSE2) || defined(__i386__) */ +#if defined(__aarch64__) + +extern uint32 aarch64_cntvct_frequency_khz(void); + +#endif /* defined(__aarch64__) */ + #endif /* PG_CPU_H */ diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index f0d1118fc4d3d..0a6585ec17deb 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -4,8 +4,9 @@ * portable high-precision interval timing * * This file provides an abstraction layer to hide portability issues in - * interval timing. On x86 we use the RDTSC/RDTSCP instruction directly in - * certain cases, or alternatively clock_gettime() on Unix-like systems and + * interval timing. On x86 we use the RDTSC/RDTSCP instruction, and on + * AArch64 the CNTVCT_EL0 generic timer, directly in certain cases, or + * alternatively clock_gettime() on Unix-like systems and * QueryPerformanceCounter() on Windows. These macros also give some breathing * room to use other high-precision-timing APIs. * @@ -136,6 +137,11 @@ extern bool pg_set_timing_clock_source(TimingClockSourceType source); #define PG_INSTR_TSC_CLOCK_NAME_FAST "RDTSC" #define PG_INSTR_TSC_CLOCK_NAME "RDTSCP" #define PG_INSTR_TICKS_TO_NS 1 +#elif defined(__aarch64__) && !defined(WIN32) +#define PG_INSTR_TSC_CLOCK 1 +#define PG_INSTR_TSC_CLOCK_NAME_FAST "CNTVCT_EL0" +#define PG_INSTR_TSC_CLOCK_NAME "CNTVCT_EL0 (ISB)" +#define PG_INSTR_TICKS_TO_NS 1 #elif defined(WIN32) #define PG_INSTR_TSC_CLOCK 0 #define PG_INSTR_TICKS_TO_NS 1 @@ -144,7 +150,6 @@ extern bool pg_set_timing_clock_source(TimingClockSourceType source); #define PG_INSTR_TICKS_TO_NS 0 #endif - #if PG_INSTR_TSC_CLOCK /* Whether to actually use TSC based on availability and GUC settings. 
*/ extern PGDLLIMPORT bool use_tsc; @@ -324,6 +329,8 @@ pg_ns_to_ticks(int64 ns) #if PG_INSTR_TSC_CLOCK +#if defined(__x86_64__) || defined(_M_X64) + #ifdef _MSC_VER #include #endif /* defined(_MSC_VER) */ @@ -366,7 +373,49 @@ pg_get_ticks(void) return pg_get_ticks_system(); } -#else +#elif defined(__aarch64__) && !defined(WIN32) + +/* + * Read the ARM generic timer counter (CNTVCT_EL0). + * + * The "fast" variant reads the counter without a barrier, analogous to RDTSC + * on x86. The regular variant issues an ISB (Instruction Synchronization + * Barrier) first, which acts as a serializing instruction analogous to RDTSCP, + * ensuring all preceding instructions have completed before reading the + * counter. + */ +static inline instr_time +pg_get_ticks_fast(void) +{ + if (likely(use_tsc)) + { + instr_time now; + + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + + return pg_get_ticks_system(); +} + +static inline instr_time +pg_get_ticks(void) +{ + if (likely(use_tsc)) + { + instr_time now; + + __builtin_arm_isb(0xf); + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + + return pg_get_ticks_system(); +} + +#endif /* defined(__aarch64__) */ + +#else /* !PG_INSTR_TSC_CLOCK */ static inline instr_time pg_get_ticks_fast(void) diff --git a/src/port/meson.build b/src/port/meson.build index 7296f8e3c037f..110bcd28edd4c 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_cpu_arm.c', 'pg_cpu_x86.c', 'pg_localeconv_r.c', 'pg_numa.c', diff --git a/src/port/pg_cpu_arm.c b/src/port/pg_cpu_arm.c new file mode 100644 index 0000000000000..6fd9dd892ec98 --- /dev/null +++ b/src/port/pg_cpu_arm.c @@ -0,0 +1,45 @@ +/*------------------------------------------------------------------------- + * + * pg_cpu_arm.c + * Runtime CPU feature detection for AArch64 + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 
1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_cpu_arm.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" + +#if defined(__aarch64__) && !defined(WIN32) + +#include "port/pg_cpu.h" + +/* + * Return the frequency of the ARM generic timer (CNTVCT_EL0) in kHz. + * + * The CNTFRQ_EL0 system register holds the timer frequency in Hz. Reads from + * EL0 (userspace) are permitted when the OS enables user access to the + * counter registers (via CNTKCTL_EL1), which Linux and macOS do. The firmware + * sets the frequency at boot and it does not change. + * + * Returns 0 if the frequency is not available (should not happen on conforming + * implementations).
 */ +uint32 +aarch64_cntvct_frequency_khz(void) +{ + uint64 freq; + + freq = __builtin_arm_rsr64("cntfrq_el0"); + + if (freq == 0) + return 0; + + return (uint32) (freq / 1000); +} + +#endif /* defined(__aarch64__) && !defined(WIN32) */