diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index d3fea738ca33c..a57761c6facd3 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2533,6 +2533,60 @@ include_dir 'conf.d'
+
+ Timing
+
+
+
+ timing_clock_source (enum)
+
+ timing_clock_source configuration parameter
+
+
+
+
+ Selects the method for making timing measurements using the OS or specialized CPU
+ instructions. Possible values are:
+
+
+
+ auto (automatically chooses TSC clock source for modern CPUs,
+ otherwise uses the OS system clock)
+
+
+
+
+ system (measures timing using the OS system clock)
+
+
+
+
+ tsc (measures timing using the x86-64 Time-Stamp Counter (TSC)
+ by directly executing RDTSC/RDTSCP instructions, see below)
+
+
+
+ The default is auto.
+
+
+ If enabled, the TSC clock source will use the RDTSC instruction for the x86-64
+ Time-Stamp Counter (TSC) to perform certain time measurements, for example during
+ EXPLAIN ANALYZE. The RDTSC instruction has less overhead than going through the OS
+ clock source, which for an EXPLAIN ANALYZE statement will show timing closer to the
+ actual runtime when timing is off. For timings that require higher precision the
+ RDTSCP instruction is used, which avoids inaccuracies due to CPU instruction re-ordering.
+ Use of RDTSC/RDTSCP is not supported on older CPUs or hypervisors that don't pass the TSC
+ frequency to guest VMs, and is not advised on systems that utilize an emulated TSC.
+
+
+ To help decide which clock source to use on an x86-64 system you can run the
+ pg_test_timing utility to check TSC availability, and
+ perform timing measurements.
+
+
+
+
+
Background Writer
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index a40610bc2522f..f217a72461d62 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -72,7 +72,7 @@ InstrStartNode(Instrumentation *instr)
if (!INSTR_TIME_IS_ZERO(instr->starttime))
elog(ERROR, "InstrStartNode called twice in a row");
else
- INSTR_TIME_SET_CURRENT(instr->starttime);
+ INSTR_TIME_SET_CURRENT_FAST(instr->starttime);
}
/* save buffer usage totals at node entry, if needed */
@@ -99,7 +99,7 @@ InstrStopNode(Instrumentation *instr, double nTuples)
if (INSTR_TIME_IS_ZERO(instr->starttime))
elog(ERROR, "InstrStopNode called without start");
- INSTR_TIME_SET_CURRENT(endtime);
+ INSTR_TIME_SET_CURRENT_FAST(endtime);
INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime);
INSTR_TIME_SET_ZERO(instr->starttime);
@@ -294,3 +294,78 @@ WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub)
dst->wal_fpi_bytes += add->wal_fpi_bytes - sub->wal_fpi_bytes;
dst->wal_buffers_full += add->wal_buffers_full - sub->wal_buffers_full;
}
+
+/* GUC hooks for timing_clock_source */
+
+#include "portability/instr_time.h"
+#include "utils/guc_hooks.h"
+
+/*
+ * GUC check hook for timing_clock_source: rejects "tsc" when no usable TSC
+ * frequency is known; every other value is accepted.
+ *
+ * Nothing is checked while timing is uninitialized. That is only expected in
+ * child processes of EXEC_BACKEND builds, where GUC hooks can run during
+ * InitializeGUCOptions() before InitProcessGlobals() has called
+ * pg_initialize_timing(); there restore_backend_variables sets up the TSC.
+ */
+bool
+check_timing_clock_source(int *newval, void **extra, GucSource source)
+{
+#ifdef EXEC_BACKEND
+	if (!timing_initialized)
+		return true;
+#else
+	Assert(timing_initialized);
+#endif
+
+#if PG_INSTR_TSC_CLOCK
+	pg_initialize_timing_tsc();
+	if (*newval == TIMING_CLOCK_SOURCE_TSC && timing_tsc_frequency_khz <= 0)
+	{
+		GUC_check_errdetail("TSC is not supported as timing clock source.");
+		return false;
+	}
+#endif
+	return true;
+}
+
+void
+assign_timing_clock_source(int newval, void *extra)
+{
+#ifdef EXEC_BACKEND
+ if (!timing_initialized)
+ return;
+#else
+ Assert(timing_initialized);
+#endif
+
+ /*
+ * Apply the new clock source. The return code is ignored since the check
+ * hook already verified TSC is usable if it's explicitly requested.
+ */
+ pg_set_timing_clock_source(newval);
+}
+
+/* GUC show hook: names the configured source and what "auto" resolved to */
+const char *
+show_timing_clock_source(void)
+{
+	if (timing_clock_source == TIMING_CLOCK_SOURCE_SYSTEM)
+		return "system";
+
+#if PG_INSTR_TSC_CLOCK
+	if (timing_clock_source == TIMING_CLOCK_SOURCE_TSC)
+		return "tsc";
+#endif
+
+	if (timing_clock_source == TIMING_CLOCK_SOURCE_AUTO)
+	{
+		/* Spell out which clock the automatic choice ended up using */
+		if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC)
+			return "auto (tsc)";
+		return "auto (system)";
+	}
+	/* unreachable */
+	return "?";
+}
diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c
index 434e06430220e..4883f297a1c4e 100644
--- a/src/backend/postmaster/launch_backend.c
+++ b/src/backend/postmaster/launch_backend.c
@@ -55,6 +55,7 @@
#ifdef EXEC_BACKEND
#include "nodes/queryjumble.h"
+#include "portability/instr_time.h"
#include "storage/pg_shmem.h"
#include "storage/spin.h"
#endif
@@ -127,6 +128,10 @@ typedef struct
int MyPMChildSlot;
+#if PG_INSTR_TSC_CLOCK
+ int32 timing_tsc_frequency_khz;
+#endif
+
/*
* These are only used by backend processes, but are here because passing
* a socket needs some special handling on Windows. 'client_sock' is an
@@ -743,6 +748,10 @@ save_backend_variables(BackendParameters *param,
param->MaxBackends = MaxBackends;
param->num_pmchild_slots = num_pmchild_slots;
+#if PG_INSTR_TSC_CLOCK
+ param->timing_tsc_frequency_khz = timing_tsc_frequency_khz;
+#endif
+
#ifdef WIN32
param->PostmasterHandle = PostmasterHandle;
if (!write_duplicated_handle(&param->initial_signal_pipe,
@@ -997,6 +1006,14 @@ restore_backend_variables(BackendParameters *param)
MaxBackends = param->MaxBackends;
num_pmchild_slots = param->num_pmchild_slots;
+#if PG_INSTR_TSC_CLOCK
+ timing_tsc_frequency_khz = param->timing_tsc_frequency_khz;
+
+ /* Re-run logic usually done by assign_timing_clock_source */
+ pg_initialize_timing();
+ pg_set_timing_clock_source(timing_clock_source);
+#endif
+
#ifdef WIN32
PostmasterHandle = param->PostmasterHandle;
pgwin32_initial_signal_pipe = param->initial_signal_pipe;
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index eb4f3eb72d456..aa6b750d28ce5 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -1937,6 +1937,11 @@ InitProcessGlobals(void)
MyStartTimestamp = GetCurrentTimestamp();
MyStartTime = timestamptz_to_time_t(MyStartTimestamp);
+ /*
+ * Initialize timing infrastructure
+ */
+ pg_initialize_timing();
+
/*
* Set a different global seed in every process. We want something
* unpredictable, so if possible, use high-quality random bits for the
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index a315c4ab8aba2..233b8216677fe 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -3044,6 +3044,17 @@
assign_hook => 'assign_timezone_abbreviations',
},
+{ name => 'timing_clock_source', type => 'enum', context => 'PGC_USERSET', group => 'RESOURCES_TIME',
+ short_desc => 'Controls the clock source used for collecting timing measurements.',
+ long_desc => 'This enables the use of specialized clock sources, specifically the RDTSC clock source on x86-64 systems (if available), to support timing measurements with lower overhead during EXPLAIN and other instrumentation.',
+ variable => 'timing_clock_source',
+ boot_val => 'TIMING_CLOCK_SOURCE_AUTO',
+ options => 'timing_clock_source_options',
+ check_hook => 'check_timing_clock_source',
+ assign_hook => 'assign_timing_clock_source',
+ show_hook => 'show_timing_clock_source',
+},
+
{ name => 'trace_connection_negotiation', type => 'bool', context => 'PGC_POSTMASTER', group => 'DEVELOPER_OPTIONS',
short_desc => 'Logs details of pre-authentication connection handshake.',
flags => 'GUC_NOT_IN_SAMPLE',
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index d9ca13baff97d..9f9d8d17be917 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -92,6 +92,7 @@
#include "tcop/tcopprot.h"
#include "tsearch/ts_cache.h"
#include "utils/builtins.h"
+#include "portability/instr_time.h"
#include "utils/bytea.h"
#include "utils/float.h"
#include "utils/guc_hooks.h"
@@ -373,6 +374,15 @@ static const struct config_enum_entry huge_pages_options[] = {
{NULL, 0, false}
};
+static const struct config_enum_entry timing_clock_source_options[] = {
+ {"auto", TIMING_CLOCK_SOURCE_AUTO, false},
+ {"system", TIMING_CLOCK_SOURCE_SYSTEM, false},
+#if PG_INSTR_TSC_CLOCK
+ {"tsc", TIMING_CLOCK_SOURCE_TSC, false},
+#endif
+ {NULL, 0, false}
+};
+
static const struct config_enum_entry huge_pages_status_options[] = {
{"off", HUGE_PAGES_OFF, false},
{"on", HUGE_PAGES_ON, false},
@@ -731,6 +741,7 @@ const char *const config_group_names[] =
[CONN_AUTH_TCP] = gettext_noop("Connections and Authentication / TCP Settings"),
[CONN_AUTH_AUTH] = gettext_noop("Connections and Authentication / Authentication"),
[CONN_AUTH_SSL] = gettext_noop("Connections and Authentication / SSL"),
+ [RESOURCES_TIME] = gettext_noop("Resource Usage / Time"),
[RESOURCES_MEM] = gettext_noop("Resource Usage / Memory"),
[RESOURCES_DISK] = gettext_noop("Resource Usage / Disk"),
[RESOURCES_KERNEL] = gettext_noop("Resource Usage / Kernel Resources"),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 6d0337853e01b..ae027b2f2ae30 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -196,6 +196,10 @@
#max_files_per_process = 1000 # min 64
# (change requires restart)
+# - Time -
+
+#timing_clock_source = auto # auto, system, tsc (if supported)
+
# - Background Writer -
#bgwriter_delay = 200ms # 10-10000ms between rounds
diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index aee41dbe3f9b7..9be5b09652ac3 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -30,22 +30,29 @@ static long long int largest_diff_count;
static void handle_args(int argc, char *argv[]);
-static uint64 test_timing(unsigned int duration);
+static void test_system_timing(void);
+#if PG_INSTR_TSC_CLOCK
+static void test_tsc_timing(void);
+#endif
+static uint64 test_timing(unsigned int duration, TimingClockSourceType source, bool fast_timing);
static void output(uint64 loop_count);
int
main(int argc, char *argv[])
{
- uint64 loop_count;
-
set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_test_timing"));
progname = get_progname(argv[0]);
handle_args(argc, argv);
- loop_count = test_timing(test_duration);
+ /* initialize timing infrastructure (required for INSTR_* calls) */
+ pg_initialize_timing();
- output(loop_count);
+ test_system_timing();
+
+#if PG_INSTR_TSC_CLOCK
+ test_tsc_timing();
+#endif
return 0;
}
@@ -143,20 +150,92 @@ handle_args(int argc, char *argv[])
exit(1);
}
- printf(ngettext("Testing timing overhead for %u second.\n",
- "Testing timing overhead for %u seconds.\n",
+ printf(ngettext("Testing timing overhead for %u second.\n\n",
+ "Testing timing overhead for %u seconds.\n\n",
test_duration),
test_duration);
}
+/*
+ * This tests default (non-fast) timing code. A clock source for that is
+ * always available. Hence, we can unconditionally output the result.
+ */
+static void
+test_system_timing(void)
+{
+ uint64 loop_count;
+
+ loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_SYSTEM, false);
+ output(loop_count);
+}
+
+/*
+ * If on a supported architecture, test the TSC clock source. This clock
+ * source is not always available. In that case we print an informational
+ * message saying so instead of any measurements.
+ *
+ * We first emit "slow" timings (RDTSCP on x86), which are used for higher
+ * precision measurements when the TSC clock source is enabled. We emit
+ * "fast" timings second (RDTSC on x86), which are used for faster timing
+ * measurements with lower precision.
+ */
+#if PG_INSTR_TSC_CLOCK
+static void
+test_tsc_timing(void)
+{
+	uint64		loop_count;
+
+	printf("\n");
+	loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_TSC, false);
+	if (loop_count > 0)
+	{
+		output(loop_count);
+		printf("\n");
+
+		/* Now, emit fast timing measurements */
+		loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_TSC, true);
+		output(loop_count);
+		printf("\n");
+		/* timing_tsc_frequency_khz is a signed int32, hence %d */
+		printf(_("TSC frequency: %d kHz\n"), timing_tsc_frequency_khz);
+		/* Report what "auto" resolves to on this machine */
+		pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_AUTO);
+		if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC)
+			printf(_("TSC clock source will be used by default, unless timing_clock_source is set to 'system'.\n"));
+		else
+			printf(_("TSC clock source will not be used by default, unless timing_clock_source is set to 'tsc'.\n"));
+	}
+	else
+		printf(_("TSC clock source is not usable. Likely unable to determine TSC frequency. Are you running in an unsupported virtualized environment?\n"));
+}
+#endif
+
static uint64
-test_timing(unsigned int duration)
+test_timing(unsigned int duration, TimingClockSourceType source, bool fast_timing)
{
uint64 loop_count = 0;
instr_time start_time,
end_time,
prev,
cur;
+ char *time_source = NULL;
+
+ if (!pg_set_timing_clock_source(source))
+ return 0;
+
+ time_source = PG_INSTR_SYSTEM_CLOCK_NAME;
+
+#if PG_INSTR_TSC_CLOCK
+ if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC)
+ time_source = fast_timing ? PG_INSTR_TSC_CLOCK_NAME_FAST : PG_INSTR_TSC_CLOCK_NAME;
+#endif
+
+ if (fast_timing)
+ printf(_("Fast clock source: %s\n"), time_source);
+ else if (source == TIMING_CLOCK_SOURCE_SYSTEM)
+ printf(_("System clock source: %s\n"), time_source);
+ else
+ printf(_("Clock source: %s\n"), time_source);
/*
* Pre-zero the statistics data structures. They're already zero by
@@ -181,7 +260,11 @@ test_timing(unsigned int duration)
instr_time diff_time;
prev = cur;
- INSTR_TIME_SET_CURRENT(cur);
+
+ if (fast_timing)
+ INSTR_TIME_SET_CURRENT_FAST(cur);
+ else
+ INSTR_TIME_SET_CURRENT(cur);
diff_time = cur;
INSTR_TIME_SUBTRACT(diff_time, prev);
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 1dae918cc09d2..c969afab3a595 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -6820,6 +6820,9 @@ main(int argc, char **argv)
int exit_code = 0;
struct timeval tv;
+ /* initialize timing infrastructure (required for INSTR_* calls) */
+ pg_initialize_timing();
+
/*
* Record difference between Unix time and instr_time time. We'll use
* this for logging and aggregation.
diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c
index 9a397ec87b736..69d044d405d5b 100644
--- a/src/bin/psql/startup.c
+++ b/src/bin/psql/startup.c
@@ -24,6 +24,7 @@
#include "help.h"
#include "input.h"
#include "mainloop.h"
+#include "portability/instr_time.h"
#include "settings.h"
/*
@@ -327,6 +328,9 @@ main(int argc, char *argv[])
PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL);
+ /* initialize timing infrastructure (required for INSTR_* calls) */
+ pg_initialize_timing();
+
SyncVariables();
if (options.list_dbs)
diff --git a/src/common/Makefile b/src/common/Makefile
index 2c720caa50972..1a2fbbe887f22 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -59,6 +59,7 @@ OBJS_COMMON = \
file_perm.o \
file_utils.o \
hashfn.o \
+ instr_time.o \
ip.o \
jsonapi.o \
keywords.o \
diff --git a/src/common/instr_time.c b/src/common/instr_time.c
new file mode 100644
index 0000000000000..8fcf49023bd6d
--- /dev/null
+++ b/src/common/instr_time.c
@@ -0,0 +1,421 @@
+/*-------------------------------------------------------------------------
+ *
+ * instr_time.c
+ * Non-inline parts of the portable high-precision interval timing
+ * implementation
+ *
+ * Portions Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/common/instr_time.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+#include "port/pg_cpu.h"
+#include "portability/instr_time.h"
+
+/*
+ * Stores what the number of ticks needs to be multiplied with to end up
+ * with nanoseconds using integer math.
+ *
+ * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows)
+ * the ticks to nanoseconds conversion requires floating point math because:
+ *
+ * sec = ticks / frequency_hz
+ * ns = ticks / frequency_hz * 1,000,000,000
+ * ns = ticks * (1,000,000,000 / frequency_hz)
+ * ns = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz
+ *
+ * Here, 'ns' is usually a floating point number. For example for a 2.5 GHz CPU
+ * the scaling factor becomes 1,000,000 / 2,500,000 = 0.4.
+ *
+ * To be able to use integer math we work around the lack of precision. We
+ * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the
+ * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by
+ * the same amount.
+ *
+ * We remember the maximum number of ticks that can be multiplied by the scale
+ * factor without overflowing so we can check via a * b > max <=> a > max / b.
+ *
+ * However, as this is meant for interval measurements, it is unlikely that the
+ * overflow path is actually taken in typical scenarios, since overflows would
+ * only occur for intervals longer than 6.5 days.
+ *
+ * Note we utilize unsigned integers even though ticks are stored as a signed
+ * value to encourage compilers to generate better assembly, since we can be
+ * sure these values are not negative.
+ *
+ * In all other cases we are using clock_gettime(), which uses nanoseconds
+ * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns
+ * to return the original value.
+ */
+uint64 ticks_per_ns_scaled = 0;
+uint64 max_ticks_no_overflow = 0;
+bool timing_initialized = false;
+int timing_clock_source = TIMING_CLOCK_SOURCE_AUTO;
+
+static void set_ticks_per_ns(void);
+static void set_ticks_per_ns_system(void);
+
+#if PG_INSTR_TSC_CLOCK
+static bool tsc_use_by_default(void);
+static void set_ticks_per_ns_for_tsc(void);
+#endif
+
+/*
+ * Initializes timing infrastructure. Must be called before making any use
+ * of INSTR* macros. Calling it again after initialization is a no-op.
+ *
+ * This only loads the ticks-to-nanoseconds conversion factors. TSC detection
+ * is not done here but in pg_initialize_timing_tsc(); on x86-64 that may run
+ * a calibration loop if CPUID did not provide the frequency, which can take
+ * up to TSC_CALIBRATION_MAX_NS.
+ */
+void
+pg_initialize_timing(void)
+{
+ if (timing_initialized)
+ return;
+
+ set_ticks_per_ns();
+ timing_initialized = true;
+}
+
+/* Selects the clock source; returns false if TSC is requested but unusable. */
+bool
+pg_set_timing_clock_source(TimingClockSourceType source)
+{
+	Assert(timing_initialized);
+
+#if PG_INSTR_TSC_CLOCK
+	pg_initialize_timing_tsc();
+
+	/* Tell caller TSC is not usable, leaving the current source in place */
+	if (source == TIMING_CLOCK_SOURCE_TSC && timing_tsc_frequency_khz <= 0)
+		return false;
+
+	/* "auto" uses the TSC only if it is usable and trusted by default */
+	if (source == TIMING_CLOCK_SOURCE_AUTO)
+		timing_tsc_enabled = (timing_tsc_frequency_khz > 0) && tsc_use_by_default();
+	else
+		timing_tsc_enabled = (source == TIMING_CLOCK_SOURCE_TSC);
+#else
+	/* Built without TSC support, so TSC can never be selected */
+	if (source == TIMING_CLOCK_SOURCE_TSC)
+		return false;
+#endif
+
+	/* Reload the ticks-to-ns factors for the source now in effect */
+	set_ticks_per_ns();
+	timing_clock_source = source;
+	return true;
+}
+
+static void
+set_ticks_per_ns(void)
+{
+#if PG_INSTR_TSC_CLOCK
+	if (timing_tsc_enabled)
+	{
+		set_ticks_per_ns_for_tsc();	/* TSC is the active source */
+		return;
+	}
+#endif
+	set_ticks_per_ns_system();	/* otherwise use the OS clock's factors */
+}
+
+#ifndef WIN32
+
+static void
+set_ticks_per_ns_system(void)
+{
+ ticks_per_ns_scaled = 0;
+ max_ticks_no_overflow = 0;
+}
+
+#else /* WIN32 */
+
+/* GetTimerFrequency returns counts per second */
+static inline double
+GetTimerFrequency(void)
+{
+ LARGE_INTEGER f;
+
+ QueryPerformanceFrequency(&f);
+ return (double) f.QuadPart;
+}
+
+static void
+set_ticks_per_ns_system(void)
+{
+ ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency();
+ max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
+}
+
+#endif /* WIN32 */
+
+/* Hardware clock specific logic (x86 TSC / AArch64 CNTVCT) */
+
+#if PG_INSTR_TSC_CLOCK
+
+bool timing_tsc_enabled = false;
+
+int32 timing_tsc_frequency_khz = -1;
+
+static void tsc_detect_frequency(void);
+
+/*
+ * Initialize the TSC clock source by determining its usability and frequency.
+ *
+ * This can be called multiple times, as timing_tsc_frequency_khz will be set to 0
+ * if a prior call determined the TSC is not usable. On EXEC_BACKEND (Windows),
+ * the TSC frequency may also be set by restore_backend_variables.
+ */
+void
+pg_initialize_timing_tsc(void)
+{
+ if (timing_tsc_frequency_khz < 0)
+ {
+ tsc_detect_frequency();
+
+#ifndef FRONTEND
+ elog(DEBUG1, "detected TSC frequency: %d kHz", timing_tsc_frequency_khz);
+#endif
+ }
+}
+
+static void
+set_ticks_per_ns_for_tsc(void)
+{
+ ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / timing_tsc_frequency_khz;
+ max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
+}
+
+#if defined(__x86_64__) || defined(_M_X64)
+
+/*
+ * x86-64 TSC specific logic
+ */
+
+static uint32 tsc_calibrate(void);
+
+/*
+ * Detect the TSC frequency and whether RDTSCP is available on x86-64.
+ *
+ * This can't be reliably determined at compile time, since the
+ * availability of an "invariant" TSC (that is not affected by CPU
+ * frequency changes) is dependent on the CPU architecture. Additionally,
+ * there are cases where TSC availability is impacted by virtualization,
+ * where a simple cpuid feature check would not be enough.
+ */
+static void
+tsc_detect_frequency(void)
+{
+ timing_tsc_frequency_khz = 0;
+
+ /* We require RDTSCP support and an invariant TSC, bail if not available */
+ if (!x86_feature_available(PG_RDTSCP) || !x86_feature_available(PG_TSC_INVARIANT))
+ return;
+
+ /* Determine speed at which the TSC advances */
+ timing_tsc_frequency_khz = x86_tsc_frequency_khz();
+ if (timing_tsc_frequency_khz > 0)
+ return;
+
+ /*
+ * CPUID did not give us the TSC frequency. We can instead measure the
+ * frequency by comparing ticks against walltime in a calibration loop.
+ */
+ timing_tsc_frequency_khz = tsc_calibrate();
+}
+
+/*
+ * Decides whether to use the TSC clock source if the user did not specify it
+ * one way or the other, and it is available (checked separately).
+ *
+ * Mirrors the Linux kernel's clocksource watchdog disable logic as updated in
+ * 2021 to reflect the reliability of the TSC on Intel platforms, see
+ * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion
+ * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/
+ * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/
+ * for reference.
+ *
+ * When the CPU has an invariant TSC (which tsc_detect_frequency() requires)
+ * and the TSC_ADJUST bit set (Intel-only), we consider the TSC trustworthy by
+ * default, matching the Linux kernel.
+ *
+ * On other CPU platforms (e.g. AMD) or in a virtual machine, we don't have an
+ * easy way to determine the TSC's reliability. If on Linux, we can check if
+ * TSC is the active clocksource, based on the kernel having run its watchdog
+ * logic to monitor TSC correctness. For other platforms the user must
+ * explicitly enable it via GUC instead.
+ */
+static bool
+tsc_use_by_default(void)
+{
+ if (x86_feature_available(PG_TSC_ADJUST))
+ return true;
+
+#if defined(__linux__)
+ {
+ FILE *fp;
+ char buf[128];
+
+ fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
+ if (fp)
+ {
+ bool is_tsc = (fgets(buf, sizeof(buf), fp) != NULL &&
+ strcmp(buf, "tsc\n") == 0);
+
+ fclose(fp);
+ if (is_tsc)
+ return true;
+ }
+ }
+#endif
+
+ return false;
+}
+
+/*
+ * Calibrate the TSC frequency by comparing TSC ticks against walltime.
+ *
+ * Takes initial TSC and system clock snapshots, then loops, recomputing the
+ * frequency every TSC_CALIBRATION_SKIPS iterations from cumulative TSC
+ * ticks divided by elapsed time.
+ *
+ * Once TSC_CALIBRATION_STABLE_CYCLES consecutive estimates agree to within
+ * 0.01%, the frequency in kHz is returned. If TSC_CALIBRATION_ITERATIONS or
+ * TSC_CALIBRATION_MAX_NS is exceeded without convergence, 0 is returned.
+ */
+#define TSC_CALIBRATION_MAX_NS (50 * NS_PER_MS)
+#define TSC_CALIBRATION_ITERATIONS 1000000
+#define TSC_CALIBRATION_SKIPS 100
+#define TSC_CALIBRATION_STABLE_CYCLES 10
+
+static uint32
+tsc_calibrate(void)
+{
+ instr_time initial_wall;
+ int64 initial_tsc;
+ double freq_khz = 0;
+ double prev_freq_khz = 0;
+ int stable_count = 0;
+ int64 prev_tsc;
+
+ /* Ensure the INSTR_* calls below operate on the system clock */
+ set_ticks_per_ns_system();
+
+ INSTR_TIME_SET_CURRENT(initial_wall);
+
+ initial_tsc = pg_rdtscp();
+ prev_tsc = initial_tsc;
+
+ for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++)
+ {
+ instr_time now_wall;
+ int64 now_tsc;
+ int64 elapsed_ns;
+ int64 elapsed_ticks;
+
+ INSTR_TIME_SET_CURRENT(now_wall);
+
+ now_tsc = pg_rdtscp();
+
+ INSTR_TIME_SUBTRACT(now_wall, initial_wall);
+ elapsed_ns = INSTR_TIME_GET_NANOSEC(now_wall);
+
+ /* Safety: bail out if we've taken too long */
+ if (elapsed_ns >= TSC_CALIBRATION_MAX_NS)
+ break;
+
+ elapsed_ticks = now_tsc - initial_tsc;
+
+ /*
+ * Skip if this is not the Nth cycle where we measure, if TSC hasn't
+ * advanced, or we walked backwards for some reason.
+ */
+ if (i % TSC_CALIBRATION_SKIPS != 0 || now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0)
+ continue;
+
+ freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000;
+
+ /*
+ * Once the relative change from prev_freq_khz is small, check if it
+ * stays that way. If it does for long enough, we've got a winner.
+ */
+ if (prev_freq_khz != 0 && fabs(1 - freq_khz / prev_freq_khz) < 0.0001)
+ {
+ stable_count++;
+ if (stable_count >= TSC_CALIBRATION_STABLE_CYCLES)
+ return (uint32) freq_khz;
+ }
+ else
+ stable_count = 0;
+
+ prev_tsc = now_tsc;
+ prev_freq_khz = freq_khz;
+ }
+
+ /* did not converge */
+ return 0;
+}
+
+#elif defined(__aarch64__)
+
+/*
+ * Check whether this is a heterogeneous Apple Silicon P+E core system
+ * where CNTVCT_EL0 may tick at different rates on different core types.
+ */
+static bool
+aarch64_has_heterogeneous_cores(void)
+{
+#if defined(__APPLE__)
+ int nperflevels = 0;
+ size_t len = sizeof(nperflevels);
+
+ if (sysctlbyname("hw.nperflevels", &nperflevels, &len, NULL, 0) == 0)
+ return nperflevels > 1;
+#endif
+
+ return false;
+}
+
+/*
+ * Detect the generic timer frequency on AArch64.
+ */
+static void
+tsc_detect_frequency(void)
+{
+ if (aarch64_has_heterogeneous_cores())
+ {
+ timing_tsc_frequency_khz = 0;
+ return;
+ }
+
+ timing_tsc_frequency_khz = aarch64_cntvct_frequency_khz();
+}
+
+/*
+ * The ARM generic timer is architecturally guaranteed to be monotonic and
+ * synchronized across cores of the same type, so we always use it by default
+ * when available and cores are homogeneous.
+ */
+static bool
+tsc_use_by_default(void)
+{
+ return true;
+}
+
+#endif /* defined(__aarch64__) */
+
+#endif /* PG_INSTR_TSC_CLOCK */
diff --git a/src/common/meson.build b/src/common/meson.build
index 4f9b8b8263d55..9bd55cda95b10 100644
--- a/src/common/meson.build
+++ b/src/common/meson.build
@@ -13,6 +13,7 @@ common_sources = files(
'file_perm.c',
'file_utils.c',
'hashfn.c',
+ 'instr_time.c',
'ip.c',
'jsonapi.c',
'keywords.c',
diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h
index c5d96bb4f479f..aee501a4ecdc4 100644
--- a/src/include/port/pg_cpu.h
+++ b/src/include/port/pg_cpu.h
@@ -32,8 +32,16 @@ typedef enum X86FeatureId
PG_AVX512_VL,
PG_AVX512_VPCLMULQDQ,
PG_AVX512_VPOPCNTDQ,
+
+ /* identification */
+ PG_HYPERVISOR,
+
+ /* Time-Stamp Counter (TSC) flags */
+ PG_RDTSCP,
+ PG_TSC_INVARIANT,
+ PG_TSC_ADJUST,
} X86FeatureId;
-#define X86FeaturesSize (PG_AVX512_VPOPCNTDQ + 1)
+#define X86FeaturesSize (PG_TSC_ADJUST + 1)
extern PGDLLIMPORT bool X86Features[];
@@ -48,6 +56,14 @@ x86_feature_available(X86FeatureId feature)
return X86Features[feature];
}
+extern uint32 x86_tsc_frequency_khz(void);
+
#endif /* defined(USE_SSE2) || defined(__i386__) */
+#if defined(__aarch64__)
+
+extern uint32 aarch64_cntvct_frequency_khz(void);
+
+#endif /* defined(__aarch64__) */
+
#endif /* PG_CPU_H */
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index 0a1fff7c487ae..3c9815a3a25eb 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -4,9 +4,11 @@
* portable high-precision interval timing
*
* This file provides an abstraction layer to hide portability issues in
- * interval timing. On Unix we use clock_gettime(), and on Windows we use
- * QueryPerformanceCounter(). These macros also give some breathing room to
- * use other high-precision-timing APIs.
+ * interval timing. On x86 we use the RDTSC/RDTSCP instruction, and on
+ * AArch64 the CNTVCT_EL0 generic timer, directly in certain cases, or
+ * alternatively clock_gettime() on Unix-like systems and
+ * QueryPerformanceCounter() on Windows. These macros also give some breathing
+ * room to use other high-precision-timing APIs.
*
* The basic data type is instr_time, which all callers should treat as an
* opaque typedef. instr_time can store either an absolute time (of
@@ -17,7 +19,11 @@
*
* INSTR_TIME_SET_ZERO(t) set t to zero (memset is acceptable too)
*
- * INSTR_TIME_SET_CURRENT(t) set t to current time
+ * INSTR_TIME_SET_CURRENT_FAST(t) set t to current time without waiting
+ * for instructions in out-of-order window
+ *
+ * INSTR_TIME_SET_CURRENT(t) set t to current time while waiting for
+ * instructions in OOO to retire
*
*
* INSTR_TIME_ADD(x, y) x += y
@@ -80,11 +86,103 @@ typedef struct instr_time
#define NS_PER_MS INT64CONST(1000000)
#define NS_PER_US INT64CONST(1000)
+/* Shift amount for fixed-point ticks-to-nanoseconds conversion. */
+#define TICKS_TO_NS_SHIFT 14
-#ifndef WIN32
+/*
+ * Variables used to translate ticks to nanoseconds, initialized by
+ * pg_initialize_timing and adjusted by pg_set_timing_clock_source calls or
+ * changes of the "timing_clock_source" GUC.
+ *
+ * Note that changing these values after setting an instr_time and before
+ * reading/converting it will lead to incorrect results. This is technically
+ * possible because the GUC can be changed at runtime, but unlikely, and we
+ * allow changing this at runtime to simplify testing of different sources.
+ */
+extern PGDLLIMPORT uint64 ticks_per_ns_scaled;
+extern PGDLLIMPORT uint64 max_ticks_no_overflow;
+extern PGDLLIMPORT bool timing_initialized;
+
+typedef enum
+{
+	TIMING_CLOCK_SOURCE_AUTO,	/* resolved to TSC or system when set */
+	TIMING_CLOCK_SOURCE_SYSTEM, /* OS clock */
+	TIMING_CLOCK_SOURCE_TSC		/* hardware counter, if usable */
+} TimingClockSourceType;
+/* Configured source; PGDLLIMPORT like the other exported variables here */
+extern PGDLLIMPORT int timing_clock_source;
+
+/*
+ * Initialize timing infrastructure
+ *
+ * This must be called at least once before using INSTR_TIME_SET_CURRENT*
+ * macros.
+ *
+ * If you want to use the TSC clock source in a client program you must also
+ * call pg_set_timing_clock_source afterwards.
+ */
+extern void pg_initialize_timing(void);
+
+/*
+ * Sets the time source to be used. Mainly intended for frontend programs,
+ * the backend should set it via the timing_clock_source GUC instead.
+ *
+ * Returns false if the clock source could not be set, for example when TSC
+ * is not available despite being explicitly set.
+ */
+extern bool pg_set_timing_clock_source(TimingClockSourceType source);
+
+#if defined(__x86_64__) || defined(_M_X64)
+#define PG_INSTR_TSC_CLOCK 1
+#define PG_INSTR_TSC_CLOCK_NAME_FAST "RDTSC"
+#define PG_INSTR_TSC_CLOCK_NAME "RDTSCP"
+#define PG_INSTR_TICKS_TO_NS 1
+#elif defined(__aarch64__) && !defined(WIN32)
+#define PG_INSTR_TSC_CLOCK 1
+#define PG_INSTR_TSC_CLOCK_NAME_FAST "CNTVCT_EL0"
+#define PG_INSTR_TSC_CLOCK_NAME "CNTVCT_EL0 (ISB)"
+#define PG_INSTR_TICKS_TO_NS 1
+#elif defined(WIN32)
+#define PG_INSTR_TSC_CLOCK 0
+#define PG_INSTR_TICKS_TO_NS 1
+#else
+#define PG_INSTR_TSC_CLOCK 0
+#define PG_INSTR_TICKS_TO_NS 0
+#endif
+
+#if PG_INSTR_TSC_CLOCK
+/* Whether to actually use TSC based on availability and GUC settings. */
+extern PGDLLIMPORT bool timing_tsc_enabled;
+
+/*
+ * TSC frequency in kHz, set during initialization.
+ *
+ * -1 = not yet initialized, 0 = TSC not usable, >0 = frequency in kHz.
+ */
+extern PGDLLIMPORT int32 timing_tsc_frequency_khz;
+
+extern void pg_initialize_timing_tsc(void);
+
+#endif /* PG_INSTR_TSC_CLOCK */
+
+/*
+ * Reports the clock source actually being read at the moment. The result is
+ * never TIMING_CLOCK_SOURCE_AUTO: it is TIMING_CLOCK_SOURCE_TSC while the TSC
+ * is enabled and TIMING_CLOCK_SOURCE_SYSTEM in every other case.
+ */
+static inline TimingClockSourceType
+pg_current_timing_clock_source(void)
+{
+#if PG_INSTR_TSC_CLOCK
+	if (timing_tsc_enabled)
+		return TIMING_CLOCK_SOURCE_TSC;
+#endif
+	return TIMING_CLOCK_SOURCE_SYSTEM;
+}
+#ifndef WIN32
-/* Use clock_gettime() */
+/* On POSIX, use clock_gettime() for system clock source */
#include <time.h>
@@ -99,76 +197,252 @@ typedef struct instr_time
* than CLOCK_MONOTONIC. In particular, as of macOS 10.12, Apple provides
* CLOCK_MONOTONIC_RAW which is both faster to read and higher resolution than
* their version of CLOCK_MONOTONIC.
+ *
+ * Note this does not get used in case the TSC clock source logic is used,
+ * which directly calls architecture specific timing instructions (e.g. RDTSC).
*/
#if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW)
-#define PG_INSTR_CLOCK CLOCK_MONOTONIC_RAW
+#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC_RAW
+#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC_RAW)"
#elif defined(CLOCK_MONOTONIC)
-#define PG_INSTR_CLOCK CLOCK_MONOTONIC
+#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC
+#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC)"
#else
-#define PG_INSTR_CLOCK CLOCK_REALTIME
+#define PG_INSTR_SYSTEM_CLOCK CLOCK_REALTIME
+#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_REALTIME)"
#endif
-/* helper for INSTR_TIME_SET_CURRENT */
static inline instr_time
-pg_clock_gettime_ns(void)
+pg_get_ticks_system(void) /* POSIX: read PG_INSTR_SYSTEM_CLOCK via clock_gettime() */
{
 instr_time now;
 struct timespec tmp;
- clock_gettime(PG_INSTR_CLOCK, &tmp);
+ Assert(timing_initialized); /* presumably set by pg_initialize_timing() */
+
+ clock_gettime(PG_INSTR_SYSTEM_CLOCK, &tmp);
 now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec; /* seconds + nanoseconds folded into one nanosecond count */
 return now;
}
-#define INSTR_TIME_SET_CURRENT(t) \
- ((t) = pg_clock_gettime_ns())
-
-#define INSTR_TIME_GET_NANOSEC(t) \
- ((int64) (t).ticks)
-
-#define INSTR_TIME_ADD_NANOSEC(t, n) \
- ((t).ticks += (n))
-
-
#else /* WIN32 */
+/* On Windows, use QueryPerformanceCounter() for system clock source */
-/* Use QueryPerformanceCounter() */
-
-/* helper for INSTR_TIME_SET_CURRENT */
+#define PG_INSTR_SYSTEM_CLOCK_NAME "QueryPerformanceCounter"
static inline instr_time
-pg_query_performance_counter(void)
+pg_get_ticks_system(void) /* Windows: read QueryPerformanceCounter() */
{
 instr_time now;
 LARGE_INTEGER tmp;
+ Assert(timing_initialized); /* presumably set by pg_initialize_timing() */
+
 QueryPerformanceCounter(&tmp);
 now.ticks = tmp.QuadPart; /* raw counter value; conversion to ns is left to pg_ticks_to_ns() */
 return now;
}
-static inline double
-GetTimerFrequency(void)
+#endif /* WIN32 */
+
+static inline int64
+pg_ticks_to_ns(int64 ticks)
{
- LARGE_INTEGER f;
+#if PG_INSTR_TICKS_TO_NS
+ int64 ns = 0;
+
+ Assert(timing_initialized);
+
+ /*
+ * A scale factor of zero means ticks are not scaled at all, e.g. system
+ * clock on Unix, so the tick count is returned unchanged.
+ */
+ if (ticks_per_ns_scaled == 0)
+ return ticks;
+
+ /*
+ * Would the multiplication below overflow? If so perform the computation
+ * in two parts.
+ */
+ if (unlikely(ticks > (int64) max_ticks_no_overflow))
+ {
+ /*
+ * To avoid overflow, first scale total ticks down by the fixed
+ * factor (a right shift by TICKS_TO_NS_SHIFT), and *afterwards*
+ * multiply them by the frequency-based scale factor.
+ *
+ * The ticks that were shifted out are handled as a remainder below;
+ * they can follow the regular formula, since they are smaller than
+ * (1 << TICKS_TO_NS_SHIFT).
+ */
+ int64 count = ticks >> TICKS_TO_NS_SHIFT;
+
+ ns = count * ticks_per_ns_scaled;
+ ticks -= (count << TICKS_TO_NS_SHIFT);
+ }
+
+ ns += (ticks * ticks_per_ns_scaled) >> TICKS_TO_NS_SHIFT; /* regular fixed-point formula */
+
+ return ns;
+#else
+ Assert(timing_initialized);
- QueryPerformanceFrequency(&f);
- return (double) f.QuadPart;
+ return ticks; /* no scaling compiled in: returned as-is */
+#endif /* PG_INSTR_TICKS_TO_NS */
}
-#define INSTR_TIME_SET_CURRENT(t) \
- ((t) = pg_query_performance_counter())
+static inline int64
+pg_ns_to_ticks(int64 ns)
+{
+#if PG_INSTR_TICKS_TO_NS
+ int64 ticks = 0;
-#define INSTR_TIME_GET_NANOSEC(t) \
- ((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency())))
+ Assert(timing_initialized);
-#define INSTR_TIME_ADD_NANOSEC(t, n) \
- ((t).ticks += ((n) / ((double) NS_PER_S / GetTimerFrequency())))
+ /*
+ * If ticks_per_ns_scaled is zero, ticks are already in nanoseconds (e.g.
+ * system clock on Unix).
+ */
+ if (ticks_per_ns_scaled == 0)
+ return ns;
-#endif /* WIN32 */
+ /*
+ * The reverse of pg_ticks_to_ns, split the same way to avoid a similar
+ * overflow problem: if (ns << TICKS_TO_NS_SHIFT) would not fit, first
+ * convert the whole multiples of ticks_per_ns_scaled, each of which is
+ * worth (1 << TICKS_TO_NS_SHIFT) ticks, and leave only the remainder for
+ * the regular formula below.
+ */
+ if (unlikely(ns > (INT64_MAX >> TICKS_TO_NS_SHIFT)))
+ {
+ int64 count = ns / ticks_per_ns_scaled;
+
+ ticks = count << TICKS_TO_NS_SHIFT;
+ ns -= count * ticks_per_ns_scaled;
+ }
+
+ ticks += (ns << TICKS_TO_NS_SHIFT) / ticks_per_ns_scaled; /* regular fixed-point formula; truncates */
+
+ return ticks;
+#else
+ Assert(timing_initialized);
+
+ return ns; /* no scaling compiled in: returned as-is */
+#endif /* PG_INSTR_TICKS_TO_NS */
+}
+
+#if PG_INSTR_TSC_CLOCK
+
+#if defined(__x86_64__) || defined(_M_X64)
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif /* defined(_MSC_VER) */
+
+/*
+ * Helpers to abstract compiler differences for reading the x86 TSC.
+ *
+ * pg_rdtsc() reads the counter through the compiler's RDTSC intrinsic.
+ */
+static inline int64
+pg_rdtsc(void)
+{
+	int64		tsc;
+
+#ifdef _MSC_VER
+	tsc = __rdtsc();
+#else
+	tsc = __builtin_ia32_rdtsc();
+#endif							/* defined(_MSC_VER) */
+
+	return tsc;
+}
+
+/*
+ * Read the x86 TSC through the compiler's RDTSCP intrinsic.  The intrinsic
+ * also stores a value through its pointer argument; that value is not needed
+ * here and is discarded.
+ */
+static inline int64
+pg_rdtscp(void)
+{
+	uint32		aux;
+	int64		tsc;
+
+#ifdef _MSC_VER
+	tsc = __rdtscp(&aux);
+#else
+	tsc = __builtin_ia32_rdtscp(&aux);
+#endif							/* defined(_MSC_VER) */
+
+	return tsc;
+}
+
+/*
+ * Fast timestamp read: uses plain RDTSC when the TSC clock source is enabled,
+ * and falls back to the system clock otherwise.
+ */
+static inline instr_time
+pg_get_ticks_fast(void)
+{
+	instr_time	result;
+
+	if (unlikely(!timing_tsc_enabled))
+		return pg_get_ticks_system();
+
+	result.ticks = pg_rdtsc();
+	return result;
+}
+
+/*
+ * Regular timestamp read: uses RDTSCP when the TSC clock source is enabled,
+ * and falls back to the system clock otherwise.
+ */
+static inline instr_time
+pg_get_ticks(void)
+{
+	instr_time	result;
+
+	if (unlikely(!timing_tsc_enabled))
+		return pg_get_ticks_system();
+
+	result.ticks = pg_rdtscp();
+	return result;
+}
+
+#elif defined(__aarch64__) && !defined(WIN32)
+
+/*
+ * Timestamp helpers built on the ARM generic timer counter (CNTVCT_EL0).
+ *
+ * pg_get_ticks_fast() reads the counter with no barrier, which corresponds to
+ * RDTSC on x86.  pg_get_ticks() executes an ISB (Instruction Synchronization
+ * Barrier) before the read; like RDTSCP, that acts as a serializing
+ * instruction, so all preceding instructions have completed before the
+ * counter is read.
+ *
+ * Both fall back to the system clock when the TSC clock source is disabled.
+ */
+static inline instr_time
+pg_get_ticks_fast(void)
+{
+	instr_time	result;
+
+	if (unlikely(!timing_tsc_enabled))
+		return pg_get_ticks_system();
+
+	result.ticks = __builtin_arm_rsr64("cntvct_el0");
+	return result;
+}
+
+/*
+ * Regular timestamp read on AArch64: issue an ISB, then read CNTVCT_EL0.
+ * Falls back to the system clock when the TSC clock source is disabled.
+ */
+static inline instr_time
+pg_get_ticks(void)
+{
+	instr_time	result;
+
+	if (unlikely(!timing_tsc_enabled))
+		return pg_get_ticks_system();
+
+	/* The barrier must come before the counter read, not after it. */
+	__builtin_arm_isb(0xf);
+	result.ticks = __builtin_arm_rsr64("cntvct_el0");
+	return result;
+}
+
+#endif /* defined(__aarch64__) */
+
+#else /* !PG_INSTR_TSC_CLOCK */
+
+/* Without TSC support compiled in, the fast read is just the system clock. */
+static inline instr_time
+pg_get_ticks_fast(void)
+{
+	instr_time	result = pg_get_ticks_system();
+
+	return result;
+}
+
+/* Without TSC support compiled in, the regular read is just the system clock. */
+static inline instr_time
+pg_get_ticks(void)
+{
+	instr_time	result = pg_get_ticks_system();
+
+	return result;
+}
+
+#endif /* PG_INSTR_TSC_CLOCK */
/*
* Common macros
@@ -178,10 +452,19 @@ GetTimerFrequency(void)
#define INSTR_TIME_SET_ZERO(t) ((t).ticks = 0)
+#define INSTR_TIME_SET_CURRENT_FAST(t) \
+ ((t) = pg_get_ticks_fast())
+
+#define INSTR_TIME_SET_CURRENT(t) \
+ ((t) = pg_get_ticks())
+
#define INSTR_TIME_ADD(x,y) \
((x).ticks += (y).ticks)
+#define INSTR_TIME_ADD_NANOSEC(t, n) \
+ ((t).ticks += pg_ns_to_ticks(n))
+
#define INSTR_TIME_SUBTRACT(x,y) \
((x).ticks -= (y).ticks)
@@ -191,6 +474,9 @@ GetTimerFrequency(void)
#define INSTR_TIME_GT(x,y) \
((x).ticks > (y).ticks)
+#define INSTR_TIME_GET_NANOSEC(t) \
+ (pg_ticks_to_ns((t).ticks))
+
#define INSTR_TIME_GET_DOUBLE(t) \
((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S)
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index b01697c1f606d..307f4fbaefe08 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -163,6 +163,9 @@ extern const char *show_timezone(void);
extern bool check_timezone_abbreviations(char **newval, void **extra,
GucSource source);
extern void assign_timezone_abbreviations(const char *newval, void *extra);
+extern void assign_timing_clock_source(int newval, void *extra);
+extern bool check_timing_clock_source(int *newval, void **extra, GucSource source);
+extern const char *show_timing_clock_source(void);
extern bool check_transaction_buffers(int *newval, void **extra, GucSource source);
extern bool check_transaction_deferrable(bool *newval, void **extra, GucSource source);
extern bool check_transaction_isolation(int *newval, void **extra, GucSource source);
diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h
index 71a8016196138..63440b8e36c83 100644
--- a/src/include/utils/guc_tables.h
+++ b/src/include/utils/guc_tables.h
@@ -60,6 +60,7 @@ enum config_group
CONN_AUTH_TCP,
CONN_AUTH_AUTH,
CONN_AUTH_SSL,
+ RESOURCES_TIME,
RESOURCES_MEM,
RESOURCES_DISK,
RESOURCES_KERNEL,
diff --git a/src/port/meson.build b/src/port/meson.build
index 922b3f646768d..d695f92b769e1 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -7,6 +7,7 @@ pgport_sources = [
'noblock.c',
'path.c',
'pg_bitutils.c',
+ 'pg_cpu_arm.c',
'pg_cpu_x86.c',
'pg_getopt_ctx.c',
'pg_localeconv_r.c',
diff --git a/src/port/pg_cpu_arm.c b/src/port/pg_cpu_arm.c
new file mode 100644
index 0000000000000..6fd9dd892ec98
--- /dev/null
+++ b/src/port/pg_cpu_arm.c
@@ -0,0 +1,45 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_cpu_arm.c
+ * Runtime CPU feature detection for AArch64
+ *
+ * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/port/pg_cpu_arm.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#if defined(__aarch64__) && !defined(WIN32)
+
+#include "port/pg_cpu.h"
+
+/*
+ * Report the frequency of the ARM generic timer (CNTVCT_EL0) in kHz.
+ *
+ * The frequency is taken from the CNTFRQ_EL0 system register, which is
+ * architecturally guaranteed to be readable from EL0 (userspace) and holds
+ * the timer frequency in Hz.  Firmware programs it at boot and it does not
+ * change afterwards.
+ *
+ * A result of 0 means the frequency is not available, which should not happen
+ * on conforming implementations.
+ */
+uint32
+aarch64_cntvct_frequency_khz(void)
+{
+	uint64		hz = __builtin_arm_rsr64("cntfrq_el0");
+
+	/* Hz -> kHz; an unset (zero) register is passed on as 0. */
+	return (hz != 0) ? (uint32) (hz / 1000) : 0;
+}
+
+#endif /* defined(__aarch64__) */
diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c
index 40ff78633ca3f..f076db0ff1c38 100644
--- a/src/port/pg_cpu_x86.c
+++ b/src/port/pg_cpu_x86.c
@@ -101,19 +101,24 @@ void
set_x86_features(void)
{
unsigned int reg[4] = {0};
+ bool have_osxsave;
pg_cpuid(0x01, reg);
X86Features[PG_SSE4_2] = reg[ECX] >> 20 & 1;
X86Features[PG_POPCNT] = reg[ECX] >> 23 & 1;
+ X86Features[PG_HYPERVISOR] = reg[ECX] >> 31 & 1;
+ have_osxsave = reg[ECX] >> 27 & 1;
+
+ pg_cpuid_subleaf(0x07, 0, reg);
+
+ X86Features[PG_TSC_ADJUST] = reg[EBX] >> 1 & 1;
/* leaf 7 features that depend on OSXSAVE */
- if (reg[ECX] & (1 << 27))
+ if (have_osxsave)
{
uint32 xcr0_val = 0;
- pg_cpuid_subleaf(0x07, 0, reg);
-
#ifdef HAVE_XSAVE_INTRINSICS
/* get value of Extended Control Register */
xcr0_val = _xgetbv(0);
@@ -135,7 +140,121 @@ set_x86_features(void)
}
}
+ /* Check for other TSC related flags */
+ pg_cpuid(0x80000001, reg);
+ X86Features[PG_RDTSCP] = reg[EDX] >> 27 & 1;
+
+ pg_cpuid(0x80000007, reg);
+ X86Features[PG_TSC_INVARIANT] = reg[EDX] >> 8 & 1;
+
X86Features[INIT_PG_X86] = true;
}
+/* TSC (Time-stamp Counter) handling code */
+
+static uint32 x86_hypervisor_tsc_frequency_khz(void);
+
+/*
+ * Determine the TSC frequency of the CPU through CPUID, where supported.
+ *
+ * Needed to interpret the tick value returned by RDTSC/RDTSCP. Return value of
+ * 0 indicates the frequency information was not accessible via CPUID.
+ *
+ * When running under a hypervisor, only the hypervisor-specific lookup is
+ * attempted; the CPUID.15H/16H leaves below are not consulted in that case.
+ */
+uint32
+x86_tsc_frequency_khz(void)
+{
+	unsigned int reg[4] = {0};
+
+	if (x86_feature_available(PG_HYPERVISOR))
+		return x86_hypervisor_tsc_frequency_khz();
+
+	/*
+	 * On modern Intel CPUs, the TSC is implemented by invariant timekeeping
+	 * hardware, also called "Always Running Timer", or ART. The ART stays
+	 * consistent even if the CPU changes frequency due to changing power
+	 * levels.
+	 *
+	 * As documented in "Determining the Processor Base Frequency" in the
+	 * "Intel(R) 64 and IA-32 Architectures Software Developer's Manual",
+	 * February 2026 Edition, we can get the TSC frequency as follows:
+	 *
+	 * Nominal TSC frequency = ( CPUID.15H:ECX[31:0] * CPUID.15H:EBX[31:0] ) /
+	 * CPUID.15H:EAX[31:0]
+	 *
+	 * With CPUID.15H:ECX representing the nominal core crystal clock
+	 * frequency, and EAX/EBX representing values used to translate the TSC
+	 * value to that frequency, see Chapter 20.17 "Time-Stamp Counter" of
+	 * that manual.
+	 *
+	 * Older Intel CPUs, and other vendors do not set CPUID.15H:ECX, and as
+	 * such we fall back to alternate approaches.
+	 */
+	pg_cpuid(0x15, reg);
+	if (reg[ECX] > 0)
+	{
+		uint64		khz;
+
+		/*
+		 * EBX not being set indicates invariant TSC is not available. Require
+		 * EAX being non-zero too, to avoid a theoretical divide by zero.
+		 */
+		if (reg[EAX] == 0 || reg[EBX] == 0)
+			return 0;
+
+		/*
+		 * Follow the formula above in 64-bit arithmetic: multiply before
+		 * dividing so no precision is lost, and so the ECX * EBX product
+		 * cannot wrap around in 32 bits. Convert Hz to kHz last.
+		 */
+		khz = (uint64) reg[ECX] * reg[EBX] / reg[EAX] / 1000;
+
+		/* Treat a result that does not fit the return type as unusable. */
+		if (khz > PG_UINT32_MAX)
+			return 0;
+
+		return (uint32) khz;
+	}
+
+	/*
+	 * When CPUID.15H is not available/incomplete, we can instead try to get
+	 * the processor base frequency in MHz from CPUID.16H:EAX, the "Processor
+	 * Frequency Information Leaf".
+	 */
+	pg_cpuid(0x16, reg);
+	if (reg[EAX] > 0)
+		return reg[EAX] * 1000;	/* MHz -> kHz */
+
+	return 0;
+}
+
+/*
+ * Support for reading TSC frequency for hypervisors passing it to a guest VM.
+ *
+ * Two hypervisors (VMware and KVM) are known to make TSC frequency in kHz
+ * available at the vendor-specific 0x40000010 leaf in the EAX register.
+ *
+ * For some other hypervisors that have an invariant TSC, e.g. HyperV, we would
+ * need to access a model-specific register (MSR) to get the frequency. MSRs are
+ * separate from CPUID and typically not available for unprivileged processes,
+ * so we can't get the frequency this way.
+ *
+ * Returns the frequency in kHz, or 0 if the hypervisor is not one of the two
+ * recognized above, does not report the 0x40000010 leaf, reports a zero
+ * frequency, or if __cpuidex is not available in this build.
+ */
+#define CPUID_HYPERVISOR_VMWARE(r) (r[EBX] == 0x61774d56 && r[ECX] == 0x4d566572 && r[EDX] == 0x65726177) /* VMwareVMware */
+#define CPUID_HYPERVISOR_KVM(r) (r[EBX] == 0x4b4d564b && r[ECX] == 0x564b4d56 && r[EDX] == 0x0000004d) /* KVMKVMKVM */
+static uint32
+x86_hypervisor_tsc_frequency_khz(void)
+{
+ unsigned int reg[4] = {0};
+
+#if defined(HAVE__CPUIDEX)
+
+ /*
+ * The hypervisor is determined using the 0x40000000 Hypervisor
+ * information leaf, which requires use of __cpuidex to set ECX to 0 to
+ * access it.
+ *
+ * The similar __get_cpuid_count function does not work as expected since
+ * it contains a check for __get_cpuid_max, which has been observed to be
+ * lower than the special Hypervisor leaf, despite it being available.
+ */
+ __cpuidex((int *) reg, 0x40000000, 0);
+
+ if (reg[EAX] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(reg) || CPUID_HYPERVISOR_KVM(reg))) /* leaf 0x40000000 EAX is compared against the leaf number we want to query */
+ {
+ __cpuidex((int *) reg, 0x40000010, 0);
+ if (reg[EAX] > 0)
+ return reg[EAX];
+ }
+#endif /* HAVE__CPUIDEX */
+
+ return 0;
+}
+
+
#endif /* defined(USE_SSE2) || defined(__i386__) */
diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out
index cf55cdf3688d9..c3261bff209fb 100644
--- a/src/test/regress/expected/misc_functions.out
+++ b/src/test/regress/expected/misc_functions.out
@@ -850,3 +850,14 @@ SELECT oldest_multixact IS NULL AS null_result FROM pg_get_multixact_stats();
RESET ROLE;
DROP ROLE regress_multixact_funcs;
+-- test instr_time nanosecond<->ticks conversion
+CREATE FUNCTION test_instr_time()
+ RETURNS bool
+ AS :'regresslib'
+ LANGUAGE C;
+SELECT test_instr_time();
+ test_instr_time
+-----------------
+ t
+(1 row)
+
diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c
index 9a918156437b2..0c0620569829b 100644
--- a/src/test/regress/pg_regress.c
+++ b/src/test/regress/pg_regress.c
@@ -2181,6 +2181,8 @@ regression_main(int argc, char *argv[],
progname = get_progname(argv[0]);
set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_regress"));
+ pg_initialize_timing();
+
get_restricted_token();
atexit(stop_postmaster);
diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c
index 68a01a1dde014..c2eaa96f08605 100644
--- a/src/test/regress/regress.c
+++ b/src/test/regress/regress.c
@@ -38,6 +38,7 @@
#include "optimizer/plancat.h"
#include "parser/parse_coerce.h"
#include "port/atomics.h"
+#include "portability/instr_time.h"
#include "postmaster/postmaster.h" /* for MAX_BACKENDS */
#include "storage/spin.h"
#include "tcop/tcopprot.h"
@@ -1384,3 +1385,38 @@ test_translation(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
+
+/*
+ * Verify that pg_ticks_to_ns behaves correctly, including overflow, by
+ * converting nanoseconds to ticks and back through INSTR_TIME_ADD_NANOSEC and
+ * INSTR_TIME_GET_NANOSEC. Raises an ERROR on mismatch, otherwise returns true.
+ */
+PG_FUNCTION_INFO_V1(test_instr_time);
+Datum
+test_instr_time(PG_FUNCTION_ARGS)
+{
+ instr_time t;
+ int64 test_ns[] = {0, 1000, INT64CONST(1000000000000000)};
+ int64 max_err;
+
+ /*
+ * The ns-to-ticks-to-ns roundtrip may lose precision due to integer
+ * truncation in the fixed-point conversion. The maximum error depends on
+ * ticks_per_ns_scaled relative to the shift factor.
+ */
+ max_err = (ticks_per_ns_scaled >> TICKS_TO_NS_SHIFT) + 1;
+
+ for (int i = 0; i < lengthof(test_ns); i++)
+ {
+ int64 result;
+
+ INSTR_TIME_SET_ZERO(t);
+ INSTR_TIME_ADD_NANOSEC(t, test_ns[i]);
+ result = INSTR_TIME_GET_NANOSEC(t);
+
+ if (result < test_ns[i] - max_err || result > test_ns[i]) /* may fall short by up to max_err, but must never exceed the input */
+ elog(ERROR,
+ "INSTR_TIME_GET_NANOSEC(t) yielded " INT64_FORMAT
+ ", expected " INT64_FORMAT " (max_err " INT64_FORMAT
+ ") in file \"%s\" line %u",
+ result, test_ns[i], max_err, __FILE__, __LINE__);
+ }
+
+ PG_RETURN_BOOL(true);
+}
diff --git a/src/test/regress/sql/misc_functions.sql b/src/test/regress/sql/misc_functions.sql
index c8226652f2c94..946ee5726cdd7 100644
--- a/src/test/regress/sql/misc_functions.sql
+++ b/src/test/regress/sql/misc_functions.sql
@@ -349,3 +349,10 @@ SET ROLE regress_multixact_funcs;
SELECT oldest_multixact IS NULL AS null_result FROM pg_get_multixact_stats();
RESET ROLE;
DROP ROLE regress_multixact_funcs;
+
+-- test instr_time nanosecond<->ticks conversion
+CREATE FUNCTION test_instr_time()
+ RETURNS bool
+ AS :'regresslib'
+ LANGUAGE C;
+SELECT test_instr_time();
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index c72f6c595730a..d609599f419a7 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3170,6 +3170,7 @@ TimeoutId
TimeoutType
Timestamp
TimestampTz
+TimingClockSourceType
TmFromChar
TmToChar
ToastAttrInfo