Skip to content

Commit acae748

Browse files
committed
instrumentation: ARM support for fast time measurements
Similar to the RDTSC/RDTSCP instructions on x86-64, this introduces use of the CNTVCT_EL0 system register on ARM systems to access the generic timer, which provides a synchronized ticks value across CPUs. Note that this adds an exception for Apple Silicon CPUs, because M3 and newer chips have been observed to have different timer frequencies for the Efficiency and the Performance cores, and we can't be sure which core type we get scheduled on. To simplify the implementation, this does not support Windows on ARM, since it is quite rare and hard to test. Relies on the existing timing_clock_source GUC to control whether the TSC-like timer gets used instead of the system timer. Author: Lukas Fittl <lukas@fittl.com> Reviewed-by: Discussion:
1 parent 514cd2b commit acae748

5 files changed

Lines changed: 166 additions & 8 deletions

File tree

src/common/instr_time.c

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@
2020
#include <unistd.h>
2121
#endif
2222

23+
#if defined(__APPLE__)
24+
#include <sys/sysctl.h>
25+
#endif
26+
2327
#include "port/pg_cpu.h"
2428
#include "portability/instr_time.h"
2529

@@ -162,14 +166,27 @@ set_ticks_per_ns()
162166
#endif
163167
}
164168

165-
/* TSC specific logic */
169+
/* Hardware clock specific logic (x86 TSC / AArch64 CNTVCT) */
166170

167171
#if PG_INSTR_TSC_CLOCK
168172

169173
bool use_tsc = false;
170174

171175
static uint32 tsc_frequency_khz = 0;
172176

177+
static void
178+
set_ticks_per_ns_for_tsc(void)
179+
{
180+
ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / tsc_frequency_khz;
181+
max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
182+
}
183+
184+
#if defined(__x86_64__) || defined(_M_X64)
185+
186+
/*
187+
* x86-64 TSC specific logic
188+
*/
189+
173190
static uint32 tsc_calibrate(void);
174191

175192
/*
@@ -363,11 +380,51 @@ tsc_calibrate(void)
363380
return 0;
364381
}
365382

383+
#elif defined(__aarch64__)
384+
385+
/*
386+
* Check whether this is a heterogeneous Apple Silicon P+E core system
387+
* where CNTVCT_EL0 may tick at different rates on different core types.
388+
*/
389+
static bool
aarch64_has_heterogeneous_cores(void)
{
#if !defined(__APPLE__)
	/* No detection is attempted on non-Apple platforms. */
	return false;
#else
	int			perf_level_count = 0;
	size_t		value_size = sizeof(perf_level_count);
	int			rc;

	/* Ask the kernel how many distinct performance levels the CPU has. */
	rc = sysctlbyname("hw.nperflevels", &perf_level_count, &value_size,
					  NULL, 0);

	/* A failed lookup is treated like a single performance level. */
	if (rc != 0)
		return false;

	return perf_level_count > 1;
#endif
}
402+
403+
/*
404+
* Initialize the AArch64 generic timer as a clock source.
*
* Returns without changing any state when
* aarch64_has_heterogeneous_cores() reports true. Otherwise the result of
* aarch64_cntvct_frequency_khz() is stored in tsc_frequency_khz, and
* has_usable_tsc is set to true only if that frequency is non-zero.
*
* allow_tsc_calibration is not referenced in this AArch64 variant.
405+
*/
366406
static void
367-
set_ticks_per_ns_for_tsc(void)
407+
tsc_initialize(bool allow_tsc_calibration)
368408
{
369-
ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / tsc_frequency_khz;
370-
max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
409+
if (aarch64_has_heterogeneous_cores())
410+
return;
411+
412+
tsc_frequency_khz = aarch64_cntvct_frequency_khz();
413+
if (tsc_frequency_khz != 0)
414+
has_usable_tsc = true;
415+
}
416+
417+
/*
 * Should the hardware clock be used by default?
 *
 * On AArch64 the answer is always yes.  The ARM generic timer is
 * architecturally guaranteed to be monotonic and synchronized across cores
 * of the same type, so it is used by default whenever it is available and
 * the cores are homogeneous.
 */
static bool
tsc_use_by_default(void)
{
	return true;
}
372427

428+
#endif /* defined(__aarch64__) */
429+
373430
#endif /* PG_INSTR_TSC_CLOCK */

src/include/port/pg_cpu.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,4 +56,10 @@ extern uint32 x86_tsc_frequency_khz(void);
5656

5757
#endif /* defined(USE_SSE2) || defined(__i386__) */
5858

59+
#if defined(__aarch64__)
60+
61+
extern uint32 aarch64_cntvct_frequency_khz(void);
62+
63+
#endif /* defined(__aarch64__) */
64+
5965
#endif /* PG_CPU_H */

src/include/portability/instr_time.h

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
* portable high-precision interval timing
55
*
66
* This file provides an abstraction layer to hide portability issues in
7-
* interval timing. On x86 we use the RDTSC/RDTSCP instruction directly in
8-
* certain cases, or alternatively clock_gettime() on Unix-like systems and
7+
* interval timing. On x86 we use the RDTSC/RDTSCP instruction, and on
8+
* AArch64 the CNTVCT_EL0 generic timer, directly in certain cases, or
9+
* alternatively clock_gettime() on Unix-like systems and
910
* QueryPerformanceCounter() on Windows. These macros also give some breathing
1011
* room to use other high-precision-timing APIs.
1112
*
@@ -126,6 +127,11 @@ extern bool pg_set_timing_clock_source(TimingClockSourceType source);
126127
#define PG_INSTR_TSC_CLOCK_NAME_FAST "RDTSC"
127128
#define PG_INSTR_TSC_CLOCK_NAME "RDTSCP"
128129
#define PG_INSTR_TICKS_TO_NS 1
130+
#elif defined(__aarch64__) && !defined(WIN32)
131+
#define PG_INSTR_TSC_CLOCK 1
132+
#define PG_INSTR_TSC_CLOCK_NAME_FAST "CNTVCT_EL0"
133+
#define PG_INSTR_TSC_CLOCK_NAME "CNTVCT_EL0 (ISB)"
134+
#define PG_INSTR_TICKS_TO_NS 1
129135
#elif defined(WIN32)
130136
#define PG_INSTR_TSC_CLOCK 0
131137
#define PG_INSTR_TICKS_TO_NS 1
@@ -134,7 +140,6 @@ extern bool pg_set_timing_clock_source(TimingClockSourceType source);
134140
#define PG_INSTR_TICKS_TO_NS 0
135141
#endif
136142

137-
138143
#if PG_INSTR_TSC_CLOCK
139144
/* Whether the hardware TSC clock is available and usable. */
140145
extern PGDLLIMPORT bool has_usable_tsc;
@@ -264,6 +269,8 @@ pg_ticks_to_ns(int64 ticks)
264269

265270
#if PG_INSTR_TSC_CLOCK
266271

272+
#if defined(__x86_64__) || defined(_M_X64)
273+
267274
#ifdef _MSC_VER
268275
#include <intrin.h>
269276
#endif /* defined(_MSC_VER) */
@@ -306,7 +313,49 @@ pg_get_ticks(void)
306313
return pg_get_ticks_system();
307314
}
308315

309-
#else
316+
#elif defined(__aarch64__) && !defined(WIN32)
317+
318+
/*
319+
* Read the ARM generic timer counter (CNTVCT_EL0).
320+
*
321+
* The "fast" variant reads the counter without a barrier, analogous to RDTSC
322+
* on x86. The regular variant issues an ISB (Instruction Synchronization
323+
* Barrier) first, which acts as a serializing instruction analogous to RDTSCP,
324+
* ensuring all preceding instructions have completed before reading the
325+
* counter.
326+
*/
327+
static inline instr_time
328+
pg_get_ticks_fast(void)
329+
{
330+
if (likely(use_tsc))
331+
{
332+
instr_time now;
333+
334+
now.ticks = __builtin_arm_rsr64("cntvct_el0");
335+
return now;
336+
}
337+
338+
return pg_get_ticks_system();
339+
}
340+
341+
static inline instr_time
342+
pg_get_ticks(void)
343+
{
344+
if (likely(use_tsc))
345+
{
346+
instr_time now;
347+
348+
__builtin_arm_isb(0xf);
349+
now.ticks = __builtin_arm_rsr64("cntvct_el0");
350+
return now;
351+
}
352+
353+
return pg_get_ticks_system();
354+
}
355+
356+
#endif /* defined(__aarch64__) */
357+
358+
#else /* !PG_INSTR_TSC_CLOCK */
310359

311360
static inline instr_time
312361
pg_get_ticks_fast(void)

src/port/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ pgport_sources = [
77
'noblock.c',
88
'path.c',
99
'pg_bitutils.c',
10+
'pg_cpu_arm.c',
1011
'pg_cpu_x86.c',
1112
'pg_localeconv_r.c',
1213
'pg_numa.c',

src/port/pg_cpu_arm.c

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/*-------------------------------------------------------------------------
2+
*
3+
* pg_cpu_arm.c
4+
* Runtime CPU feature detection for AArch64
5+
*
6+
* Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7+
* Portions Copyright (c) 1994, Regents of the University of California
8+
*
9+
*
10+
* IDENTIFICATION
11+
* src/port/pg_cpu_arm.c
12+
*
13+
*-------------------------------------------------------------------------
14+
*/
15+
16+
#include "c.h"
17+
18+
#if defined(__aarch64__) && !defined(WIN32)
19+
20+
#include "port/pg_cpu.h"
21+
22+
/*
23+
* Return the frequency of the ARM generic timer (CNTVCT_EL0) in kHz.
24+
*
25+
* The CNTFRQ_EL0 system register is architecturally guaranteed to be readable
26+
* from EL0 (userspace) and holds the timer frequency in Hz. The firmware sets
27+
* this at boot and it does not change.
28+
*
29+
* Returns 0 if the frequency is not available (should not happen on conforming
30+
* implementations).
31+
*/
32+
uint32
33+
aarch64_cntvct_frequency_khz(void)
34+
{
35+
uint64 freq;
36+
37+
freq = __builtin_arm_rsr64("cntfrq_el0");
38+
39+
if (freq == 0)
40+
return 0;
41+
42+
return (uint32) (freq / 1000);
43+
}
44+
45+
#endif /* defined(__aarch64__) */

0 commit comments

Comments
 (0)