Skip to content

Commit f4b13ad

Browse files
committed
instrumentation: ARM support for fast time measurements
Similar to the RDTSC/RDTSCP instructions on x86-64, this introduces use of the CNTVCT_EL0 register on ARM systems to access the generic timer, which provides a synchronized tick value across CPUs. Note this adds an exception for Apple Silicon CPUs, due to the observed fact that M3 and newer have different timer frequencies for the Efficiency and the Performance cores, and we can't be sure where we get scheduled. To simplify the implementation this does not support Windows on ARM, since it's quite rare and hard to test. Relies on the existing timing_clock_source GUC to control whether the TSC-like timer gets used instead of the system timer. Author: Lukas Fittl <lukas@fittl.com> Reviewed-by: Discussion:
1 parent 9fa050a commit f4b13ad

5 files changed

Lines changed: 167 additions & 8 deletions

File tree

src/common/instr_time.c

Lines changed: 62 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@
2020
#include <unistd.h>
2121
#endif
2222

23+
#if defined(__APPLE__)
24+
#include <sys/sysctl.h>
25+
#endif
26+
2327
#include "port/pg_cpu.h"
2428
#include "portability/instr_time.h"
2529

@@ -157,14 +161,27 @@ set_ticks_per_ns(void)
157161
#endif
158162
}
159163

160-
/* TSC specific logic */
164+
/* Hardware clock specific logic (x86 TSC / AArch64 CNTVCT) */
161165

162166
#if PG_INSTR_TSC_CLOCK
163167

164168
bool use_tsc = false;
165169

166170
int32 tsc_frequency_khz = -1;
167171

172+
static void
173+
set_ticks_per_ns_for_tsc(void)
174+
{
175+
ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / tsc_frequency_khz;
176+
max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
177+
}
178+
179+
#if defined(__x86_64__) || defined(_M_X64)
180+
181+
/*
182+
* x86-64 TSC specific logic
183+
*/
184+
168185
static uint32 tsc_calibrate(void);
169186

170187
/*
@@ -355,13 +372,54 @@ tsc_calibrate(void)
355372
return 0;
356373
}
357374

375+
#elif defined(__aarch64__)
376+
377+
/*
 * Report whether we are running on a heterogeneous Apple Silicon system
 * (performance plus efficiency cores), where CNTVCT_EL0 may tick at different
 * rates depending on the core type.
 *
 * On Apple platforms this asks the "hw.nperflevels" sysctl how many
 * performance levels exist and reports true when there is more than one.  If
 * the sysctl call fails, or on any non-Apple platform, the answer is false.
 */
static bool
aarch64_has_heterogeneous_cores(void)
{
	bool		heterogeneous = false;

#if defined(__APPLE__)
	int			perf_levels = 0;
	size_t		perf_levels_size = sizeof(perf_levels);

	if (sysctlbyname("hw.nperflevels", &perf_levels, &perf_levels_size,
					 NULL, 0) == 0 &&
		perf_levels > 1)
		heterogeneous = true;
#endif

	return heterogeneous;
}
394+
395+
/*
396+
* Detect the generic timer frequency on AArch64.
397+
*/
358398
static void
359-
set_ticks_per_ns_for_tsc(void)
399+
tsc_detect_frequency(void)
360400
{
361-
ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / tsc_frequency_khz;
362-
max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
401+
if (aarch64_has_heterogeneous_cores())
402+
{
403+
tsc_frequency_khz = 0;
404+
return;
405+
}
406+
407+
tsc_frequency_khz = aarch64_cntvct_frequency_khz();
408+
}
409+
410+
/*
 * Decide whether the hardware counter should be the default clock source.
 *
 * The ARM generic timer is architecturally guaranteed to be monotonic and
 * synchronized across cores of the same type, so whenever it is available
 * and the cores are homogeneous we always use it by default.
 */
static bool
tsc_use_by_default(void)
{
	return true;
}
364420

421+
#endif /* defined(__aarch64__) */
422+
365423
/*
366424
* Initialize the TSC clock source by determining its usability and frequency.
367425
*

src/include/port/pg_cpu.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,4 +56,10 @@ extern uint32 x86_tsc_frequency_khz(void);
5656

5757
#endif /* defined(USE_SSE2) || defined(__i386__) */
5858

59+
#if defined(__aarch64__)
60+
61+
extern uint32 aarch64_cntvct_frequency_khz(void);
62+
63+
#endif /* defined(__aarch64__) */
64+
5965
#endif /* PG_CPU_H */

src/include/portability/instr_time.h

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
* portable high-precision interval timing
55
*
66
* This file provides an abstraction layer to hide portability issues in
7-
* interval timing. On x86 we use the RDTSC/RDTSCP instruction directly in
8-
* certain cases, or alternatively clock_gettime() on Unix-like systems and
7+
* interval timing. On x86 we use the RDTSC/RDTSCP instruction, and on
8+
* AArch64 the CNTVCT_EL0 generic timer, directly in certain cases, or
9+
* alternatively clock_gettime() on Unix-like systems and
910
* QueryPerformanceCounter() on Windows. These macros also give some breathing
1011
* room to use other high-precision-timing APIs.
1112
*
@@ -135,6 +136,11 @@ extern bool pg_set_timing_clock_source(TimingClockSourceType source);
135136
#define PG_INSTR_TSC_CLOCK_NAME_FAST "RDTSC"
136137
#define PG_INSTR_TSC_CLOCK_NAME "RDTSCP"
137138
#define PG_INSTR_TICKS_TO_NS 1
139+
#elif defined(__aarch64__) && !defined(WIN32)
140+
#define PG_INSTR_TSC_CLOCK 1
141+
#define PG_INSTR_TSC_CLOCK_NAME_FAST "CNTVCT_EL0"
142+
#define PG_INSTR_TSC_CLOCK_NAME "CNTVCT_EL0 (ISB)"
143+
#define PG_INSTR_TICKS_TO_NS 1
138144
#elif defined(WIN32)
139145
#define PG_INSTR_TSC_CLOCK 0
140146
#define PG_INSTR_TICKS_TO_NS 1
@@ -143,7 +149,6 @@ extern bool pg_set_timing_clock_source(TimingClockSourceType source);
143149
#define PG_INSTR_TICKS_TO_NS 0
144150
#endif
145151

146-
147152
#if PG_INSTR_TSC_CLOCK
148153
/* Whether to actually use TSC based on availability and GUC settings. */
149154
extern PGDLLIMPORT bool use_tsc;
@@ -287,6 +292,8 @@ pg_ticks_to_ns(int64 ticks)
287292

288293
#if PG_INSTR_TSC_CLOCK
289294

295+
#if defined(__x86_64__) || defined(_M_X64)
296+
290297
#ifdef _MSC_VER
291298
#include <intrin.h>
292299
#endif /* defined(_MSC_VER) */
@@ -329,7 +336,49 @@ pg_get_ticks(void)
329336
return pg_get_ticks_system();
330337
}
331338

332-
#else
339+
#elif defined(__aarch64__) && !defined(WIN32)
340+
341+
/*
342+
* Read the ARM generic timer counter (CNTVCT_EL0).
343+
*
344+
* The "fast" variant reads the counter without a barrier, analogous to RDTSC
345+
* on x86. The regular variant issues an ISB (Instruction Synchronization
346+
* Barrier) first, which acts as a serializing instruction analogous to RDTSCP,
347+
* ensuring all preceding instructions have completed before reading the
348+
* counter.
349+
*/
350+
static inline instr_time
351+
pg_get_ticks_fast(void)
352+
{
353+
if (likely(use_tsc))
354+
{
355+
instr_time now;
356+
357+
now.ticks = __builtin_arm_rsr64("cntvct_el0");
358+
return now;
359+
}
360+
361+
return pg_get_ticks_system();
362+
}
363+
364+
static inline instr_time
365+
pg_get_ticks(void)
366+
{
367+
if (likely(use_tsc))
368+
{
369+
instr_time now;
370+
371+
__builtin_arm_isb(0xf);
372+
now.ticks = __builtin_arm_rsr64("cntvct_el0");
373+
return now;
374+
}
375+
376+
return pg_get_ticks_system();
377+
}
378+
379+
#endif /* defined(__aarch64__) */
380+
381+
#else /* !PG_INSTR_TSC_CLOCK */
333382

334383
static inline instr_time
335384
pg_get_ticks_fast(void)

src/port/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ pgport_sources = [
77
'noblock.c',
88
'path.c',
99
'pg_bitutils.c',
10+
'pg_cpu_arm.c',
1011
'pg_cpu_x86.c',
1112
'pg_localeconv_r.c',
1213
'pg_numa.c',

src/port/pg_cpu_arm.c

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/*-------------------------------------------------------------------------
2+
*
3+
* pg_cpu_arm.c
4+
* Runtime CPU feature detection for AArch64
5+
*
6+
* Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7+
* Portions Copyright (c) 1994, Regents of the University of California
8+
*
9+
*
10+
* IDENTIFICATION
11+
* src/port/pg_cpu_arm.c
12+
*
13+
*-------------------------------------------------------------------------
14+
*/
15+
16+
#include "c.h"
17+
18+
#if defined(__aarch64__) && !defined(WIN32)
19+
20+
#include "port/pg_cpu.h"
21+
22+
/*
23+
* Return the frequency of the ARM generic timer (CNTVCT_EL0) in kHz.
24+
*
25+
* The CNTFRQ_EL0 system register is architecturally guaranteed to be readable
26+
* from EL0 (userspace) and holds the timer frequency in Hz. The firmware sets
27+
* this at boot and it does not change.
28+
*
29+
* Returns 0 if the frequency is not available (should not happen on conforming
30+
* implementations).
31+
*/
32+
uint32
33+
aarch64_cntvct_frequency_khz(void)
34+
{
35+
uint64 freq;
36+
37+
freq = __builtin_arm_rsr64("cntfrq_el0");
38+
39+
if (freq == 0)
40+
return 0;
41+
42+
return (uint32) (freq / 1000);
43+
}
44+
45+
#endif /* defined(__aarch64__) */

0 commit comments

Comments
 (0)