diff --git a/include/lib/allocation_tracker.hpp b/include/lib/allocation_tracker.hpp index 0875c75e..e8302ad5 100644 --- a/include/lib/allocation_tracker.hpp +++ b/include/lib/allocation_tracker.hpp @@ -12,7 +12,6 @@ #include "pevent.hpp" #include "unlikely.hpp" -#include <atomic> #include #include #include @@ -86,12 +85,11 @@ class AllocationTracker { static void ensure_key_initialized(); private: + static void create_key(); static void delete_tl_state(void *tl_state); - // POSIX does not define an invalid pthread_key_t value, but implementations - // allocate keys starting from 0, so -1 (all bits set) is a safe sentinel. - static constexpr pthread_key_t kInvalidKey = static_cast<pthread_key_t>(-1); - static std::atomic<pthread_key_t> _tl_state_key; + static pthread_once_t _tl_key_once; + static pthread_key_t _tl_state_key; static constexpr unsigned k_ratio_max_elt_to_bitset_size = 16; // NOLINTBEGIN(misc-non-private-member-variables-in-classes) diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc index ba5c58bc..4f9f73e7 100644 --- a/src/lib/allocation_tracker.cc +++ b/src/lib/allocation_tracker.cc @@ -18,11 +18,12 @@ #include "tsc_clock.hpp" #include -#include <atomic> #include #include #include +#include <cstdio> #include +#include <cstring> #include #include #include @@ -30,8 +31,8 @@ namespace ddprof { AllocationTracker *AllocationTracker::_instance; -std::atomic<pthread_key_t> AllocationTracker::_tl_state_key{ - AllocationTracker::kInvalidKey}; +pthread_once_t AllocationTracker::_tl_key_once = PTHREAD_ONCE_INIT; +pthread_key_t AllocationTracker::_tl_state_key; namespace { DDPROF_NOINLINE auto sleep_and_retry_reserve(MPSCRingBufferWriter &writer, } return Buffer{}; } +// Abort with a diagnostic if a pthread call returns an error. +// pthread functions return the error code directly (they do not set errno), +// so perror() would print the wrong message. 
+#define PTHREAD_CHECK(call) \ + do { \ + int err_ = (call); \ + if (err_ != 0) { \ + fprintf(stderr, "ddprof: %s failed: %s\n", #call, strerror(err_)); \ + std::abort(); \ + } \ + } while (0) + } // namespace void AllocationTracker::delete_tl_state(void *tl_state) { delete static_cast<TrackerThreadLocalState *>(tl_state); } +void AllocationTracker::create_key() { + PTHREAD_CHECK(pthread_key_create(&_tl_state_key, delete_tl_state)); +} + void AllocationTracker::ensure_key_initialized() { - if (_tl_state_key.load(std::memory_order_acquire) != kInvalidKey) { - return; - } - pthread_key_t new_key; - if (pthread_key_create(&new_key, delete_tl_state) != 0) { - return; - } - pthread_key_t expected = kInvalidKey; - if (!_tl_state_key.compare_exchange_strong(expected, new_key, - std::memory_order_release)) { - // Another thread beat us, discard our key - pthread_key_delete(new_key); - } + PTHREAD_CHECK(pthread_once(&_tl_key_once, create_key)); } TrackerThreadLocalState *AllocationTracker::get_tl_state() { - const pthread_key_t key = _tl_state_key.load(std::memory_order_relaxed); - if (key == kInvalidKey) { - return nullptr; - } - return static_cast<TrackerThreadLocalState *>(pthread_getspecific(key)); + return static_cast<TrackerThreadLocalState *>( + pthread_getspecific(_tl_state_key)); } TrackerThreadLocalState *AllocationTracker::init_tl_state() { - // acquire pairs with the release in ensure_key_initialized(): guarantees - // that if we see a valid key the pthread key is fully published and we won't - // silently return nullptr and drop the thread's initial allocations. - const pthread_key_t key = _tl_state_key.load(std::memory_order_acquire); - if (key == kInvalidKey) { - return nullptr; - } + // Free any previously set state (e.g. inherited across fork). 
+ delete static_cast<TrackerThreadLocalState *>( + pthread_getspecific(_tl_state_key)); + auto *state = new (std::nothrow) TrackerThreadLocalState{}; if (!state) { return nullptr; } state->tid = ddprof::gettid(); state->stack_bounds = retrieve_stack_bounds(); - pthread_setspecific(key, state); + PTHREAD_CHECK(pthread_setspecific(_tl_state_key, state)); return state; } @@ -200,11 +199,9 @@ void AllocationTracker::free() { pevent_munmap_event(&_pevent); // The pthread key (_tl_state_key) is intentionally not deleted here. - // pthread_key_delete would race with concurrent get_tl_state() calls that - // already loaded the key value but haven't called pthread_getspecific yet. - // The cost is one leaked key per dlclose/reload cycle, which is acceptable: - // POSIX guarantees at least PTHREAD_KEYS_MAX (128) keys per process, and - // library reload is not a supported use case. + // pthread_key_delete would race with concurrent get_tl_state() calls. + // The cost is one leaked key per dlclose/reload cycle, acceptable given + // POSIX guarantees at least PTHREAD_KEYS_MAX (128) keys per process. // Do not destroy the object: // there is an inherent race condition between checking diff --git a/src/lib/dd_profiling.cc b/src/lib/dd_profiling.cc index 43a9a235..c4fc88cd 100644 --- a/src/lib/dd_profiling.cc +++ b/src/lib/dd_profiling.cc @@ -236,6 +236,10 @@ struct ProfilerAutoStart { ProfilerAutoStart() noexcept { init_state(); + // Create the pthread key early, before any thread can fork. + // This avoids a deadlock on musl if fork() races with pthread_once + // (musl has no fork-generation mechanism to recover in-progress state). 
+ AllocationTracker::ensure_key_initialized(); // Note that library needs to be linked with `--no-as-needed` when using // autostart, otherwise linker will completely remove library from DT_NEEDED diff --git a/test/allocation_tracker-bench.cc b/test/allocation_tracker-bench.cc index 6f10017c..c7e43599 100644 --- a/test/allocation_tracker-bench.cc +++ b/test/allocation_tracker-bench.cc @@ -335,7 +335,6 @@ static void BM_GetTlState(benchmark::State &state) { std::atomic<int> ready_count{0}; std::atomic<bool> go{false}; std::atomic<bool> done{false}; - std::atomic<int64_t> total_ops{0}; std::vector<std::thread> workers; workers.reserve(nb_threads); @@ -351,13 +350,10 @@ static void BM_GetTlState(benchmark::State &state) { break; } // Hot loop: pure TLS access - int64_t ops = 0; for (int j = 0; j < k_ops_per_iter; ++j) { auto *tl = ddprof::AllocationTracker::get_tl_state(); benchmark::DoNotOptimize(tl); - ++ops; } - total_ops.fetch_add(ops, std::memory_order_relaxed); ready_count.fetch_sub(1, std::memory_order_release); // wait for go to be lowered before looping back while (go.load(std::memory_order_acquire) && @@ -367,15 +363,14 @@ for (auto _ : state) { - total_ops.store(0, std::memory_order_relaxed); // Wait for all workers to be ready while (ready_count.load(std::memory_order_acquire) != nb_threads) {} go.store(true, std::memory_order_release); // Wait for all workers to finish the hot loop while (ready_count.load(std::memory_order_acquire) != 0) {} go.store(false, std::memory_order_release); - state.SetItemsProcessed(total_ops.load(std::memory_order_relaxed)); } + state.SetItemsProcessed(state.iterations() * nb_threads * k_ops_per_iter); done.store(true, std::memory_order_release); go.store(true, std::memory_order_release); // unblock workers diff --git a/test/allocation_tracker_fork_test.cc b/test/allocation_tracker_fork_test.cc index 91d82253..614e4fad 100644 --- a/test/allocation_tracker_fork_test.cc +++ b/test/allocation_tracker_fork_test.cc 
@@ -81,7 +81,9 @@ int main() { const LogHandle log_handle(LL_NOTICE); LG_NTC("allocation_tracker_fork_test starting"); - // Before key initialization, get_tl_state() must return NULL. + // Before ensure_key_initialized(), get_tl_state() returns NULL because + // pthread_getspecific on the zero-initialized (uncreated) key returns NULL + // on both glibc and musl. auto *pre_init = AllocationTracker::get_tl_state(); CHECK_OR_RETURN(pre_init == nullptr, "expected NULL before key init (got %p)", static_cast<void *>(pre_init));