diff --git a/include/lib/allocation_tracker.hpp b/include/lib/allocation_tracker.hpp index 0875c75e..e8302ad5 100644 --- a/include/lib/allocation_tracker.hpp +++ b/include/lib/allocation_tracker.hpp @@ -12,7 +12,6 @@ #include "pevent.hpp" #include "unlikely.hpp" -#include <atomic> #include #include #include @@ -86,12 +85,11 @@ class AllocationTracker { static void ensure_key_initialized(); private: + static void create_key(); static void delete_tl_state(void *tl_state); - // POSIX does not define an invalid pthread_key_t value, but implementations - // allocate keys starting from 0, so -1 (all bits set) is a safe sentinel. - static constexpr pthread_key_t kInvalidKey = static_cast<pthread_key_t>(-1); - static std::atomic<pthread_key_t> _tl_state_key; + static pthread_once_t _tl_key_once; + static pthread_key_t _tl_state_key; static constexpr unsigned k_ratio_max_elt_to_bitset_size = 16; // NOLINTBEGIN(misc-non-private-member-variables-in-classes) diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc index ba5c58bc..4f9f73e7 100644 --- a/src/lib/allocation_tracker.cc +++ b/src/lib/allocation_tracker.cc @@ -18,11 +18,12 @@ #include "tsc_clock.hpp" #include -#include <atomic> #include #include #include +#include <cstdio> #include +#include <cstring> #include #include #include @@ -30,8 +31,8 @@ namespace ddprof { AllocationTracker *AllocationTracker::_instance; -std::atomic<pthread_key_t> AllocationTracker::_tl_state_key{ - AllocationTracker::kInvalidKey}; +pthread_once_t AllocationTracker::_tl_key_once = PTHREAD_ONCE_INIT; +pthread_key_t AllocationTracker::_tl_state_key; namespace { DDPROF_NOINLINE auto sleep_and_retry_reserve(MPSCRingBufferWriter &writer, } return Buffer{}; } +// Abort with a diagnostic if a pthread call returns an error. +// pthread functions return the error code directly (they do not set errno), +// so perror() would print the wrong message. 
+#define PTHREAD_CHECK(call) \ + do { \ + int err_ = (call); \ + if (err_ != 0) { \ + fprintf(stderr, "ddprof: %s failed: %s\n", #call, strerror(err_)); \ + std::abort(); \ + } \ + } while (0) + } // namespace void AllocationTracker::delete_tl_state(void *tl_state) { delete static_cast<TrackerThreadLocalState *>(tl_state); } +void AllocationTracker::create_key() { + PTHREAD_CHECK(pthread_key_create(&_tl_state_key, delete_tl_state)); +} + void AllocationTracker::ensure_key_initialized() { - if (_tl_state_key.load(std::memory_order_acquire) != kInvalidKey) { - return; - } - pthread_key_t new_key; - if (pthread_key_create(&new_key, delete_tl_state) != 0) { - return; - } - pthread_key_t expected = kInvalidKey; - if (!_tl_state_key.compare_exchange_strong(expected, new_key, - std::memory_order_release)) { - // Another thread beat us, discard our key - pthread_key_delete(new_key); - } + PTHREAD_CHECK(pthread_once(&_tl_key_once, create_key)); } TrackerThreadLocalState *AllocationTracker::get_tl_state() { - const pthread_key_t key = _tl_state_key.load(std::memory_order_relaxed); - if (key == kInvalidKey) { - return nullptr; - } - return static_cast<TrackerThreadLocalState *>(pthread_getspecific(key)); + return static_cast<TrackerThreadLocalState *>( + pthread_getspecific(_tl_state_key)); } TrackerThreadLocalState *AllocationTracker::init_tl_state() { - // acquire pairs with the release in ensure_key_initialized(): guarantees - // that if we see a valid key the pthread key is fully published and we won't - // silently return nullptr and drop the thread's initial allocations. - const pthread_key_t key = _tl_state_key.load(std::memory_order_acquire); - if (key == kInvalidKey) { - return nullptr; - } + // Free any previously set state (e.g. inherited across fork). 
+ delete static_cast<TrackerThreadLocalState *>( + pthread_getspecific(_tl_state_key)); + auto *state = new (std::nothrow) TrackerThreadLocalState{}; if (!state) { return nullptr; } state->tid = ddprof::gettid(); state->stack_bounds = retrieve_stack_bounds(); - pthread_setspecific(key, state); + PTHREAD_CHECK(pthread_setspecific(_tl_state_key, state)); return state; } @@ -200,11 +199,9 @@ void AllocationTracker::free() { pevent_munmap_event(&_pevent); // The pthread key (_tl_state_key) is intentionally not deleted here. - // pthread_key_delete would race with concurrent get_tl_state() calls that - // already loaded the key value but haven't called pthread_getspecific yet. - // The cost is one leaked key per dlclose/reload cycle, which is acceptable: - // POSIX guarantees at least PTHREAD_KEYS_MAX (128) keys per process, and - // library reload is not a supported use case. + // pthread_key_delete would race with concurrent get_tl_state() calls. + // The cost is one leaked key per dlclose/reload cycle, acceptable given + // POSIX guarantees at least PTHREAD_KEYS_MAX (128) keys per process. // Do not destroy the object: // there is an inherent race condition between checking diff --git a/src/lib/dd_profiling.cc b/src/lib/dd_profiling.cc index 43a9a235..c4fc88cd 100644 --- a/src/lib/dd_profiling.cc +++ b/src/lib/dd_profiling.cc @@ -236,6 +236,10 @@ struct ProfilerAutoStart { ProfilerAutoStart() noexcept { init_state(); + // Create the pthread key early, before any thread can fork. + // This avoids a deadlock on musl if fork() races with pthread_once + // (musl has no fork-generation mechanism to recover in-progress state). 
+ AllocationTracker::ensure_key_initialized(); // Note that library needs to be linked with `--no-as-needed` when using // autostart, otherwise linker will completely remove library from DT_NEEDED diff --git a/test/allocation_tracker-bench.cc b/test/allocation_tracker-bench.cc index 6f10017c..c7e43599 100644 --- a/test/allocation_tracker-bench.cc +++ b/test/allocation_tracker-bench.cc @@ -335,7 +335,6 @@ static void BM_GetTlState(benchmark::State &state) { std::atomic<int> ready_count{0}; std::atomic<bool> go{false}; std::atomic<bool> done{false}; - std::atomic<int64_t> total_ops{0}; std::vector<std::thread> workers; workers.reserve(nb_threads); @@ -351,13 +350,10 @@ static void BM_GetTlState(benchmark::State &state) { break; } // Hot loop: pure TLS access - int64_t ops = 0; for (int j = 0; j < k_ops_per_iter; ++j) { auto *tl = ddprof::AllocationTracker::get_tl_state(); benchmark::DoNotOptimize(tl); - ++ops; } - total_ops.fetch_add(ops, std::memory_order_relaxed); ready_count.fetch_sub(1, std::memory_order_release); // wait for go to be lowered before looping back while (go.load(std::memory_order_acquire) && @@ -367,15 +363,14 @@ for (auto _ : state) { - total_ops.store(0, std::memory_order_relaxed); // Wait for all workers to be ready while (ready_count.load(std::memory_order_acquire) != nb_threads) {} go.store(true, std::memory_order_release); // Wait for all workers to finish the hot loop while (ready_count.load(std::memory_order_acquire) != 0) {} go.store(false, std::memory_order_release); - state.SetItemsProcessed(total_ops.load(std::memory_order_relaxed)); } + state.SetItemsProcessed(state.iterations() * nb_threads * k_ops_per_iter); done.store(true, std::memory_order_release); go.store(true, std::memory_order_release); // unblock workers diff --git a/test/allocation_tracker_fork_test.cc b/test/allocation_tracker_fork_test.cc index 91d82253..614e4fad 100644 --- a/test/allocation_tracker_fork_test.cc +++ b/test/allocation_tracker_fork_test.cc 
@@ -81,7 +81,9 @@ int main() { const LogHandle log_handle(LL_NOTICE); LG_NTC("allocation_tracker_fork_test starting"); - // Before key initialization, get_tl_state() must return NULL. + // Before ensure_key_initialized(), get_tl_state() returns NULL because + // pthread_getspecific on the zero-initialized (uncreated) key returns NULL + // on both glibc and musl. auto *pre_init = AllocationTracker::get_tl_state(); CHECK_OR_RETURN(pre_init == nullptr, "expected NULL before key init (got %p)", static_cast<void *>(pre_init));