From ba6be15059edebe0fc46b93e2df24eaf258f6dc3 Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Fri, 20 Mar 2026 14:32:17 +0100 Subject: [PATCH 01/13] feat(otel-thread-ctx): add TLS C shim C and build Declares `custom_labels_current_set_v2` in a C file so the linker emits a proper TLSDESC dynamic symbol entry readable by out-of-process tools (e.g. eBPF profiler). Rust's `#[thread_local]` does not produce a TLSDESC dialect, so a C shim is required. Co-Authored-By: Claude Sonnet 4.6 --- Cargo.lock | 1 + libdd-library-config/Cargo.toml | 3 +++ libdd-library-config/build.rs | 11 +++++++++ libdd-library-config/src/lib.rs | 1 + libdd-library-config/src/otel_thread_ctx.rs | 25 +++++++++++++++++++++ libdd-library-config/src/tls_shim.c | 20 +++++++++++++++++ 6 files changed, 61 insertions(+) create mode 100644 libdd-library-config/build.rs create mode 100644 libdd-library-config/src/otel_thread_ctx.rs create mode 100644 libdd-library-config/src/tls_shim.c diff --git a/Cargo.lock b/Cargo.lock index 1961545419..b29f9599ba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3039,6 +3039,7 @@ name = "libdd-library-config" version = "1.1.0" dependencies = [ "anyhow", + "cc", "libdd-trace-protobuf", "memfd", "prost", diff --git a/libdd-library-config/Cargo.toml b/libdd-library-config/Cargo.toml index 8c2f2e35a8..ae287abf22 100644 --- a/libdd-library-config/Cargo.toml +++ b/libdd-library-config/Cargo.toml @@ -26,6 +26,9 @@ rmp-serde = "1.3.0" libdd-trace-protobuf = { version = "3.0.0", path = "../libdd-trace-protobuf" } +[build-dependencies] +cc = "1.1.31" + [dev-dependencies] tempfile = { version = "3.3" } serial_test = "3.2" diff --git a/libdd-library-config/build.rs b/libdd-library-config/build.rs new file mode 100644 index 0000000000..6e3701205f --- /dev/null +++ b/libdd-library-config/build.rs @@ -0,0 +1,11 @@ +// Unless explicitly stated otherwise all files in this repository are licensed under the Apache +// License Version 2.0. 
This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present Datadog, Inc. + +fn main() { + // Only compile the TLS shim on Linux; the thread-level context feature is Linux-only. + #[cfg(target_os = "linux")] + { + cc::Build::new().file("src/tls_shim.c").compile("tls_shim"); + println!("cargo:rerun-if-changed=src/tls_shim.c"); + } +} diff --git a/libdd-library-config/src/lib.rs b/libdd-library-config/src/lib.rs index f243678c09..7262511250 100644 --- a/libdd-library-config/src/lib.rs +++ b/libdd-library-config/src/lib.rs @@ -1,6 +1,7 @@ // Copyright 2021-Present Datadog, Inc. https://www.datadoghq.com/ // SPDX-License-Identifier: Apache-2.0 pub mod otel_process_ctx; +pub mod otel_thread_ctx; pub mod tracer_metadata; use std::borrow::Cow; diff --git a/libdd-library-config/src/otel_thread_ctx.rs b/libdd-library-config/src/otel_thread_ctx.rs new file mode 100644 index 0000000000..03b3a16db0 --- /dev/null +++ b/libdd-library-config/src/otel_thread_ctx.rs @@ -0,0 +1,25 @@ +// Unless explicitly stated otherwise all files in this repository are licensed under the Apache +// License Version 2.0. This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present Datadog, Inc. + +//! # Thread-level context sharing +//! +//! This module implements the publisher side of the Thread Context OTEP (PR #4947). +//! +//! Since `rustc` doesn't currently support the TLSDESC dialect, we use a C shim to set and get the +//! thread-local storage used for the context. + +#[cfg(target_os = "linux")] +pub mod linux { + use std::ffi::c_void; + + extern "C" { + /// Returns the current thread's value of `custom_labels_current_set_v2`. + fn libdd_get_custom_labels_current_set_v2() -> *mut *mut c_void; + } + + /// Read the TLS pointer for the current thread. 
+ #[allow(clippy::missing_safety_doc)] + pub(crate) unsafe fn get_tls_ptr() -> *mut *mut c_void { + libdd_get_custom_labels_current_set_v2() + } +} diff --git a/libdd-library-config/src/tls_shim.c b/libdd-library-config/src/tls_shim.c new file mode 100644 index 0000000000..4a6d612e03 --- /dev/null +++ b/libdd-library-config/src/tls_shim.c @@ -0,0 +1,20 @@ +// Unless explicitly stated otherwise all files in this repository are licensed under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present Datadog, Inc. + +// Declares the thread-local pointer that external readers (e.g. the eBPF +// profiler) discover via the dynsym table. The Rust layer accesses this +// pointer in otel_thread_ctx.rs. +// +// The variable is be declared in C in order to use the TLSDESC dialect for +// thread-local storage, which is required by the OTel thread-level context +// sharing spec. Unfortunately, it's not possible to have Rust use this dialect +// as of today. +#include + +__attribute__((visibility("default"))) +__thread void *custom_labels_current_set_v2 = NULL; + +// Return the resolved address of the thread-local variable. 
+void **libdd_get_custom_labels_current_set_v2(void) { + return &custom_labels_current_set_v2; +} From a664e173147b81e0db59572d35c7b64bac43ee1e Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Tue, 24 Mar 2026 00:16:29 +0100 Subject: [PATCH 02/13] feat: add thread context record type and wrappers --- libdd-library-config/build.rs | 11 +- libdd-library-config/src/otel_process_ctx.rs | 5 +- libdd-library-config/src/otel_thread_ctx.rs | 580 ++++++++++++++++++- libdd-library-config/src/tls_shim.c | 6 +- 4 files changed, 589 insertions(+), 13 deletions(-) diff --git a/libdd-library-config/build.rs b/libdd-library-config/build.rs index 6e3701205f..4fa8078d52 100644 --- a/libdd-library-config/build.rs +++ b/libdd-library-config/build.rs @@ -5,7 +5,16 @@ fn main() { // Only compile the TLS shim on Linux; the thread-level context feature is Linux-only. #[cfg(target_os = "linux")] { - cc::Build::new().file("src/tls_shim.c").compile("tls_shim"); + let mut build = cc::Build::new(); + + // - On aarch64, TLSDESC is already the only dynamic TLS model so no flag is needed. + // - On x86-64, we use `-mtls-dialect=gnu2` (supported since GCC 4.4 and Clang 19+) to force + // the use of TLSDESC as mandated by the spec. If it's not supported, this build will + // fail. + #[cfg(target_arch = "x86_64")] + build.flag("-mtls-dialect=gnu2"); + + build.file("src/tls_shim.c").compile("tls_shim"); println!("cargo:rerun-if-changed=src/tls_shim.c"); } } diff --git a/libdd-library-config/src/otel_process_ctx.rs b/libdd-library-config/src/otel_process_ctx.rs index a85221e357..b3efad4aab 100644 --- a/libdd-library-config/src/otel_process_ctx.rs +++ b/libdd-library-config/src/otel_process_ctx.rs @@ -1,4 +1,4 @@ -// Copyright 2021-Present Datadog, Inc. https://www.datadoghq.com/ +// Copyright 2026-Present Datadog, Inc. https://www.datadoghq.com/ // SPDX-License-Identifier: Apache-2.0 //! Implementation of the publisher part of the [OTEL process @@ -8,7 +8,8 @@ //! //! 
Process context sharing implies concurrently writing to a memory area that another process //! might be actively reading. However, reading isn't done as direct memory accesses but goes -//! through the OS, so the Rust definition of race conditions doesn't really apply. +//! through the OS, so the Rust definition of race conditions doesn't really apply. We also use +//! atomics and fences, see MappingHeader's documentation. #[cfg(target_os = "linux")] #[cfg(target_has_atomic = "64")] diff --git a/libdd-library-config/src/otel_thread_ctx.rs b/libdd-library-config/src/otel_thread_ctx.rs index 03b3a16db0..475e46c086 100644 --- a/libdd-library-config/src/otel_thread_ctx.rs +++ b/libdd-library-config/src/otel_thread_ctx.rs @@ -1,5 +1,5 @@ -// Unless explicitly stated otherwise all files in this repository are licensed under the Apache -// License Version 2.0. This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present Datadog, Inc. +// Copyright 2026-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 //! # Thread-level context sharing //! @@ -7,19 +7,585 @@ //! //! Since `rustc` doesn't currently support the TLSDESC dialect, we use a C shim to set and get the //! thread-local storage used for the context. +//! +//! ## Synchronization +//! +//! Readers are constrained to the same thread as the writer and operate like async-signal +//! handlers: the writer thread is always stopped while a reader runs. There is thus no +//! cross-thread synchronization concerns. The only hazard is compiler reordering or not committing +//! writes to memory, which is handled using volatile writes. #[cfg(target_os = "linux")] pub mod linux { - use std::ffi::c_void; + use std::{ + ffi::c_void, + mem, + ops::{Deref, DerefMut}, + ptr::{self, NonNull}, + }; extern "C" { - /// Returns the current thread's value of `custom_labels_current_set_v2`. 
+ /// Return the address of the current thread's `custom_labels_current_set_v2` local. fn libdd_get_custom_labels_current_set_v2() -> *mut *mut c_void; } - /// Read the TLS pointer for the current thread. + /// Return a pointer to the TLS slot holding the context. The address calculation requires a + /// call to a C shim in order to use the TLSDESC dialect from Rust. Note that the returned + /// address is stable (per thread), so the result can be reused . + /// + /// Note that the address is read by an external process or a signal handler, so it should be + /// written to using [std::ptr::write_volatile]. #[allow(clippy::missing_safety_doc)] - pub(crate) unsafe fn get_tls_ptr() -> *mut *mut c_void { - libdd_get_custom_labels_current_set_v2() + fn get_tls_ptr() -> *mut *mut ThreadContextRecord { + // Safety: this is just an FFI call, but there's no particular pre-condition to uphold: the + // TLS slot should always be accessible. + unsafe { libdd_get_custom_labels_current_set_v2().cast() } + } + + /// Maximum size in bytes of the `attrs_data` field. + /// + /// Chosen so that the total record size (`28 + MAX_ATTRS_DATA_SIZE`) stays within the 640-byte + /// limit recommended by the spec (the eBPF profiler read limit). + pub const MAX_ATTRS_DATA_SIZE: usize = 612; + + /// In-memory layout of a thread-level context. The structure MUST match exactly the + /// specification. + /// + /// # Synchronization + /// + /// Readers are async-signal handlers on the same thread; the writer is always stopped while a + /// reader runs. `valid` is written with `ptr::write_volatile` so the compiler cannot reorder + /// its store past the surrounding field writes: + /// + /// - The writer sets `valid = 1` *after* all other fields are populated (publish). + /// - The writer sets `valid = 0` *before* modifying fields in-place (modify). + #[repr(C)] + pub struct ThreadContextRecord { + /// 128-bit trace identifier; all-zeroes means "no trace". 
+ pub trace_id: [u8; 16], + /// 64-bit span identifier. + pub span_id: [u8; 8], + /// Whether the record is ready/consistent. Written with `ptr::write_volatile`. + pub valid: u8, + pub _reserved: u8, + /// Number of populated bytes in `attrs_data`. + pub attrs_data_size: u16, + /// Packed variable-length key-value records. + /// + /// It's a contiguous list of blocks with layout: + /// + /// 1. 1-byte `key_index` + /// 2. 1-byte `val_len` + /// 3. `val_len` bytes of a string value. + /// + /// # Size + /// + /// Currently, we always allocate the max recommended size. This potentially wastes a few + /// hundred bytes per thread, but it guarantees that we can modify the context in-place + /// without (re)allocation in the hot path. Having a hybrid scheme (starting smaller and + /// resizing up a few times) is not out of the question. + pub attrs_data: [u8; MAX_ATTRS_DATA_SIZE], + } + + impl ThreadContextRecord { + /// Build a record with the given trace id, span id and attributes. + pub fn new(trace_id: [u8; 16], span_id: [u8; 8], attrs: &[(u8, &str)]) -> Self { + let mut record = Self { + trace_id, + span_id, + ..Default::default() + }; + record.set_attrs(attrs); + record + } + + /// Write `value` to `self.valid` using a volatile store, preventing the compiler from + /// eliminating or reordering or this write past other volatile writes to the record. + fn write_valid_volatile(&mut self, value: u8) { + // Safety: we have an exclusive borrow of `self`, so `valid` is live and valid for + // writes. + unsafe { ptr::write_volatile(&mut self.valid as *mut u8, value) } + } + + /// Write `trace_id` to `self.trace_id` using a volatile store, preventing the compiler + /// from eliminating or reordering this write past other volatile writes to the record. + pub fn write_trace_id_volatile(&mut self, trace_id: [u8; 16]) { + // Safety: we have an exclusive borrow of `self`, so `trace_id` is live and valid for + // writes. 
+ unsafe { ptr::write_volatile(&mut self.trace_id, trace_id) } + } + + /// Write `span_id` to `self.span_id` using a volatile store, preventing the compiler from + /// eliminating or reordering this write past other volatile writes to the record. + pub fn write_span_id_volatile(&mut self, span_id: [u8; 8]) { + // Safety: we have an exclusive borrow of `self`, so `span_id` is live and valid for + // writes. + unsafe { ptr::write_volatile(&mut self.span_id, span_id) } + } + + /// Write a single byte of the encoded attributes using a volatile store, preventing the + /// compiler from eliminating or reordering or this write past other volatile writes to the + /// record. + /// + /// # Panic + /// + /// Panics if the offset is out of bound. + fn write_attrs_volatile(&mut self, offset: usize, value: u8) { + // Safety: we have an exclusive borrow of `self`, so `attrs_data` is live and valid for + // writes. + unsafe { ptr::write_volatile(&mut self.attrs_data[offset] as *mut u8, value) } + } + + /// Copy `src` into the encoded attributes starting at `offset` using volatile stores, + /// preventing the compiler from eliminating or reordering or this write past other + /// volatile writes to the record. + /// + /// # Panic + /// + /// Panics if `offset + src.len() > self.attrs_data.len()`. + fn copy_attrs_volatile(&mut self, src: &[u8], offset: usize) { + // Safety: we have an exclusive borrow of `self`, so `attrs_data` is live and valid for + // writes. + for (idx, byte) in src.iter().enumerate() { + unsafe { ptr::write_volatile(&mut self.attrs_data[offset + idx] as *mut u8, *byte) } + } + } + + /// Encode `attributes` into `record.attrs_data` as packed key-value records. Existing data + /// are overridden (and if there were more entires than `attributes.len()`, they aren't + /// zeroed, but they will be ignored by readers). + /// + /// # Arguments + /// + /// Each input entry is a pair of a 1-byte `key_index` and a string value. 
+ /// + /// # Size limits + /// + /// Any value over 255 bytes will be capped at this size. If the total size of the encoded + /// attributes is over [MAX_ATTRS_DATA_SIZE], extra attributes are ignored. We do this + /// instead of raising an error because we encode the attributes on-the-fly. Proper error + /// recovery would require us to be able to rollback to the previous attributes which would + /// hurt the happy path, or leave the record in a inconsistent state. Another possibility + /// would be to error out and reset the record in that situation. + /// + /// # Memory + /// + /// Disclaimer: This note is for internal usage only. As a consumer of this library, please + /// use [ThreadContext::modify] to mutate the currently attached record. + /// + /// Writes are volatile, making `set_attrs` suitable for mutating an attached record + /// in-place (TODO: does this impact the performance of writing to a non-attached record? + /// Should we have two specialized version `set_attrs` and `set_attrs_volatile`?). + pub fn set_attrs(&mut self, attributes: &[(u8, &str)]) { + let mut offset = 0; + + for &(key_index, val) in attributes { + let val_bytes = val.as_bytes(); + let val_len = val_bytes.len().min(255); + let entry_size = 2 + val_len; + + if offset + entry_size > MAX_ATTRS_DATA_SIZE { + break; + } + + self.write_attrs_volatile(offset, key_index); + // `val_len <= 255` thanks to the `min()` + self.write_attrs_volatile(offset + 1, val_len as u8); + self.copy_attrs_volatile(&val_bytes[..val_len], offset + 2); + offset += entry_size; + } + + // `offset < MAX_ATTRS_DATA_SIZE`, which guarantees it fits in a `u16` + self.attrs_data_size = offset as u16; + } + } + + impl Default for ThreadContextRecord { + fn default() -> Self { + Self { + trace_id: [0u8; 16], + span_id: [0u8; 8], + valid: 0, + _reserved: 0, + attrs_data_size: 0, + attrs_data: [0u8; MAX_ATTRS_DATA_SIZE], + } + } + } + + /// An owned (and non-moving) thread context record allocation. 
+ /// + /// We don't use `Box` under the hood because it excludes aliasing, while we share the context + /// to readers through thread-level context and through the FFI. But it is a boxed + /// `ThreadContextRecord` for all intent of purpose. + pub struct ThreadContext(NonNull); + + impl ThreadContext { + /// Create a new thread context with the given trace/span IDs and encoded attributes. + pub fn new(trace_id: [u8; 16], span_id: [u8; 8], attrs: &[(u8, &str)]) -> Self { + Self::from(ThreadContextRecord::new(trace_id, span_id, attrs)) + } + + /// Turn this thread context into a raw pointer to the underlying [ThreadContextRecord]. + /// The pointer must be reconstructed through [`Self::from_raw`] in order to be properly + /// dropped, or the record will leak. + pub fn into_raw(self) -> *mut ThreadContextRecord { + use mem::ManuallyDrop; + + let mdrop = ManuallyDrop::new(self); + mdrop.0.as_ptr() + } + + /// Reconstruct a [ThreadContextRecord] from a raw pointer to `ThreadContextRecord` that is + /// either `null` or comes from [`Self::into_raw`]. Return `None` if `ptr` is null. + /// + /// # Safety + /// + /// - `ptr` must be `null` or come from a prior call to [`Self::into_raw`]. + /// - if `ptr` is aliased, accesses to through aliases must not be interleaved with method + /// calls on the returned [ThreadContextRecord]. More precisely, mutable references might + /// be reconstructed during those calls, so any constraint from either Stacked Borrows, + /// Tree Borrows or whatever is the current aliasing model implemented in Miri applies. 
+ pub unsafe fn from_raw(ptr: *mut ThreadContextRecord) -> Option { + NonNull::new(ptr).map(Self) + } + } + + impl Default for ThreadContext { + fn default() -> Self { + Self::from(ThreadContextRecord::default()) + } + } + + impl From for ThreadContext { + fn from(record: ThreadContextRecord) -> Self { + // Safety: `Box::into_raw` returns a non-null pointer + unsafe { Self(NonNull::new_unchecked(Box::into_raw(Box::new(record)))) } + } + } + + impl ThreadContext { + /// Swap the current context with a pointer value. Return the previously attached context, + /// if any. + fn swap( + slot: *mut *mut ThreadContextRecord, + tgt: *mut ThreadContextRecord, + ) -> Option { + unsafe { + // Safety: TLS slot is always live and valid for reads and writes. + let prev = ThreadContext::from_raw(*slot); + ptr::write_volatile(slot, tgt); + // Safety: a non-null value in the slot came from a prior `into_raw` call. + prev + } + } + + /// Publish a new (or previously detached) thread context record by writing its pointer + /// into the TLS slot. Sets `valid = 1` before publishing. Returns the previously attached + /// context, if any. + pub fn attach(mut self) -> Option { + let slot = get_tls_ptr(); + // Set `valid = 1` written before the TLS pointer is updated, so any reader that + // observes the new pointer also observes `valid = 1`. + self.write_valid_volatile(1); + Self::swap(slot, self.into_raw()) + } + + /// Modify the currently attached record in-place. Sets `valid = 0` before the update and + /// `valid = 1` after, so a reader that fires between the two writes sees an inconsistent + /// record and skips it. + /// + /// If there's currently no attached context, `modify` will create one, and is in this case + /// equivalent to `ThreadContext::new(trace_id, span_id, attrs).attach()`. + pub fn update(trace_id: [u8; 16], span_id: [u8; 8], attrs: &[(u8, &str)]) { + let slot = get_tls_ptr(); + + // Safety: the tls slot is always valid for reads and writes. 
+            if let Some(current) = unsafe { (*slot).as_mut() } {
+                current.write_valid_volatile(0);
+                current.write_trace_id_volatile(trace_id);
+                current.write_span_id_volatile(span_id);
+                current.set_attrs(attrs);
+                current.write_valid_volatile(1);
+            } else {
+                let mut ctx = ThreadContext::new(trace_id, span_id, attrs);
+                ctx.write_valid_volatile(1);
+                let _ = Self::swap(slot, ctx.into_raw());
+            }
+        }
+
+        /// Detach the current record from the TLS slot. Writes null to the slot, sets `valid = 0`
+        /// on the detached record, and returns it.
+        pub fn detach() -> Option<Self> {
+            Self::swap(get_tls_ptr(), ptr::null_mut()).map(|mut ctx| {
+                ctx.write_valid_volatile(0);
+                ctx
+            })
+        }
+    }
+
+    impl Drop for ThreadContext {
+        fn drop(&mut self) {
+            // Safety: `self.0` was obtained from a `Box::new`, and `ThreadContext` represents
+            // ownership of the underlying memory.
+            unsafe {
+                let _ = Box::from_raw(self.0.as_ptr());
+            }
+        }
+    }
+
+    impl Deref for ThreadContext {
+        type Target = ThreadContextRecord;
+
+        fn deref(&self) -> &Self::Target {
+            // Safety: `ThreadContext` represents ownership of a valid, alive
+            // `ThreadContextRecord`.
+            unsafe { self.0.as_ref() }
+        }
+    }
+
+    impl DerefMut for ThreadContext {
+        fn deref_mut(&mut self) -> &mut Self::Target {
+            // Safety: `ThreadContext` represents ownership of a valid, alive
+            // `ThreadContextRecord`.
+            unsafe { self.0.as_mut() }
+        }
+    }
+
+    #[cfg(test)]
+    mod tests {
+        use super::{ThreadContext, ThreadContextRecord};
+
+        /// Read the TLS pointer for the current thread (the value stored in the TLS slot, not the
+        /// address of the slot itself).
+ fn read_tls_context_ptr() -> *const ThreadContextRecord { + unsafe { *super::get_tls_ptr() as *const ThreadContextRecord } + } + + #[test] + fn tls_lifecycle_basic() { + let trace_id = [1u8; 16]; + let span_id = [2u8; 8]; + + assert!( + read_tls_context_ptr().is_null(), + "TLS must be null initially" + ); + ThreadContext::new(trace_id, span_id, &[]).attach(); + assert!( + !read_tls_context_ptr().is_null(), + "TLS must not be null after attach" + ); + + let prev = ThreadContext::detach().unwrap(); + assert!( + prev.trace_id == trace_id, + "got back a different trace_id than attached" + ); + assert!( + prev.span_id == span_id, + "got back a different span_id than attached" + ); + + assert!( + read_tls_context_ptr().is_null(), + "TLS must be null after detach" + ); + } + + #[test] + fn raw_tls_pointer_read() { + let trace_id = [1u8; 16]; + let span_id = [2u8; 8]; + + ThreadContext::new(trace_id, span_id, &[]).attach(); + + let ptr = read_tls_context_ptr(); + assert!(!ptr.is_null(), "TLS must be non-null after attach"); + + // Safety: context is still live. 
+ let record = unsafe { &*ptr }; + assert_eq!(record.trace_id, trace_id); + assert_eq!(record.span_id, span_id); + assert_eq!(record.valid, 1); + assert_eq!(record.attrs_data_size, 0); + + let _ = ThreadContext::detach(); + } + + #[test] + fn attribute_encoding_basic() { + let attrs: &[(u8, &str)] = &[(0, "GET"), (1, "/api/v1")]; + ThreadContext::new([0u8; 16], [0u8; 8], attrs).attach(); + + let ptr = read_tls_context_ptr(); + assert!(!ptr.is_null()); + let record = unsafe { &*ptr }; + let expected_size: u16 = (2 + 3 + 2 + 7) as u16; + assert_eq!(record.attrs_data_size, expected_size); + assert_eq!(record.attrs_data[0], 0); + assert_eq!(record.attrs_data[1], 3); + assert_eq!(&record.attrs_data[2..5], b"GET"); + assert_eq!(record.attrs_data[5], 1); + assert_eq!(record.attrs_data[6], 7); + assert_eq!(&record.attrs_data[7..14], b"/api/v1"); + + let _ = ThreadContext::detach(); + } + + #[test] + fn attribute_truncation_on_overflow() { + // Build attributes whose combined encoded size exceeds MAX_ATTRS_DATA_SIZE. + // Each max entry: 1 (key) + 1 (len) + 255 (val) = 257 bytes. + // Two such entries: 514 bytes. A third entry of 100 chars would need 102 bytes, + // bringing the total to 616 > 612, so the third entry must be dropped. + let val_a = "a".repeat(255); // 257 bytes encoded + let val_b = "b".repeat(255); // 257 bytes encoded → 514 total + let val_c = "c".repeat(100); // 102 bytes encoded → 616 total: must be dropped + + let attrs: &[(u8, &str)] = &[ + (0, val_a.as_str()), + (1, val_b.as_str()), + (2, val_c.as_str()), + ]; + + ThreadContext::new([0u8; 16], [0u8; 8], attrs).attach(); + + let ptr = read_tls_context_ptr(); + assert!(!ptr.is_null()); + let record = unsafe { &*ptr }; + // Only the first two entries fit (514 bytes). 
+ assert_eq!(record.attrs_data_size, 514); + assert_eq!(record.attrs_data[0], 0); + assert_eq!(record.attrs_data[1], 255); + assert_eq!(record.attrs_data[257], 1); + assert_eq!(record.attrs_data[258], 255); + + let _ = ThreadContext::detach(); + } + + #[test] + fn update_record_in_place() { + let trace_id_1 = [1u8; 16]; + let span_id_1 = [1u8; 8]; + let trace_id_2 = [2u8; 16]; + let span_id_2 = [2u8; 8]; + + // Updating before any context is attached should be equivalent to `attach()` + ThreadContext::update(trace_id_1, span_id_1, &[(0, "v1")]); + + let ptr_before = read_tls_context_ptr(); + assert!(!ptr_before.is_null()); + let record = unsafe { &*ptr_before }; + assert_eq!(record.trace_id, trace_id_1); + assert_eq!(record.span_id, span_id_1); + assert_eq!(record.valid, 1); + assert_eq!(record.attrs_data[0], 0); + assert_eq!(record.attrs_data[1], 2); + assert_eq!(&record.attrs_data[2..4], b"v1"); + + ThreadContext::update(trace_id_2, span_id_2, &[(0, "v2")]); + + let ptr_after = read_tls_context_ptr(); + assert_eq!( + ptr_before, ptr_after, + "modify must not change the TLS pointer" + ); + + let record = unsafe { &*ptr_after }; + assert_eq!(record.trace_id, trace_id_2); + assert_eq!(record.span_id, span_id_2); + assert_eq!(record.valid, 1); + assert_eq!(record.attrs_data[0], 0); + assert_eq!(record.attrs_data[1], 2); + assert_eq!(&record.attrs_data[2..4], b"v2"); + + let _ = ThreadContext::detach(); + assert!(read_tls_context_ptr().is_null()); + } + + #[test] + fn explicit_detach_nulls_tls() { + ThreadContext::new([3u8; 16], [3u8; 8], &[]).attach(); + assert!(!read_tls_context_ptr().is_null()); + + let _ = ThreadContext::detach(); + assert!(read_tls_context_ptr().is_null()); + + // Calling detach again is safe (no-op, returns None). 
+ let _ = ThreadContext::detach(); + assert!(read_tls_context_ptr().is_null()); + } + + #[test] + fn long_value_capped_at_255_bytes() { + let long_val = "a".repeat(300); + ThreadContext::new([0u8; 16], [0u8; 8], &[(0, long_val.as_str())]).attach(); + + let ptr = read_tls_context_ptr(); + assert!(!ptr.is_null()); + let record = unsafe { &*ptr }; + let val_len = record.attrs_data[1] as usize; + assert_eq!(val_len, 255, "value must be capped at 255 bytes"); + assert_eq!(record.attrs_data_size as usize, 2 + 255); + + let _ = ThreadContext::detach(); + } + + // Make sure the C shim is indeed providing a thread-local address. + #[test] + fn tls_slots_are_per_thread() { + use std::sync::{Arc, Barrier}; + + let barrier = Arc::new(Barrier::new(2)); + let b = barrier.clone(); + + let spawned_trace_id = [0xABu8; 16]; + let spawned_span_id = [0xCDu8; 8]; + let main_trace_id = [0x11u8; 16]; + let main_span_id = [0x22u8; 8]; + + let handle = std::thread::spawn(move || { + ThreadContext::new(spawned_trace_id, spawned_span_id, &[]).attach(); + + // Let the main thread attach its own record and verify its slot. + b.wait(); + // Wait for the main thread to finish observing before we verify ours. + b.wait(); + + // The main thread's attach must not have touched this slot. + let ptr = read_tls_context_ptr(); + assert!(!ptr.is_null(), "spawned thread TLS must still be set"); + let record = unsafe { &*ptr }; + assert_eq!(record.trace_id, spawned_trace_id); + assert_eq!(record.span_id, spawned_span_id); + + let _ = ThreadContext::detach(); + assert!(read_tls_context_ptr().is_null()); + }); + + // Wait for the spawned thread to attach its record, then attach our own. 
+ barrier.wait(); + + assert!( + read_tls_context_ptr().is_null(), + "main thread should see a null pointer and not another thread's context" + ); + + ThreadContext::new(main_trace_id, main_span_id, &[]).attach(); + + let ptr = read_tls_context_ptr(); + assert!(!ptr.is_null(), "main thread TLS must be set"); + let record = unsafe { &*ptr }; + assert_eq!(record.trace_id, main_trace_id); + assert_eq!(record.span_id, main_span_id); + + barrier.wait(); + + let _ = ThreadContext::detach(); + assert!(read_tls_context_ptr().is_null()); + + handle.join().unwrap(); + } } } diff --git a/libdd-library-config/src/tls_shim.c b/libdd-library-config/src/tls_shim.c index 4a6d612e03..4b2abd3502 100644 --- a/libdd-library-config/src/tls_shim.c +++ b/libdd-library-config/src/tls_shim.c @@ -1,11 +1,11 @@ -// Unless explicitly stated otherwise all files in this repository are licensed under the Apache License Version 2.0. -// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present Datadog, Inc. +// Copyright 2026-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 // Declares the thread-local pointer that external readers (e.g. the eBPF // profiler) discover via the dynsym table. The Rust layer accesses this // pointer in otel_thread_ctx.rs. // -// The variable is be declared in C in order to use the TLSDESC dialect for +// The variable is declared in C in order to use the TLSDESC dialect for // thread-local storage, which is required by the OTel thread-level context // sharing spec. Unfortunately, it's not possible to have Rust use this dialect // as of today. 
From 577aa448163cb82b9c6e45734ab2f345fb66e1bd Mon Sep 17 00:00:00 2001
From: Yann Hamdaoui
Date: Wed, 25 Mar 2026 14:40:19 +0100
Subject: [PATCH 03/13] chore: exclude test with threads from miri

---
 libdd-library-config/src/otel_thread_ctx.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libdd-library-config/src/otel_thread_ctx.rs b/libdd-library-config/src/otel_thread_ctx.rs
index 475e46c086..e018ab2d9e 100644
--- a/libdd-library-config/src/otel_thread_ctx.rs
+++ b/libdd-library-config/src/otel_thread_ctx.rs
@@ -534,6 +534,7 @@ pub mod linux {
 
         // Make sure the C shim is indeed providing a thread-local address.
         #[test]
+        #[cfg_attr(miri, ignore)]
        fn tls_slots_are_per_thread() {
             use std::sync::{Arc, Barrier};
 

From 95343539061333ef757af959cee29b8c27fc05f3 Mon Sep 17 00:00:00 2001
From: Yann Hamdaoui
Date: Wed, 25 Mar 2026 15:05:24 +0100
Subject: [PATCH 04/13] chore: exclude otel thread ctx tests from miri

---
 libdd-library-config/src/otel_thread_ctx.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libdd-library-config/src/otel_thread_ctx.rs b/libdd-library-config/src/otel_thread_ctx.rs
index e018ab2d9e..404d5dad2a 100644
--- a/libdd-library-config/src/otel_thread_ctx.rs
+++ b/libdd-library-config/src/otel_thread_ctx.rs
@@ -353,6 +353,8 @@ pub mod linux {
     }
 
     #[cfg(test)]
+    // Accessing the TLS through C isn't supported in Miri
+    #[cfg(not(miri))]
     mod tests {
         use super::{ThreadContext, ThreadContextRecord};
 
@@ -534,7 +536,6 @@ pub mod linux {
 
         // Make sure the C shim is indeed providing a thread-local address.
#[test] - #[cfg_attr(miri, ignore)] fn tls_slots_are_per_thread() { use std::sync::{Arc, Barrier}; From 3ed562ab38ed990c8b4dc1bff7fdbddeaddea6d4 Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Wed, 25 Mar 2026 17:09:00 +0100 Subject: [PATCH 05/13] chore: improve comment Apply suggestion from @ivoanjo Co-authored-by: Ivo Anjo --- libdd-library-config/src/otel_thread_ctx.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdd-library-config/src/otel_thread_ctx.rs b/libdd-library-config/src/otel_thread_ctx.rs index 404d5dad2a..482a544a89 100644 --- a/libdd-library-config/src/otel_thread_ctx.rs +++ b/libdd-library-config/src/otel_thread_ctx.rs @@ -57,8 +57,8 @@ pub mod linux { /// reader runs. `valid` is written with `ptr::write_volatile` so the compiler cannot reorder /// its store past the surrounding field writes: /// - /// - The writer sets `valid = 1` *after* all other fields are populated (publish). /// - The writer sets `valid = 0` *before* modifying fields in-place (modify). + /// - The writer sets `valid = 1` *after* all other fields are populated (publish). #[repr(C)] pub struct ThreadContextRecord { /// 128-bit trace identifier; all-zeroes means "no trace". From b5403b96fafc4c7548092d886e20117789635e72 Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Thu, 26 Mar 2026 14:46:44 +0100 Subject: [PATCH 06/13] feat(otel-thread-ctx): volatile -> atomic + fences It turns out the previous implementation used volatile to synchronize with the async-signal-handler reader, but the proper device to use is actually atomics + compiler-only fences. This commit updates the model to do that, and get rid of all the `xxx_volatile` details. 
--- libdd-library-config/src/otel_thread_ctx.rs | 240 +++++++++----------- 1 file changed, 107 insertions(+), 133 deletions(-) diff --git a/libdd-library-config/src/otel_thread_ctx.rs b/libdd-library-config/src/otel_thread_ctx.rs index 482a544a89..a532782bfb 100644 --- a/libdd-library-config/src/otel_thread_ctx.rs +++ b/libdd-library-config/src/otel_thread_ctx.rs @@ -12,8 +12,10 @@ //! //! Readers are constrained to the same thread as the writer and operate like async-signal //! handlers: the writer thread is always stopped while a reader runs. There is thus no -//! cross-thread synchronization concerns. The only hazard is compiler reordering or not committing -//! writes to memory, which is handled using volatile writes. +//! cross-thread synchronization concerns. The only hazard is compiler reordering, which is +//! handled by making `valid` atomic and using compiler-only fences (equivalent to C's +//! `atomic_signal_fence`) to keep field writes boxed between the `valid = 0` and `valid = 1` +//! stores during in-place updates. #[cfg(target_os = "linux")] pub mod linux { @@ -22,24 +24,43 @@ pub mod linux { mem, ops::{Deref, DerefMut}, ptr::{self, NonNull}, + sync::atomic::{compiler_fence, AtomicPtr, AtomicU8, Ordering}, }; extern "C" { /// Return the address of the current thread's `custom_labels_current_set_v2` local. + /// + /// **CAUTION**: do not use this directly, always go through [get_tls_slot] to read and + /// write it atomically. fn libdd_get_custom_labels_current_set_v2() -> *mut *mut c_void; } - /// Return a pointer to the TLS slot holding the context. The address calculation requires a - /// call to a C shim in order to use the TLSDESC dialect from Rust. Note that the returned - /// address is stable (per thread), so the result can be reused . + /// Return an atomic view of the TLS slot. The address calculation requires a call to a C shim + /// in order to use the TLSDESC dialect from Rust. 
The returned address is stable (per thread),
+    /// so the resulting atomic should be reused whenever possible, to reduce the number of calls
+    /// to this function.
     ///
-    /// Note that the address is read by an external process or a signal handler, so it should be
-    /// written to using [std::ptr::write_volatile].
-    #[allow(clippy::missing_safety_doc)]
-    fn get_tls_ptr() -> *mut *mut ThreadContextRecord {
-        // Safety: this is just an FFI call, but there's no particular pre-condition to uphold: the
-        // TLS slot should always be accessible.
-        unsafe { libdd_get_custom_labels_current_set_v2().cast() }
+    /// The slot is read by an async signal handler. Atomic operations should in general use
+    /// [Ordering::Relaxed], but modifications to the record might need additional compiler-only
+    /// fences (see [ThreadContext::update] for an example).
+    fn get_tls_slot<'a>() -> &'a AtomicPtr<ThreadContextRecord> {
+        const {
+            assert!(
+                mem::align_of::<AtomicPtr<ThreadContextRecord>>()
+                    == mem::align_of::<*mut ThreadContextRecord>()
+            )
+        }
+
+        // Safety: the const assertion above ensures the alignment is correct. The TLS slot is
+        // valid for writes during the lifetime of the program.
+        //
+        // We forbid direct usage of `libdd_get_custom_labels_current_set_v2`, which guarantees
+        // that there's never conflicting non-atomic accesses to the TLS slot.
+        unsafe {
+            AtomicPtr::from_ptr(
+                libdd_get_custom_labels_current_set_v2().cast::<*mut ThreadContextRecord>(),
+            )
+        }
     }
 
     /// Maximum size in bytes of the `attrs_data` field.
@@ -48,25 +69,32 @@ pub mod linux {
     /// limit recommended by the spec (the eBPF profiler read limit).
     pub const MAX_ATTRS_DATA_SIZE: usize = 612;
 
-    /// In-memory layout of a thread-level context. The structure MUST match exactly the
-    /// specification.
+    /// In-memory layout of a thread-level context.
+    ///
+    /// **CAUTION**: The structure MUST match exactly the OTel thread-level context specification.
+    /// It is read by external, out-of-process code.
Do not re-order fields or modify in any way, + /// unless you know exactly what you're doing. /// /// # Synchronization /// - /// Readers are async-signal handlers on the same thread; the writer is always stopped while a - /// reader runs. `valid` is written with `ptr::write_volatile` so the compiler cannot reorder - /// its store past the surrounding field writes: + /// Readers are async-signal handlers. The writer is always stopped while a reader runs. + /// Sharing memory with a signal handler still requires some form of synchronization, which is + /// achieved through atomics and compiler fence, using `valid` and/or the TLS slot as + /// synchronization points. /// - /// - The writer sets `valid = 0` *before* modifying fields in-place (modify). - /// - The writer sets `valid = 1` *after* all other fields are populated (publish). + /// - The writer stores `valid = 0` *before* modifying fields in-place, guarded by a fence. + /// - The writer stores `valid = 1` *after* all fields are populated, guarded by a fence. + /// - `valid` starts at `1` on construction and is never set to `0` except during an in-place + /// update. #[repr(C)] pub struct ThreadContextRecord { /// 128-bit trace identifier; all-zeroes means "no trace". pub trace_id: [u8; 16], /// 64-bit span identifier. pub span_id: [u8; 8], - /// Whether the record is ready/consistent. Written with `ptr::write_volatile`. - pub valid: u8, + /// Whether the record is ready/consistent. Always set to `1` except during in-place update + /// of the current record. + pub valid: AtomicU8, pub _reserved: u8, /// Number of populated bytes in `attrs_data`. pub attrs_data_size: u16, @@ -99,58 +127,6 @@ pub mod linux { record } - /// Write `value` to `self.valid` using a volatile store, preventing the compiler from - /// eliminating or reordering or this write past other volatile writes to the record. 
- fn write_valid_volatile(&mut self, value: u8) { - // Safety: we have an exclusive borrow of `self`, so `valid` is live and valid for - // writes. - unsafe { ptr::write_volatile(&mut self.valid as *mut u8, value) } - } - - /// Write `trace_id` to `self.trace_id` using a volatile store, preventing the compiler - /// from eliminating or reordering this write past other volatile writes to the record. - pub fn write_trace_id_volatile(&mut self, trace_id: [u8; 16]) { - // Safety: we have an exclusive borrow of `self`, so `trace_id` is live and valid for - // writes. - unsafe { ptr::write_volatile(&mut self.trace_id, trace_id) } - } - - /// Write `span_id` to `self.span_id` using a volatile store, preventing the compiler from - /// eliminating or reordering this write past other volatile writes to the record. - pub fn write_span_id_volatile(&mut self, span_id: [u8; 8]) { - // Safety: we have an exclusive borrow of `self`, so `span_id` is live and valid for - // writes. - unsafe { ptr::write_volatile(&mut self.span_id, span_id) } - } - - /// Write a single byte of the encoded attributes using a volatile store, preventing the - /// compiler from eliminating or reordering or this write past other volatile writes to the - /// record. - /// - /// # Panic - /// - /// Panics if the offset is out of bound. - fn write_attrs_volatile(&mut self, offset: usize, value: u8) { - // Safety: we have an exclusive borrow of `self`, so `attrs_data` is live and valid for - // writes. - unsafe { ptr::write_volatile(&mut self.attrs_data[offset] as *mut u8, value) } - } - - /// Copy `src` into the encoded attributes starting at `offset` using volatile stores, - /// preventing the compiler from eliminating or reordering or this write past other - /// volatile writes to the record. - /// - /// # Panic - /// - /// Panics if `offset + src.len() > self.attrs_data.len()`. 
- fn copy_attrs_volatile(&mut self, src: &[u8], offset: usize) { - // Safety: we have an exclusive borrow of `self`, so `attrs_data` is live and valid for - // writes. - for (idx, byte) in src.iter().enumerate() { - unsafe { ptr::write_volatile(&mut self.attrs_data[offset + idx] as *mut u8, *byte) } - } - } - /// Encode `attributes` into `record.attrs_data` as packed key-value records. Existing data /// are overridden (and if there were more entires than `attributes.len()`, they aren't /// zeroed, but they will be ignored by readers). @@ -167,15 +143,6 @@ pub mod linux { /// recovery would require us to be able to rollback to the previous attributes which would /// hurt the happy path, or leave the record in a inconsistent state. Another possibility /// would be to error out and reset the record in that situation. - /// - /// # Memory - /// - /// Disclaimer: This note is for internal usage only. As a consumer of this library, please - /// use [ThreadContext::modify] to mutate the currently attached record. - /// - /// Writes are volatile, making `set_attrs` suitable for mutating an attached record - /// in-place (TODO: does this impact the performance of writing to a non-attached record? - /// Should we have two specialized version `set_attrs` and `set_attrs_volatile`?). 
pub fn set_attrs(&mut self, attributes: &[(u8, &str)]) { let mut offset = 0; @@ -188,10 +155,11 @@ pub mod linux { break; } - self.write_attrs_volatile(offset, key_index); + self.attrs_data[offset] = key_index; // `val_len <= 255` thanks to the `min()` - self.write_attrs_volatile(offset + 1, val_len as u8); - self.copy_attrs_volatile(&val_bytes[..val_len], offset + 2); + self.attrs_data[offset + 1] = val_len as u8; + self.attrs_data[offset + 2..offset + 2 + val_len] + .copy_from_slice(&val_bytes[..val_len]); offset += entry_size; } @@ -205,7 +173,7 @@ pub mod linux { Self { trace_id: [0u8; 16], span_id: [0u8; 8], - valid: 0, + valid: AtomicU8::new(1), _reserved: 0, attrs_data_size: 0, attrs_data: [0u8; MAX_ATTRS_DATA_SIZE], @@ -215,7 +183,7 @@ pub mod linux { /// An owned (and non-moving) thread context record allocation. /// - /// We don't use `Box` under the hood because it excludes aliasing, while we share the context + /// We don't use `Box` under the hood because it precludes aliasing, while we share the context /// to readers through thread-level context and through the FFI. But it is a boxed /// `ThreadContextRecord` for all intent of purpose. pub struct ThreadContext(NonNull); @@ -230,14 +198,12 @@ pub mod linux { /// The pointer must be reconstructed through [`Self::from_raw`] in order to be properly /// dropped, or the record will leak. pub fn into_raw(self) -> *mut ThreadContextRecord { - use mem::ManuallyDrop; - - let mdrop = ManuallyDrop::new(self); + let mdrop = mem::ManuallyDrop::new(self); mdrop.0.as_ptr() } - /// Reconstruct a [ThreadContextRecord] from a raw pointer to `ThreadContextRecord` that is - /// either `null` or comes from [`Self::into_raw`]. Return `None` if `ptr` is null. + /// Reconstruct a [ThreadContextRecord] from a raw pointer that is either `null` or comes + /// from [`Self::into_raw`]. Return `None` if `ptr` is null. 
         ///
         /// # Safety
         ///
@@ -265,62 +231,68 @@
     }
 
     impl ThreadContext {
-        /// Swap the current context with a pointer value. Return the previously attached context,
-        /// if any.
+        /// Atomically swap the current context with a pointer value. Return the previously
+        /// attached context, if any.
         fn swap(
-            slot: *mut *mut ThreadContextRecord,
+            slot: &AtomicPtr<ThreadContextRecord>,
             tgt: *mut ThreadContextRecord,
         ) -> Option<ThreadContext> {
-            unsafe {
-                // Safety: TLS slot is always live and valid for reads and writes.
-                let prev = ThreadContext::from_raw(*slot);
-                ptr::write_volatile(slot, tgt);
-                // Safety: a non-null value in the slot came from a prior `into_raw` call.
-                prev
-            }
+            // Safety: a non-null value in the slot came from a prior `into_raw` call.
+            unsafe { ThreadContext::from_raw(slot.swap(tgt, Ordering::Relaxed)) }
        }
 
         /// Publish a new (or previously detached) thread context record by writing its pointer
-        /// into the TLS slot. Sets `valid = 1` before publishing. Returns the previously attached
-        /// context, if any.
-        pub fn attach(mut self) -> Option<ThreadContext> {
-            let slot = get_tls_ptr();
-            // Set `valid = 1` written before the TLS pointer is updated, so any reader that
-            // observes the new pointer also observes `valid = 1`.
-            self.write_valid_volatile(1);
-            Self::swap(slot, self.into_raw())
+        /// into the TLS slot. Returns the previously attached context, if any.
+        ///
+        /// `valid` is already `1` since construction, so any reader that observes the new pointer
+        /// also observes `valid = 1`.
+        pub fn attach(self) -> Option<ThreadContext> {
+            // [^tls-slot-ordering]: since we get back the previous context, we should in principle
+            // use an `Acquire` compiler fence to make sure we don't get back a not-yet-initialized
+            // record.
+            //
+            // However, this thread (excluding the reader signal handler) is the only one to ever
+            // _write_ to the context, so the store we load the value from automatically
+            // happens-before (because it's sequenced-before) the swap.
+ Self::swap(get_tls_slot(), self.into_raw()) } - /// Modify the currently attached record in-place. Sets `valid = 0` before the update and + /// Update the currently attached record in-place. Sets `valid = 0` before the update and /// `valid = 1` after, so a reader that fires between the two writes sees an inconsistent - /// record and skips it. + /// record and skips it. Compiler fences prevent the compiler from reordering field writes + /// outside that window. /// - /// If there's currently no attached context, `modify` will create one, and is in this case + /// If there's currently no attached context, `update` will create one, and is in this case /// equivalent to `ThreadContext::new(trace_id, span_id, attrs).attach()`. pub fn update(trace_id: [u8; 16], span_id: [u8; 8], attrs: &[(u8, &str)]) { - let slot = get_tls_ptr(); + let slot = get_tls_slot(); - // Safety: the tls slot is always valid for reads and writes. - if let Some(current) = unsafe { (*slot).as_mut() } { - current.write_valid_volatile(0); - current.write_trace_id_volatile(trace_id); - current.write_span_id_volatile(span_id); + if let Some(current) = unsafe { slot.load(Ordering::Relaxed).as_mut() } { + current.valid.store(0, Ordering::Relaxed); + compiler_fence(Ordering::SeqCst); + + current.trace_id = trace_id; + current.span_id = span_id; current.set_attrs(attrs); - current.write_valid_volatile(1); + + compiler_fence(Ordering::SeqCst); + current.valid.store(1, Ordering::Relaxed); } else { - let mut ctx = ThreadContext::new(trace_id, span_id, attrs); - ctx.write_valid_volatile(1); - let _ = Self::swap(slot, ctx.into_raw()); + // No need for `AcqRel`, see [^tls-slot-ordering]. + compiler_fence(Ordering::Release); + // `ThreadContext::new` already initialises `valid = 1`. + let _ = Self::swap( + slot, + ThreadContext::new(trace_id, span_id, attrs).into_raw(), + ); } } - /// Detach the current record from the TLS slot. 
Writes null to the slot, sets `valid = 0` - /// on the detached record, and returns it. + /// Detach the current record from the TLS slot. Writes null to the slot and returns the + /// detached record. pub fn detach() -> Option { - Self::swap(get_tls_ptr(), ptr::null_mut()).map(|mut ctx| { - ctx.write_valid_volatile(0); - ctx - }) + // We don't need any fence here, see [^tls-slot-ordering]. + Self::swap(get_tls_slot(), ptr::null_mut()) } } @@ -356,12 +328,14 @@ pub mod linux { // Accessing the TLS through C isn't supported in Miri #[cfg_attr(miri, ignore)] mod tests { + use std::sync::atomic::Ordering; + use super::{ThreadContext, ThreadContextRecord}; /// Read the TLS pointer for the current thread (the value stored in the TLS slot, not the /// address of the slot itself). fn read_tls_context_ptr() -> *const ThreadContextRecord { - unsafe { *super::get_tls_ptr() as *const ThreadContextRecord } + super::get_tls_slot().load(Ordering::Relaxed) } #[test] @@ -409,7 +383,7 @@ pub mod linux { let record = unsafe { &*ptr }; assert_eq!(record.trace_id, trace_id); assert_eq!(record.span_id, span_id); - assert_eq!(record.valid, 1); + assert_eq!(record.valid.load(Ordering::Relaxed), 1); assert_eq!(record.attrs_data_size, 0); let _ = ThreadContext::detach(); @@ -481,7 +455,7 @@ pub mod linux { let record = unsafe { &*ptr_before }; assert_eq!(record.trace_id, trace_id_1); assert_eq!(record.span_id, span_id_1); - assert_eq!(record.valid, 1); + assert_eq!(record.valid.load(Ordering::Relaxed), 1); assert_eq!(record.attrs_data[0], 0); assert_eq!(record.attrs_data[1], 2); assert_eq!(&record.attrs_data[2..4], b"v1"); @@ -497,7 +471,7 @@ pub mod linux { let record = unsafe { &*ptr_after }; assert_eq!(record.trace_id, trace_id_2); assert_eq!(record.span_id, span_id_2); - assert_eq!(record.valid, 1); + assert_eq!(record.valid.load(Ordering::Relaxed), 1); assert_eq!(record.attrs_data[0], 0); assert_eq!(record.attrs_data[1], 2); assert_eq!(&record.attrs_data[2..4], b"v2"); From 
6802b1de2b3fe596ec8845d2c07da6b8af10cc2f Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Thu, 26 Mar 2026 14:55:05 +0100 Subject: [PATCH 07/13] chore: indicative return value for set_attrs --- libdd-library-config/src/otel_thread_ctx.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/libdd-library-config/src/otel_thread_ctx.rs b/libdd-library-config/src/otel_thread_ctx.rs index a532782bfb..d95b2a677d 100644 --- a/libdd-library-config/src/otel_thread_ctx.rs +++ b/libdd-library-config/src/otel_thread_ctx.rs @@ -131,6 +131,11 @@ pub mod linux { /// are overridden (and if there were more entires than `attributes.len()`, they aren't /// zeroed, but they will be ignored by readers). /// + /// # Return + /// + /// Returns `true` if all attributes were properly encoded, or `false` if some of the data + /// needed to be dropped. See Size limits below. + /// /// # Arguments /// /// Each input entry is a pair of a 1-byte `key_index` and a string value. @@ -143,15 +148,23 @@ pub mod linux { /// recovery would require us to be able to rollback to the previous attributes which would /// hurt the happy path, or leave the record in a inconsistent state. Another possibility /// would be to error out and reset the record in that situation. 
- pub fn set_attrs(&mut self, attributes: &[(u8, &str)]) { + pub fn set_attrs(&mut self, attributes: &[(u8, &str)]) -> bool { let mut offset = 0; + let mut fully_encoded = true; for &(key_index, val) in attributes { let val_bytes = val.as_bytes(); - let val_len = val_bytes.len().min(255); + let val_len = val_bytes.len(); + let val_len = if val_len > 255 { + fully_encoded = false; + 255 + } else { + val_len + }; let entry_size = 2 + val_len; if offset + entry_size > MAX_ATTRS_DATA_SIZE { + fully_encoded = false; break; } @@ -165,6 +178,7 @@ pub mod linux { // `offset < MAX_ATTRS_DATA_SIZE`, which guarantees it fits in a `u16` self.attrs_data_size = offset as u16; + fully_encoded } } From c72399eae5f9f3010dfb5dda5eac8bfb6e164ff7 Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Thu, 26 Mar 2026 15:05:02 +0100 Subject: [PATCH 08/13] chore: insist on ctx not being thread-safe --- libdd-library-config/src/otel_thread_ctx.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libdd-library-config/src/otel_thread_ctx.rs b/libdd-library-config/src/otel_thread_ctx.rs index d95b2a677d..26f5028d36 100644 --- a/libdd-library-config/src/otel_thread_ctx.rs +++ b/libdd-library-config/src/otel_thread_ctx.rs @@ -200,6 +200,9 @@ pub mod linux { /// We don't use `Box` under the hood because it precludes aliasing, while we share the context /// to readers through thread-level context and through the FFI. But it is a boxed /// `ThreadContextRecord` for all intent of purpose. + /// + /// The context is `!Send` and `!Sync`; it is supposed to stay on the same thread and is thus + /// not thread-safe. 
pub struct ThreadContext(NonNull); impl ThreadContext { From 2e4a5dae92fce2923a7f690f4ce0da48cd926318 Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Thu, 26 Mar 2026 15:12:47 +0100 Subject: [PATCH 09/13] chore: make ThreadContextRecord private --- libdd-library-config/src/otel_thread_ctx.rs | 57 ++++++++------------- 1 file changed, 21 insertions(+), 36 deletions(-) diff --git a/libdd-library-config/src/otel_thread_ctx.rs b/libdd-library-config/src/otel_thread_ctx.rs index 26f5028d36..6ca2830d44 100644 --- a/libdd-library-config/src/otel_thread_ctx.rs +++ b/libdd-library-config/src/otel_thread_ctx.rs @@ -22,7 +22,6 @@ pub mod linux { use std::{ ffi::c_void, mem, - ops::{Deref, DerefMut}, ptr::{self, NonNull}, sync::atomic::{compiler_fence, AtomicPtr, AtomicU8, Ordering}, }; @@ -87,17 +86,17 @@ pub mod linux { /// - `valid` starts at `1` on construction and is never set to `0` except during an in-place /// update. #[repr(C)] - pub struct ThreadContextRecord { + struct ThreadContextRecord { /// 128-bit trace identifier; all-zeroes means "no trace". - pub trace_id: [u8; 16], + trace_id: [u8; 16], /// 64-bit span identifier. - pub span_id: [u8; 8], + span_id: [u8; 8], /// Whether the record is ready/consistent. Always set to `1` except during in-place update /// of the current record. - pub valid: AtomicU8, - pub _reserved: u8, + valid: AtomicU8, + _reserved: u8, /// Number of populated bytes in `attrs_data`. - pub attrs_data_size: u16, + attrs_data_size: u16, /// Packed variable-length key-value records. /// /// It's a contiguous list of blocks with layout: @@ -112,7 +111,7 @@ pub mod linux { /// hundred bytes per thread, but it guarantees that we can modify the context in-place /// without (re)allocation in the hot path. Having a hybrid scheme (starting smaller and /// resizing up a few times) is not out of the question. 
- pub attrs_data: [u8; MAX_ATTRS_DATA_SIZE], + attrs_data: [u8; MAX_ATTRS_DATA_SIZE], } impl ThreadContextRecord { @@ -187,6 +186,7 @@ pub mod linux { Self { trace_id: [0u8; 16], span_id: [0u8; 8], + // We only ever set `valid` to `0` during in-place update of an attached context. valid: AtomicU8::new(1), _reserved: 0, attrs_data_size: 0, @@ -214,7 +214,7 @@ pub mod linux { /// Turn this thread context into a raw pointer to the underlying [ThreadContextRecord]. /// The pointer must be reconstructed through [`Self::from_raw`] in order to be properly /// dropped, or the record will leak. - pub fn into_raw(self) -> *mut ThreadContextRecord { + fn into_raw(self) -> *mut ThreadContextRecord { let mdrop = mem::ManuallyDrop::new(self); mdrop.0.as_ptr() } @@ -229,7 +229,7 @@ pub mod linux { /// calls on the returned [ThreadContextRecord]. More precisely, mutable references might /// be reconstructed during those calls, so any constraint from either Stacked Borrows, /// Tree Borrows or whatever is the current aliasing model implemented in Miri applies. - pub unsafe fn from_raw(ptr: *mut ThreadContextRecord) -> Option { + unsafe fn from_raw(ptr: *mut ThreadContextRecord) -> Option { NonNull::new(ptr).map(Self) } } @@ -323,24 +323,6 @@ pub mod linux { } } - impl Deref for ThreadContext { - type Target = ThreadContextRecord; - - fn deref(&self) -> &Self::Target { - // Safety: `ThreadContext` represents ownership of a valid, alive - // `ThreadContextRecord`. - unsafe { self.0.as_ref() } - } - } - - impl DerefMut for ThreadContext { - fn deref_mut(&mut self) -> &mut Self::Target { - // Safety: `ThreadContext` represents ownership of a valid, alive - // `ThreadContextRecord`. 
-            unsafe { self.0.as_mut() }
-        }
-    }
-
     #[cfg(test)]
     // Accessing the TLS through C isn't supported in Miri
     #[cfg_attr(miri, ignore)]
     mod tests {
@@ -371,14 +353,17 @@
             );
 
             let prev = ThreadContext::detach().unwrap();
-            assert!(
-                prev.trace_id == trace_id,
-                "got back a different trace_id than attached"
-            );
-            assert!(
-                prev.span_id == span_id,
-                "got back a different span_id than attached"
-            );
+
+            unsafe {
+                assert!(
+                    prev.0.as_ref().trace_id == trace_id,
+                    "got back a different trace_id than attached"
+                );
+                assert!(
+                    prev.0.as_ref().span_id == span_id,
+                    "got back a different span_id than attached"
+                );
+            }
 
             assert!(
                 read_tls_context_ptr().is_null(),

From 98d5cad665cc524a2d210a2ce60ee10477e4f9f3 Mon Sep 17 00:00:00 2001
From: Yann Hamdaoui
Date: Fri, 27 Mar 2026 11:28:31 +0100
Subject: [PATCH 10/13] chore: assert total size of thread ctx is the expected one

---
 libdd-library-config/src/otel_thread_ctx.rs | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/libdd-library-config/src/otel_thread_ctx.rs b/libdd-library-config/src/otel_thread_ctx.rs
index 6ca2830d44..a162b3efca 100644
--- a/libdd-library-config/src/otel_thread_ctx.rs
+++ b/libdd-library-config/src/otel_thread_ctx.rs
@@ -85,11 +85,16 @@ pub mod linux {
     /// - The writer stores `valid = 1` *after* all fields are populated, guarded by a fence.
     /// - `valid` starts at `1` on construction and is never set to `0` except during an in-place
     /// update.
+    // Note: we don't need to make this struct packed, because it's already designed to avoid
+    // padding. Moreover, doing so would make it 1-aligned, potentially making access to
+    // `attrs_data_size` unaligned and thus slower, and prevent us from using `AtomicU8` for
+    // `valid`. We just use a const assertion in `new()` to avoid surprises and make sure this
+    // struct has the right total size.
     #[repr(C)]
     struct ThreadContextRecord {
-        /// 128-bit trace identifier; all-zeroes means "no trace".
+        /// Trace identifier; all-zeroes means "no trace".
         trace_id: [u8; 16],
-        /// 64-bit span identifier.
+        /// Span identifier.
         span_id: [u8; 8],
         /// Whether the record is ready/consistent. Always set to `1` except during in-place update
         /// of the current record.
@@ -117,6 +122,8 @@
     impl ThreadContextRecord {
         /// Build a record with the given trace id, span id and attributes.
         pub fn new(trace_id: [u8; 16], span_id: [u8; 8], attrs: &[(u8, &str)]) -> Self {
+            const { assert!(size_of::<ThreadContextRecord>() == 640) }
+
             let mut record = Self {
                 trace_id,
                 span_id,

From 113036cce881feddde22c194e433415a48a2f679 Mon Sep 17 00:00:00 2001
From: Yann Hamdaoui
Date: Fri, 27 Mar 2026 11:30:13 +0100
Subject: [PATCH 11/13] chore: add zeroing precision in comment

---
 libdd-library-config/src/otel_thread_ctx.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libdd-library-config/src/otel_thread_ctx.rs b/libdd-library-config/src/otel_thread_ctx.rs
index a162b3efca..15ef466c23 100644
--- a/libdd-library-config/src/otel_thread_ctx.rs
+++ b/libdd-library-config/src/otel_thread_ctx.rs
@@ -182,7 +182,9 @@ pub mod linux {
                 offset += entry_size;
             }
 
-            // `offset < MAX_ATTRS_DATA_SIZE`, which guarantees it fits in a `u16`
+            // `offset < MAX_ATTRS_DATA_SIZE`, which guarantees it fits in a `u16`. This also
+            // effectively hides the remainder of the previous `attrs` bytes, so we don't have to
+            // zero them.
            self.attrs_data_size = offset as u16;
             fully_encoded
         }

From e8655c15423d5c158d3dc949a2c2b3b0c4043536 Mon Sep 17 00:00:00 2001
From: Yann Hamdaoui
Date: Fri, 27 Mar 2026 16:55:54 +0100
Subject: [PATCH 12/13] chore: add example usage of otel thread ctx

---
 libdd-library-config/src/otel_thread_ctx.rs | 47 +++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/libdd-library-config/src/otel_thread_ctx.rs b/libdd-library-config/src/otel_thread_ctx.rs
index 15ef466c23..f14d2fda34 100644
--- a/libdd-library-config/src/otel_thread_ctx.rs
+++ b/libdd-library-config/src/otel_thread_ctx.rs
@@ -8,6 +8,53 @@
 //! Since `rustc` doesn't currently support the TLSDESC dialect, we use a C shim to set and get the
 //! thread-local storage used for the context.
 //!
+//! ## Usage
+//!
+//! There are two main patterns for publishing and updating thread contexts.
+//!
+//! ### In-place update
+//!
+//! The simplest pattern, when applicable, is to attach one record and then mutate it in place.
+//! This avoids allocation in the hot path.
+//!
+//! ```ignore
+//! #use libdd_library_config::otel_thread_ctx::linux::ThreadContext;
+//!
+//! let trace_id = [0u8; 16];
+//! let span_id = [1u8; 8];
+//!
+//! // First call allocates a record and attaches it.
+//! ThreadContext::new(trace_id, span_id, &[(0, "first")]).attach();
+//! ThreadContext::update(trace_id, span_id, &[(0, "second")]);
+//! ThreadContext::detach();
+//! ```
+//!
+//! ### Swapping
+//!
+//! Swapping can be used when it's beneficial to pre-allocate or keep around a bunch of contexts to
+//! be saved and restored repeatedly. Could be the case with async-runtimes where several tasks
+//! might run on the same thread, or even move from one thread to another, for example.
+//!
+//! ```ignore
+//! #use libdd_library_config::otel_thread_ctx::linux::ThreadContext;
+//!
+//! let trace_id = [0u8; 16];
+//! let span_id = [1u8; 8];
+//! let attrs: &[(u8, &str)] = &[(0, "GET"), (1, "/api/v1")];
+//!
+//! 
// Publish a new context and save the previously attached one (if any). +//! let ctx = ThreadContext::new(trace_id, span_id, attrs); +//! let previous = ctx.attach(); +//! +//! // ... do work inside the span ... +//! +//! // Restore the previous context: detach the current one and re-attach the saved one. +//! if let Some(prev) = previous { +//! // here we drop `ctx`, but we could store it for later use +//! let _ = prev.attach(); +//! } +//! ``` +//! //! ## Synchronization //! //! Readers are constrained to the same thread as the writer and operate like async-signal From ce8e143885a0daf6bd8273f7a1e98e6056493308 Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Fri, 27 Mar 2026 17:04:29 +0100 Subject: [PATCH 13/13] refactor: move otel_thread_ctx to libdd-profiling The module and its C shim (tls_shim.c) logically belong in the profiling crate rather than library-config. Move otel_thread_ctx.rs and tls_shim.c to libdd-profiling/src/, wire up the cc build step in its build.rs, and restore libdd-library-config to its pre-PR state. 
Co-Authored-By: Claude Sonnet 4.6 --- Cargo.lock | 2 +- libdd-library-config/Cargo.toml | 3 --- libdd-library-config/build.rs | 20 ------------------- libdd-library-config/src/lib.rs | 1 - libdd-profiling/Cargo.toml | 1 + libdd-profiling/build.rs | 16 +++++++++++++++ libdd-profiling/src/lib.rs | 1 + .../src/otel_thread_ctx.rs | 0 .../src/tls_shim.c | 0 9 files changed, 19 insertions(+), 25 deletions(-) delete mode 100644 libdd-library-config/build.rs rename {libdd-library-config => libdd-profiling}/src/otel_thread_ctx.rs (100%) rename {libdd-library-config => libdd-profiling}/src/tls_shim.c (100%) diff --git a/Cargo.lock b/Cargo.lock index b29f9599ba..1d3de5f9aa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3039,7 +3039,6 @@ name = "libdd-library-config" version = "1.1.0" dependencies = [ "anyhow", - "cc", "libdd-trace-protobuf", "memfd", "prost", @@ -3105,6 +3104,7 @@ dependencies = [ "bolero", "byteorder", "bytes", + "cc", "chrono", "criterion", "crossbeam-channel", diff --git a/libdd-library-config/Cargo.toml b/libdd-library-config/Cargo.toml index ae287abf22..8c2f2e35a8 100644 --- a/libdd-library-config/Cargo.toml +++ b/libdd-library-config/Cargo.toml @@ -26,9 +26,6 @@ rmp-serde = "1.3.0" libdd-trace-protobuf = { version = "3.0.0", path = "../libdd-trace-protobuf" } -[build-dependencies] -cc = "1.1.31" - [dev-dependencies] tempfile = { version = "3.3" } serial_test = "3.2" diff --git a/libdd-library-config/build.rs b/libdd-library-config/build.rs deleted file mode 100644 index 4fa8078d52..0000000000 --- a/libdd-library-config/build.rs +++ /dev/null @@ -1,20 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed under the Apache -// License Version 2.0. This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present Datadog, Inc. - -fn main() { - // Only compile the TLS shim on Linux; the thread-level context feature is Linux-only. 
- #[cfg(target_os = "linux")] - { - let mut build = cc::Build::new(); - - // - On aarch64, TLSDESC is already the only dynamic TLS model so no flag is needed. - // - On x86-64, we use `-mtls-dialect=gnu2` (supported since GCC 4.4 and Clang 19+) to force - // the use of TLSDESC as mandated by the spec. If it's not supported, this build will - // fail. - #[cfg(target_arch = "x86_64")] - build.flag("-mtls-dialect=gnu2"); - - build.file("src/tls_shim.c").compile("tls_shim"); - println!("cargo:rerun-if-changed=src/tls_shim.c"); - } -} diff --git a/libdd-library-config/src/lib.rs b/libdd-library-config/src/lib.rs index 7262511250..f243678c09 100644 --- a/libdd-library-config/src/lib.rs +++ b/libdd-library-config/src/lib.rs @@ -1,7 +1,6 @@ // Copyright 2021-Present Datadog, Inc. https://www.datadoghq.com/ // SPDX-License-Identifier: Apache-2.0 pub mod otel_process_ctx; -pub mod otel_thread_ctx; pub mod tracer_metadata; use std::borrow::Cow; diff --git a/libdd-profiling/Cargo.toml b/libdd-profiling/Cargo.toml index 731bef4ff5..37db386fcb 100644 --- a/libdd-profiling/Cargo.toml +++ b/libdd-profiling/Cargo.toml @@ -81,4 +81,5 @@ strum = { version = "0.26", features = ["derive"] } tempfile = "3" [build-dependencies] +cc = "1.1.31" cxx-build = { version = "1.0", optional = true } diff --git a/libdd-profiling/build.rs b/libdd-profiling/build.rs index b32dff0f98..6a67b55d82 100644 --- a/libdd-profiling/build.rs +++ b/libdd-profiling/build.rs @@ -11,4 +11,20 @@ fn main() { println!("cargo:rerun-if-changed=src/cxx.rs"); } + + // Only compile the TLS shim on Linux; the thread-level context feature is Linux-only. + #[cfg(target_os = "linux")] + { + let mut build = cc::Build::new(); + + // - On aarch64, TLSDESC is already the only dynamic TLS model so no flag is needed. + // - On x86-64, we use `-mtls-dialect=gnu2` (supported since GCC 4.4 and Clang 19+) to force + // the use of TLSDESC as mandated by the spec. If it's not supported, this build will + // fail. 
+ #[cfg(target_arch = "x86_64")] + build.flag("-mtls-dialect=gnu2"); + + build.file("src/tls_shim.c").compile("tls_shim"); + println!("cargo:rerun-if-changed=src/tls_shim.c"); + } } diff --git a/libdd-profiling/src/lib.rs b/libdd-profiling/src/lib.rs index b9d7c1df1a..bf02565b90 100644 --- a/libdd-profiling/src/lib.rs +++ b/libdd-profiling/src/lib.rs @@ -14,5 +14,6 @@ pub mod cxx; pub mod exporter; pub mod internal; pub mod iter; +pub mod otel_thread_ctx; pub mod pprof; pub mod profiles; diff --git a/libdd-library-config/src/otel_thread_ctx.rs b/libdd-profiling/src/otel_thread_ctx.rs similarity index 100% rename from libdd-library-config/src/otel_thread_ctx.rs rename to libdd-profiling/src/otel_thread_ctx.rs diff --git a/libdd-library-config/src/tls_shim.c b/libdd-profiling/src/tls_shim.c similarity index 100% rename from libdd-library-config/src/tls_shim.c rename to libdd-profiling/src/tls_shim.c