Skip to content

Commit 96a3e84

Browse files
committed
Reuse CCCL TMA descriptor construction for tiled TensorMap and keep validated views alive to avoid DLPack-backed pointer lifetime hazards.
Add explicit tiled element-stride coverage and acknowledge the DLPack include-layout compatibility follow-up in NVIDIA/cccl#7871. Made-with: Cursor
1 parent bdf39a2 commit 96a3e84

9 files changed

Lines changed: 497 additions & 85 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ __pycache__/
1414
!*_impl.cpp
1515
!cuda_bindings/cuda/bindings/_lib/param_packer.cpp
1616
!cuda_bindings/cuda/bindings/_bindings/loader.cpp
17+
!cuda_core/cuda/core/_cpp/*.cpp
1718
cache_driver
1819
cache_runtime
1920
cache_nvrtc

cuda_core/cuda/core/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@
6060
StridedMemoryView,
6161
args_viewable_as_strided_memory,
6262
)
63-
<<<<<<< HEAD
6463
from cuda.core._module import Kernel, ObjectCode
6564
from cuda.core._program import Program, ProgramOptions
6665
from cuda.core._stream import (
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#include "tensor_map_cccl.h"
6+
7+
#include <string.h>
8+
9+
#include <algorithm>
10+
#include <exception>
11+
12+
#if defined(__has_include)
13+
# if __has_include(<cuda/tma>)
14+
# include <cuda/tma>
15+
# define CUDA_CORE_HAS_CUDA_TMA 1
16+
# else
17+
# define CUDA_CORE_HAS_CUDA_TMA 0
18+
# endif
19+
# if __has_include(<dlpack/dlpack.h>)
20+
# include <dlpack/dlpack.h>
21+
# define CUDA_CORE_HAS_DLPACK_H 1
22+
# else
23+
# define CUDA_CORE_HAS_DLPACK_H 0
24+
# endif
25+
#else
26+
# define CUDA_CORE_HAS_CUDA_TMA 0
27+
# define CUDA_CORE_HAS_DLPACK_H 0
28+
#endif
29+
30+
// Copy `msg` into the caller-provided buffer (err, err_cap-style pair),
// truncating to fit and always NUL-terminating the result.
// A NULL/zero-capacity buffer makes this a no-op; a NULL `msg` clears the buffer.
static inline void cuda_core_write_err(char* err, size_t cap, const char* msg) noexcept
{
    if (err == nullptr || cap == 0)
        return;

    size_t written = 0;
    if (msg != nullptr)
    {
        // Copy at most cap-1 characters so the terminator always fits.
        while (written + 1 < cap && msg[written] != '\0')
        {
            err[written] = msg[written];
            ++written;
        }
    }
    err[written] = '\0';
}
45+
46+
int cuda_core_cccl_make_tma_descriptor_tiled(
47+
void* out_tensor_map,
48+
void* data,
49+
int device_type,
50+
int device_id,
51+
int ndim,
52+
const int64_t* shape,
53+
const int64_t* strides,
54+
uint8_t dtype_code,
55+
uint8_t dtype_bits,
56+
uint16_t dtype_lanes,
57+
const int* box_sizes,
58+
const int* elem_strides,
59+
int interleave_layout,
60+
int swizzle,
61+
int l2_fetch_size,
62+
int oob_fill,
63+
char* err,
64+
size_t err_cap) noexcept
65+
{
66+
#if !(CUDA_CORE_HAS_CUDA_TMA && CUDA_CORE_HAS_DLPACK_H)
67+
(void)out_tensor_map;
68+
(void)data;
69+
(void)device_type;
70+
(void)device_id;
71+
(void)ndim;
72+
(void)shape;
73+
(void)strides;
74+
(void)dtype_code;
75+
(void)dtype_bits;
76+
(void)dtype_lanes;
77+
(void)box_sizes;
78+
(void)elem_strides;
79+
(void)interleave_layout;
80+
(void)swizzle;
81+
(void)l2_fetch_size;
82+
(void)oob_fill;
83+
cuda_core_write_err(err, err_cap, "CCCL <cuda/tma> and/or <dlpack/dlpack.h> not available at build time");
84+
return 1;
85+
#else
86+
try
87+
{
88+
if (!out_tensor_map)
89+
{
90+
cuda_core_write_err(err, err_cap, "out_tensor_map is NULL");
91+
return 1;
92+
}
93+
if (!data)
94+
{
95+
cuda_core_write_err(err, err_cap, "tensor data pointer is NULL");
96+
return 1;
97+
}
98+
if (!shape || !box_sizes || ndim <= 0)
99+
{
100+
cuda_core_write_err(err, err_cap, "invalid rank/shape/box_sizes");
101+
return 1;
102+
}
103+
104+
DLTensor t{};
105+
t.data = data;
106+
t.device = {static_cast<DLDeviceType>(device_type), device_id};
107+
t.ndim = ndim;
108+
t.dtype.code = dtype_code;
109+
t.dtype.bits = dtype_bits;
110+
t.dtype.lanes = dtype_lanes;
111+
// CCCL promises not to mutate the arrays, but DLPack uses non-const pointers.
112+
t.shape = const_cast<int64_t*>(shape);
113+
t.strides = const_cast<int64_t*>(strides);
114+
t.byte_offset = 0;
115+
116+
const auto layout = static_cast<cuda::tma_interleave_layout>(interleave_layout);
117+
const auto swz = static_cast<cuda::tma_swizzle>(swizzle);
118+
const auto l2 = static_cast<cuda::tma_l2_fetch_size>(l2_fetch_size);
119+
const auto oob = static_cast<cuda::tma_oob_fill>(oob_fill);
120+
121+
auto box = cuda::std::span<const int>(box_sizes, static_cast<size_t>(ndim));
122+
123+
CUtensorMap desc{};
124+
if (elem_strides)
125+
{
126+
auto es = cuda::std::span<const int>(elem_strides, static_cast<size_t>(ndim));
127+
desc = cuda::make_tma_descriptor(t, box, es, layout, swz, l2, oob);
128+
}
129+
else
130+
{
131+
desc = cuda::make_tma_descriptor(t, box, layout, swz, l2, oob);
132+
}
133+
134+
::memcpy(out_tensor_map, &desc, sizeof(CUtensorMap));
135+
cuda_core_write_err(err, err_cap, nullptr);
136+
return 0;
137+
}
138+
catch (const std::exception& e)
139+
{
140+
cuda_core_write_err(err, err_cap, e.what());
141+
return 1;
142+
}
143+
catch (...)
144+
{
145+
cuda_core_write_err(err, err_cap, "unknown error while building TMA descriptor");
146+
return 1;
147+
}
148+
#endif
149+
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// SPDX-License-Identifier: Apache-2.0

#ifndef CUDA_CORE_TENSOR_MAP_CCCL_H_
#define CUDA_CORE_TENSOR_MAP_CCCL_H_

#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Build a tiled CUtensorMap using CCCL's cuda::make_tma_descriptor (from <cuda/tma>).
//
// Returns 0 on success; on failure returns non-zero and writes a best-effort
// human-readable message into (err, err_cap) if provided.
//
// The tensor is described with DLPack-style fields: device_type/device_id
// mirror DLDevice, and dtype_code/dtype_bits/dtype_lanes mirror DLDataType.
// interleave_layout, swizzle, l2_fetch_size and oob_fill are passed through
// as the integer values of the corresponding cuda::tma_* enumerations —
// NOTE(review): callers must supply values matching the CCCL enum encodings.
int cuda_core_cccl_make_tma_descriptor_tiled(
    void* out_tensor_map,        // receives the descriptor; must be CUtensorMap-sized
    void* data,                  // base pointer of the tensor data
    int device_type,             // DLPack DLDeviceType value
    int device_id,
    int ndim,                    // tensor rank; must be > 0
    const int64_t* shape, // length ndim
    const int64_t* strides, // length ndim, or NULL for contiguous
    uint8_t dtype_code,          // DLPack DLDataType.code
    uint8_t dtype_bits,          // DLPack DLDataType.bits
    uint16_t dtype_lanes,        // DLPack DLDataType.lanes
    const int* box_sizes, // length ndim
    const int* elem_strides, // length ndim, or NULL for all-ones overload
    int interleave_layout,       // cuda::tma_interleave_layout value
    int swizzle,                 // cuda::tma_swizzle value
    int l2_fetch_size,           // cuda::tma_l2_fetch_size value
    int oob_fill,                // cuda::tma_oob_fill value
    char* err,                   // optional error buffer; may be NULL
    size_t err_cap) noexcept;    // capacity of err in bytes

#ifdef __cplusplus
} // extern "C"
#endif

#endif // CUDA_CORE_TENSOR_MAP_CCCL_H_

cuda_core/cuda/core/_tensor_map.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ from cuda.bindings cimport cydriver
88
cdef class TensorMapDescriptor:
99
cdef cydriver.CUtensorMap _tensor_map
1010
cdef object _source_ref
11+
cdef object _view_ref
1112
cdef object _repr_info
1213

1314
cdef void* _get_data_ptr(self)

0 commit comments

Comments
 (0)