forked from deepseek-ai/DeepEP
-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathsetup.py
More file actions
196 lines (167 loc) · 7.57 KB
/
setup.py
File metadata and controls
196 lines (167 loc) · 7.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import importlib
import importlib.util
import os
import subprocess
from pathlib import Path

import setuptools
from paddle.utils.cpp_extension import BuildExtension, CUDAExtension, _get_cuda_arch_flags
from paddle.utils.cpp_extension.extension_utils import (
    add_compile_flag,
)
# Wheel specific: the wheels only include the soname of the host library `libnvshmem_host.so.X`
def get_nvshmem_host_lib_name(base_dir):
    '''Return the filename of the versioned NVSHMEM host library under `base_dir`/lib.

    Wheels ship only a versioned soname (e.g. `libnvshmem_host.so.3`), so the
    exact filename must be discovered at build time.

    Raises:
        ModuleNotFoundError: if no `libnvshmem_host.so.*` file exists.
    '''
    lib_dir = Path(base_dir).joinpath('lib')
    match = next(lib_dir.rglob('libnvshmem_host.so.*'), None)
    if match is None:
        raise ModuleNotFoundError('libnvshmem_host.so not found')
    return match.name
def _detect_local_gpu_arch():
'''Auto-detect the compute capability of the first visible GPU via nvidia-smi.
Returns a string like '9.0', or None if detection fails.
DeepEP requires GPU compute capability >= 9.0 (SM90+).
'''
try:
out = subprocess.check_output(
['nvidia-smi', '--query-gpu=compute_cap', '--format=csv,noheader'],
stderr=subprocess.DEVNULL,
)
caps = {line.strip() for line in out.decode().splitlines() if line.strip()}
# Return the first available architecture (usually all GPUs are same)
return sorted(caps)[0] if caps else None
except Exception:
return None
def _resolve_cuda_arch():
'''Resolve CUDA architecture for compilation.
Priority (highest first):
1. PADDLE_CUDA_ARCH_LIST env var (user-specified)
2. Auto-detect from the local GPU via nvidia-smi
3. Default to '9.0' (SM90)
Returns a string like '9.0', '10.0', '10.3', etc.
Raises ValueError if the detected/archived arch is < 9.0.
'''
import re
# 1. Try user-specified env var
raw = os.environ.get('PADDLE_CUDA_ARCH_LIST', '').strip()
if raw:
# Parse semicolon- or comma-separated values, use the first one
arch = re.split(r'[;,]', raw)[0].strip()
else:
# 2. Auto-detect from local GPU
arch = _detect_local_gpu_arch()
if not arch:
# 3. Fallback to default
arch = '9.0'
# Validate: DeepEP requires SM90+ (compute capability >= 9.0)
try:
major, minor = map(int, arch.split('.'))
if major < 9:
raise ValueError(
f'DeepEP requires GPU compute capability >= 9.0 (SM90+), '
f'but detected architecture {arch}. '
f'Please use a GPU with SM90+'
f'set PADDLE_CUDA_ARCH_LIST to a supported architecture.'
)
except ValueError as e:
raise ValueError(f'Invalid CUDA architecture format: {arch}') from e
return arch
if __name__ == '__main__':
    # NVSHMEM discovery: prefer an explicit NVSHMEM_DIR env var, otherwise
    # fall back to the pip-installed `nvidia.nvshmem` wheel. If neither is
    # available, build without NVSHMEM (internode / low-latency disabled).
    disable_nvshmem = False
    nvshmem_dir = os.getenv('NVSHMEM_DIR', None)
    # Bare soname; replaced below with the versioned name shipped in wheels.
    nvshmem_host_lib = 'libnvshmem_host.so'
    if nvshmem_dir is None:
        try:
            nvshmem_dir = importlib.util.find_spec("nvidia.nvshmem").submodule_search_locations[0]
            nvshmem_host_lib = get_nvshmem_host_lib_name(nvshmem_dir)
            import nvidia.nvshmem as nvshmem  # noqa: F401
        except (ModuleNotFoundError, AttributeError, IndexError):
            print(
                'Warning: `NVSHMEM_DIR` is not specified, and the NVSHMEM module is not installed. All internode and low-latency features are disabled\n'
            )
            disable_nvshmem = True
        else:
            disable_nvshmem = False
    if not disable_nvshmem:
        assert os.path.exists(nvshmem_dir), f'The specified NVSHMEM directory does not exist: {nvshmem_dir}'

    # Base build configuration (intranode-only sources; NVSHMEM parts added below).
    cxx_flags = ['-O3', '-Wno-deprecated-declarations', '-Wno-unused-variable', '-Wno-sign-compare', '-Wno-reorder', '-Wno-attributes']
    nvcc_flags = ['-O3', '-Xcompiler', '-O3']
    sources = ['csrc/deep_ep.cpp', 'csrc/kernels/runtime.cu', 'csrc/kernels/layout.cu', 'csrc/kernels/intranode.cu']
    include_dirs = ['csrc/']
    library_dirs = []
    nvcc_dlink = []
    extra_link_args = ['-lcuda']

    # NVSHMEM flags
    if disable_nvshmem:
        cxx_flags.append('-DDISABLE_NVSHMEM')
        nvcc_flags.append('-DDISABLE_NVSHMEM')
    else:
        sources.extend(['csrc/kernels/internode.cu', 'csrc/kernels/internode_ll.cu'])
        include_dirs.extend([f'{nvshmem_dir}/include'])
        library_dirs.extend([f'{nvshmem_dir}/lib'])
        # Separate device-link step (`-dlink`) against the NVSHMEM device library;
        # pairs with `-rdc=true` added below.
        nvcc_dlink.extend(['-dlink', f'-L{nvshmem_dir}/lib', '-lnvshmem_device'])
        # `-l:` links by exact filename (versioned soname from the wheel);
        # rpath lets the built extension find the host lib at runtime.
        extra_link_args.extend([f'-l:{nvshmem_host_lib}', '-l:libnvshmem_device.a', f'-Wl,-rpath,{nvshmem_dir}/lib'])

    # Resolve CUDA architecture for compilation
    # Priority: user env var > auto-detect > default 9.0
    # DeepEP requires SM90+ (compute capability >= 9.0)
    if 'PADDLE_CUDA_ARCH_LIST' not in os.environ:
        resolved_arch = _resolve_cuda_arch()
        os.environ['PADDLE_CUDA_ARCH_LIST'] = resolved_arch

    if int(os.getenv('DISABLE_SM90_FEATURES', 0)):
        # Disable some SM90 features: FP8, launch methods, and TMA
        cxx_flags.append('-DDISABLE_SM90_FEATURES')
        nvcc_flags.append('-DDISABLE_SM90_FEATURES')

        # Disable internode and low-latency kernels
        assert disable_nvshmem

    # CUDA 12 flags
    nvcc_flags.extend(['-rdc=true', '--ptxas-options=--register-usage-level=10'])

    # On anything other than SM90 (9.0 / 9.0a), aggressive PTX instructions
    # must be disabled; enforce that the env var is not explicitly set to 0.
    arch = os.environ['PADDLE_CUDA_ARCH_LIST'].strip()
    if arch not in ('9.0', '9.0a'):
        assert int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', 1)) == 1
        os.environ['DISABLE_AGGRESSIVE_PTX_INSTRS'] = '1'

    # Disable aggressive PTX instructions
    if int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', '1')):
        cxx_flags.append('-DDISABLE_AGGRESSIVE_PTX_INSTRS')
        nvcc_flags.append('-DDISABLE_AGGRESSIVE_PTX_INSTRS')

    # Bits of `topk_idx.dtype`, choices are 32 and 64
    if "TOPK_IDX_BITS" in os.environ:
        topk_idx_bits = int(os.environ['TOPK_IDX_BITS'])
        cxx_flags.append(f'-DTOPK_IDX_BITS={topk_idx_bits}')
        nvcc_flags.append(f'-DTOPK_IDX_BITS={topk_idx_bits}')

    # Put them together
    extra_compile_args = {
        'cxx': cxx_flags,
        'nvcc': nvcc_flags,
    }
    if len(nvcc_dlink) > 0:
        # The device-link step needs the same arch flags as compilation.
        nvcc_dlink = nvcc_dlink + _get_cuda_arch_flags()
        extra_compile_args['nvcc_dlink'] = nvcc_dlink

    # Summary
    print('Build summary:')
    print(f' > Sources: {sources}')
    print(f' > Includes: {include_dirs}')
    print(f' > Libraries: {library_dirs}')
    print(f' > Compilation flags: {extra_compile_args}')
    print(f' > Link flags: {extra_link_args}')
    print(f' > Arch list: {os.environ["PADDLE_CUDA_ARCH_LIST"]}')
    print(f' > NVSHMEM path: {nvshmem_dir}')
    print()

    # Tag the wheel version with the current git revision when available;
    # best-effort, so any failure (no git, not a repo) falls back to no tag.
    # noinspection PyBroadException
    try:
        cmd = ['git', 'rev-parse', '--short', 'HEAD']
        revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip()
    except Exception as _:
        revision = ''

    # Paddle-specific build macros expected by the C++/CUDA sources.
    add_compile_flag(extra_compile_args, ['-DPADDLE_WITH_CUDA'])
    add_compile_flag(extra_compile_args, ['-DWITH_DISTRIBUTE'])
    add_compile_flag(extra_compile_args, ['-DWITH_NVSHMEM'])
    add_compile_flag(extra_compile_args, ['-DWITH_GPU'])
    add_compile_flag(extra_compile_args, ['-DWITH_FLUID_ONLY'])

    setuptools.setup(name='deep_ep',
                     version='1.2.1' + revision,
                     packages=setuptools.find_packages(include=['deep_ep']),
                     ext_modules=[
                         CUDAExtension(name='deep_ep_cpp',
                                       include_dirs=include_dirs,
                                       library_dirs=library_dirs,
                                       sources=sources,
                                       extra_compile_args=extra_compile_args,
                                       extra_link_args=extra_link_args)
                     ],
                     cmdclass={'build_ext': BuildExtension})