Skip to content

Commit bbd1a7f

Browse files
authored
llama_rope:验证成功 (#63)
* 遇到cuda:an illegal memory access was encountered 在print函数前 * llama_rope:验证成功(py+ompsimd)
1 parent beefa70 commit bbd1a7f

16 files changed

Lines changed: 129 additions & 331 deletions

excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu

Lines changed: 6 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -96,11 +96,7 @@ namespace deepx::tensorfunc
9696
default:
9797
throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM));
9898
}
99-
cudaError_t err = cudaGetLastError();
100-
if (err != cudaSuccess)
101-
{
102-
throw std::runtime_error("cuda error");
103-
}
99+
throwcudaerror("Failed to launch transpose kernel",cudaGetLastError());
104100
}
105101

106102
template void launch_transpose<double>(const double *input, const int *inputStrides, double *output, const int *outputStrides, const int dim, const int len, const int *dimOrder);
@@ -224,12 +220,9 @@ namespace deepx::tensorfunc
224220
default:
225221
throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM));
226222
}
227-
cudaError_t err = cudaGetLastError();
228-
if (err != cudaSuccess)
229-
{
230-
throw std::runtime_error("cuda error");
231-
}
223+
throwcudaerror("Failed to launch concat kernel",cudaGetLastError());
232224
}
225+
233226
template void launch_concat<double>(const double **tensorsData, const int *inputStrides, double *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
234227
template void launch_concat<float>(const float **tensorsData, const int *inputStrides, float *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
235228
template void launch_concat<nv_bfloat16>(const nv_bfloat16 **tensorsData, const int *inputStrides, nv_bfloat16 *outputData, const int *outputStrides, const int dim, const int len, const int axis, const int numTensors, const int *shapeAtAxis);
@@ -335,11 +328,7 @@ namespace deepx::tensorfunc
335328
default:
336329
throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM));
337330
}
338-
cudaError_t err = cudaGetLastError();
339-
if (err != cudaSuccess)
340-
{
341-
throw std::runtime_error("cuda error");
342-
}
331+
throwcudaerror("Failed to launch broadcastTo kernel",cudaGetLastError());
343332
}
344333
template void launch_broadcastTo<double>(const double *input, const int *inputStrides, const int inputDim,
345334
const BroadcastMap *broadcastMap,
@@ -489,12 +478,9 @@ namespace deepx::tensorfunc
489478
default:
490479
throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM));
491480
}
492-
cudaError_t err = cudaGetLastError();
493-
if (err != cudaSuccess)
494-
{
495-
throw std::runtime_error("cuda error");
496-
}
481+
throwcudaerror("Failed to launch indexselect kernel",cudaGetLastError());
497482
}
483+
498484
template void launch_indexselect<double, int64_t>(const double *input, const int *inputStrides, const int inputDim,
499485
const int64_t *index, const int *indexStrides, const int indexDim,
500486
const int gatherAxis,

excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -68,24 +68,21 @@ namespace deepx::tensorfunc
6868

6969
using std::shared_ptr;
7070

71-
inline std::pair<int, std::shared_ptr<unsigned char[]>> device_offload(unsigned char *data,int size)
72-
{
73-
shared_ptr<unsigned char[]> host_data(new unsigned char[size]);
74-
cudaMemcpy(host_data.get(), data, size, cudaMemcpyDeviceToHost);
75-
cudaError_t err=cudaGetLastError();
76-
if(err!=cudaSuccess){
77-
throw std::runtime_error("Failed to copy data from device to host");
78-
79-
}
80-
return {size, host_data};
81-
}
71+
8272

8373
inline void throwcudaerror(const std::string& msg,cudaError_t err){
8474
if (err != cudaSuccess)
8575
{
8676
throw std::runtime_error(msg + "\n" + std::string(cudaGetErrorString(err)));
8777
}
8878
}
79+
inline std::pair<int, std::shared_ptr<unsigned char[]>> device_offload(unsigned char *data,int size)
80+
{
81+
shared_ptr<unsigned char[]> host_data(new unsigned char[size]);
82+
cudaMemcpy(host_data.get(), data, size, cudaMemcpyDeviceToHost);
83+
throwcudaerror("Failed to copy data from device to host",cudaGetLastError());
84+
return {size, host_data};
85+
}
8986
}
9087

9188
#endif

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu

Lines changed: 26 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,7 @@ namespace deepx::tensorfunc
2727
void launch_todtype(const T* a, Dtype* c,const int size){
2828
auto [numBlocks, blockSize] = BestDims(size);
2929
todtype_kernel<<<numBlocks, blockSize>>>(a, c, size);
30-
cudaError_t err = cudaGetLastError();
31-
if (err != cudaSuccess)
32-
{
33-
throw std::runtime_error("Failed to launch todtype kernel: " +
34-
std::string(cudaGetErrorString(err)));
35-
}
30+
throwcudaerror("Failed to launch todtype kernel",cudaGetLastError());
3631
}
3732
template void launch_todtype<double, float>(const double *a, float *c, const int size);
3833
template void launch_todtype<double, half>(const double *a, half *c, const int size);
@@ -114,13 +109,7 @@ namespace deepx::tensorfunc
114109
// 启动kernel
115110
auto [numBlocks, blockSize] = BestDims(size);
116111
add_kernel<<<numBlocks, blockSize>>>(a, b, c, size);
117-
// 检查kernel执行是否成功
118-
cudaError_t err = cudaGetLastError();
119-
if (err != cudaSuccess)
120-
{
121-
throw std::runtime_error("Failed to launch add kernel: " +
122-
std::string(cudaGetErrorString(err)));
123-
}
112+
throwcudaerror("Failed to launch add kernel",cudaGetLastError());
124113
}
125114

126115
template void launch_add<double>(const double *a, const double *b, double *c, const int size);
@@ -147,12 +136,7 @@ namespace deepx::tensorfunc
147136
{
148137
auto [numBlocks, blockSize] = BestDims(size);
149138
addscalar_kernel<<<numBlocks, blockSize>>>(a, scalar, c, size);
150-
cudaError_t err = cudaGetLastError();
151-
if (err != cudaSuccess)
152-
{
153-
throw std::runtime_error("Failed to launch addscalar kernel: " +
154-
std::string(cudaGetErrorString(err)));
155-
}
139+
throwcudaerror("Failed to launch addscalar kernel",cudaGetLastError());
156140
}
157141
template void launch_addscalar<double>(const double *a, const double scalar, double *c, const int size);
158142
template void launch_addscalar<float>(const float *a, const float scalar, float *c, const int size);
@@ -178,12 +162,7 @@ namespace deepx::tensorfunc
178162
{
179163
auto [numBlocks, blockSize] = BestDims(size);
180164
sub_kernel<<<numBlocks, blockSize>>>(a, b, c, size);
181-
cudaError_t err = cudaGetLastError();
182-
if (err != cudaSuccess)
183-
{
184-
throw std::runtime_error("Failed to launch sub kernel: " +
185-
std::string(cudaGetErrorString(err)));
186-
}
165+
throwcudaerror("Failed to launch sub kernel",cudaGetLastError());
187166
}
188167
template void launch_sub<double>(const double *a, const double *b, double *c, const int size);
189168
template void launch_sub<float>(const float *a, const float *b, float *c, const int size);
@@ -209,13 +188,9 @@ namespace deepx::tensorfunc
209188
{
210189
auto [numBlocks, blockSize] = BestDims(size);
211190
subscalar_kernel<<<numBlocks, blockSize>>>(a, scalar, c, size);
212-
cudaError_t err = cudaGetLastError();
213-
if (err != cudaSuccess)
214-
{
215-
throw std::runtime_error("Failed to launch subscalar kernel: " +
216-
std::string(cudaGetErrorString(err)));
217-
}
191+
throwcudaerror("Failed to launch subscalar kernel",cudaGetLastError());
218192
}
193+
219194
template void launch_subscalar<double>(const double *a, const double scalar, double *c, const int size);
220195
template void launch_subscalar<float>(const float *a, const float scalar, float *c, const int size);
221196
template void launch_subscalar<half>(const half *a, const half scalar, half *c, const int size);
@@ -239,11 +214,7 @@ namespace deepx::tensorfunc
239214
void launch_rsubscalar(const T scalar, const T* a, T* c,const int size){
240215
auto [numBlocks, blockSize] = BestDims(size);
241216
rsubscalar_kernel<<<numBlocks, blockSize>>>(scalar, a, c, size);
242-
cudaError_t err = cudaGetLastError();
243-
if (err != cudaSuccess)
244-
{
245-
throw std::runtime_error("Failed to launch rsubscalar kernel: "+std::string(cudaGetErrorString(err)));
246-
}
217+
throwcudaerror("Failed to launch rsubscalar kernel",cudaGetLastError());
247218
}
248219
template void launch_rsubscalar<double>(const double scalar, const double* a, double* c,const int size);
249220
template void launch_rsubscalar<float>(const float scalar, const float* a, float* c,const int size);
@@ -271,13 +242,9 @@ namespace deepx::tensorfunc
271242
{
272243
auto [numBlocks, blockSize] = BestDims(size);
273244
mul_kernel<<<numBlocks, blockSize>>>(a, b, c, size);
274-
cudaError_t err = cudaGetLastError();
275-
if (err != cudaSuccess)
276-
{
277-
throw std::runtime_error("Failed to launch mul kernel: " +
278-
std::string(cudaGetErrorString(err)));
279-
}
245+
throwcudaerror("Failed to launch mul kernel",cudaGetLastError());
280246
}
247+
281248
template void launch_mul<double>(const double *a, const double *b, double *c, const int size);
282249
template void launch_mul<float>(const float *a, const float *b, float *c, const int size);
283250
template void launch_mul<half>(const half *a, const half *b, half *c, const int size);
@@ -302,12 +269,7 @@ namespace deepx::tensorfunc
302269
{
303270
auto [numBlocks, blockSize] = BestDims(size);
304271
mulscalar_kernel<<<numBlocks, blockSize>>>(a, scalar, c, size);
305-
cudaError_t err = cudaGetLastError();
306-
if (err != cudaSuccess)
307-
{
308-
throw std::runtime_error("Failed to launch mulscalar kernel: " +
309-
std::string(cudaGetErrorString(err)));
310-
}
272+
throwcudaerror("Failed to launch mulscalar kernel",cudaGetLastError());
311273
}
312274
template void launch_mulscalar<double>(const double *a, const double scalar, double *c, const int size);
313275
template void launch_mulscalar<float>(const float *a, const float scalar, float *c, const int size);
@@ -333,13 +295,9 @@ namespace deepx::tensorfunc
333295
{
334296
auto [numBlocks, blockSize] = BestDims(size);
335297
div_kernel<<<numBlocks, blockSize>>>(a, b, c, size);
336-
cudaError_t err = cudaGetLastError();
337-
if (err != cudaSuccess)
338-
{
339-
throw std::runtime_error("Failed to launch div kernel: " +
340-
std::string(cudaGetErrorString(err)));
341-
}
298+
throwcudaerror("Failed to launch div kernel",cudaGetLastError());
342299
}
300+
343301
template void launch_div<double>(const double *a, const double *b, double *c, const int size);
344302
template void launch_div<float>(const float *a, const float *b, float *c, const int size);
345303
template void launch_div<half>(const half *a, const half *b, half *c, const int size);
@@ -364,13 +322,9 @@ namespace deepx::tensorfunc
364322
{
365323
auto [numBlocks, blockSize] = BestDims(size);
366324
divscalar_kernel<<<numBlocks, blockSize>>>(a, scalar, c, size);
367-
cudaError_t err = cudaGetLastError();
368-
if (err != cudaSuccess)
369-
{
370-
throw std::runtime_error("Failed to launch divscalar kernel: " +
371-
std::string(cudaGetErrorString(err)));
372-
}
325+
throwcudaerror("Failed to launch divscalar kernel",cudaGetLastError());
373326
}
327+
374328
template void launch_divscalar<double>(const double *a, const double scalar, double *c, const int size);
375329
template void launch_divscalar<float>(const float *a, const float scalar, float *c, const int size);
376330
template void launch_divscalar<half>(const half *a, const half scalar, half *c, const int size);
@@ -395,13 +349,9 @@ namespace deepx::tensorfunc
395349
{
396350
auto [numBlocks, blockSize] = BestDims(size);
397351
rdivscalar_kernel<<<numBlocks, blockSize>>>(scalar, a, c, size);
398-
cudaError_t err = cudaGetLastError();
399-
if (err != cudaSuccess)
400-
{
401-
throw std::runtime_error("Failed to launch rdivscalar kernel: " +
402-
std::string(cudaGetErrorString(err)));
403-
}
352+
throwcudaerror("Failed to launch rdivscalar kernel",cudaGetLastError());
404353
}
354+
405355
template void launch_rdivscalar<double>(const double scalar, const double *a, double *c, const int size);
406356
template void launch_rdivscalar<float>(const float scalar, const float *a, float *c, const int size);
407357
template void launch_rdivscalar<half>(const half scalar, const half *a, half *c, const int size);
@@ -421,17 +371,21 @@ namespace deepx::tensorfunc
421371
}
422372
}
423373

374+
template <>
375+
__global__ void invert_kernel<bool>(const bool *A, bool *C, const int size)
376+
{
377+
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x)
378+
{
379+
C[idx] = !A[idx];
380+
}
381+
}
382+
424383
template <typename T>
425384
void launch_invert(const T *a, T *c, const int size)
426385
{
427386
auto [numBlocks, blockSize] = BestDims(size);
428387
invert_kernel<<<numBlocks, blockSize>>>(a, c, size);
429-
cudaError_t err = cudaGetLastError();
430-
if (err != cudaSuccess)
431-
{
432-
throw std::runtime_error("Failed to launch invert kernel: " +
433-
std::string(cudaGetErrorString(err)));
434-
}
388+
throwcudaerror("Failed to launch invert kernel",cudaGetLastError());
435389
}
436390
template void launch_invert<int64_t>(const int64_t *a, int64_t *c, const int size);
437391
template void launch_invert<int32_t>(const int32_t *a, int32_t *c, const int size);

0 commit comments

Comments
 (0)