@@ -27,12 +27,7 @@ namespace deepx::tensorfunc
2727 void launch_todtype (const T* a, Dtype* c,const int size){
2828 auto [numBlocks, blockSize] = BestDims (size);
2929 todtype_kernel<<<numBlocks, blockSize>>> (a, c, size);
30- cudaError_t err = cudaGetLastError ();
31- if (err != cudaSuccess)
32- {
33- throw std::runtime_error (" Failed to launch todtype kernel: " +
34- std::string (cudaGetErrorString (err)));
35- }
30+ throwcudaerror (" Failed to launch todtype kernel" ,cudaGetLastError ());
3631 }
3732 template void launch_todtype<double , float >(const double *a, float *c, const int size);
3833 template void launch_todtype<double , half>(const double *a, half *c, const int size);
@@ -114,13 +109,7 @@ namespace deepx::tensorfunc
114109 // 启动kernel
115110 auto [numBlocks, blockSize] = BestDims (size);
116111 add_kernel<<<numBlocks, blockSize>>> (a, b, c, size);
117- // 检查kernel执行是否成功
118- cudaError_t err = cudaGetLastError ();
119- if (err != cudaSuccess)
120- {
121- throw std::runtime_error (" Failed to launch add kernel: " +
122- std::string (cudaGetErrorString (err)));
123- }
112+ throwcudaerror (" Failed to launch add kernel" ,cudaGetLastError ());
124113 }
125114
126115 template void launch_add<double >(const double *a, const double *b, double *c, const int size);
@@ -147,12 +136,7 @@ namespace deepx::tensorfunc
147136 {
148137 auto [numBlocks, blockSize] = BestDims (size);
149138 addscalar_kernel<<<numBlocks, blockSize>>> (a, scalar, c, size);
150- cudaError_t err = cudaGetLastError ();
151- if (err != cudaSuccess)
152- {
153- throw std::runtime_error (" Failed to launch addscalar kernel: " +
154- std::string (cudaGetErrorString (err)));
155- }
139+ throwcudaerror (" Failed to launch addscalar kernel" ,cudaGetLastError ());
156140 }
157141 template void launch_addscalar<double >(const double *a, const double scalar, double *c, const int size);
158142 template void launch_addscalar<float >(const float *a, const float scalar, float *c, const int size);
@@ -178,12 +162,7 @@ namespace deepx::tensorfunc
178162 {
179163 auto [numBlocks, blockSize] = BestDims (size);
180164 sub_kernel<<<numBlocks, blockSize>>> (a, b, c, size);
181- cudaError_t err = cudaGetLastError ();
182- if (err != cudaSuccess)
183- {
184- throw std::runtime_error (" Failed to launch sub kernel: " +
185- std::string (cudaGetErrorString (err)));
186- }
165+ throwcudaerror (" Failed to launch sub kernel" ,cudaGetLastError ());
187166 }
188167 template void launch_sub<double >(const double *a, const double *b, double *c, const int size);
189168 template void launch_sub<float >(const float *a, const float *b, float *c, const int size);
@@ -209,13 +188,9 @@ namespace deepx::tensorfunc
209188 {
210189 auto [numBlocks, blockSize] = BestDims (size);
211190 subscalar_kernel<<<numBlocks, blockSize>>> (a, scalar, c, size);
212- cudaError_t err = cudaGetLastError ();
213- if (err != cudaSuccess)
214- {
215- throw std::runtime_error (" Failed to launch subscalar kernel: " +
216- std::string (cudaGetErrorString (err)));
217- }
191+ throwcudaerror (" Failed to launch subscalar kernel" ,cudaGetLastError ());
218192 }
193+
219194 template void launch_subscalar<double >(const double *a, const double scalar, double *c, const int size);
220195 template void launch_subscalar<float >(const float *a, const float scalar, float *c, const int size);
221196 template void launch_subscalar<half>(const half *a, const half scalar, half *c, const int size);
@@ -239,11 +214,7 @@ namespace deepx::tensorfunc
239214 void launch_rsubscalar (const T scalar, const T* a, T* c,const int size){
240215 auto [numBlocks, blockSize] = BestDims (size);
241216 rsubscalar_kernel<<<numBlocks, blockSize>>> (scalar, a, c, size);
242- cudaError_t err = cudaGetLastError ();
243- if (err != cudaSuccess)
244- {
245- throw std::runtime_error (" Failed to launch rsubscalar kernel: " +std::string (cudaGetErrorString (err)));
246- }
217+ throwcudaerror (" Failed to launch rsubscalar kernel" ,cudaGetLastError ());
247218 }
248219 template void launch_rsubscalar<double >(const double scalar, const double * a, double * c,const int size);
249220 template void launch_rsubscalar<float >(const float scalar, const float * a, float * c,const int size);
@@ -271,13 +242,9 @@ namespace deepx::tensorfunc
271242 {
272243 auto [numBlocks, blockSize] = BestDims (size);
273244 mul_kernel<<<numBlocks, blockSize>>> (a, b, c, size);
274- cudaError_t err = cudaGetLastError ();
275- if (err != cudaSuccess)
276- {
277- throw std::runtime_error (" Failed to launch mul kernel: " +
278- std::string (cudaGetErrorString (err)));
279- }
245+ throwcudaerror (" Failed to launch mul kernel" ,cudaGetLastError ());
280246 }
247+
281248 template void launch_mul<double >(const double *a, const double *b, double *c, const int size);
282249 template void launch_mul<float >(const float *a, const float *b, float *c, const int size);
283250 template void launch_mul<half>(const half *a, const half *b, half *c, const int size);
@@ -302,12 +269,7 @@ namespace deepx::tensorfunc
302269 {
303270 auto [numBlocks, blockSize] = BestDims (size);
304271 mulscalar_kernel<<<numBlocks, blockSize>>> (a, scalar, c, size);
305- cudaError_t err = cudaGetLastError ();
306- if (err != cudaSuccess)
307- {
308- throw std::runtime_error (" Failed to launch mulscalar kernel: " +
309- std::string (cudaGetErrorString (err)));
310- }
272+ throwcudaerror (" Failed to launch mulscalar kernel" ,cudaGetLastError ());
311273 }
312274 template void launch_mulscalar<double >(const double *a, const double scalar, double *c, const int size);
313275 template void launch_mulscalar<float >(const float *a, const float scalar, float *c, const int size);
@@ -333,13 +295,9 @@ namespace deepx::tensorfunc
333295 {
334296 auto [numBlocks, blockSize] = BestDims (size);
335297 div_kernel<<<numBlocks, blockSize>>> (a, b, c, size);
336- cudaError_t err = cudaGetLastError ();
337- if (err != cudaSuccess)
338- {
339- throw std::runtime_error (" Failed to launch div kernel: " +
340- std::string (cudaGetErrorString (err)));
341- }
298+ throwcudaerror (" Failed to launch div kernel" ,cudaGetLastError ());
342299 }
300+
343301 template void launch_div<double >(const double *a, const double *b, double *c, const int size);
344302 template void launch_div<float >(const float *a, const float *b, float *c, const int size);
345303 template void launch_div<half>(const half *a, const half *b, half *c, const int size);
@@ -364,13 +322,9 @@ namespace deepx::tensorfunc
364322 {
365323 auto [numBlocks, blockSize] = BestDims (size);
366324 divscalar_kernel<<<numBlocks, blockSize>>> (a, scalar, c, size);
367- cudaError_t err = cudaGetLastError ();
368- if (err != cudaSuccess)
369- {
370- throw std::runtime_error (" Failed to launch divscalar kernel: " +
371- std::string (cudaGetErrorString (err)));
372- }
325+ throwcudaerror (" Failed to launch divscalar kernel" ,cudaGetLastError ());
373326 }
327+
374328 template void launch_divscalar<double >(const double *a, const double scalar, double *c, const int size);
375329 template void launch_divscalar<float >(const float *a, const float scalar, float *c, const int size);
376330 template void launch_divscalar<half>(const half *a, const half scalar, half *c, const int size);
@@ -395,13 +349,9 @@ namespace deepx::tensorfunc
395349 {
396350 auto [numBlocks, blockSize] = BestDims (size);
397351 rdivscalar_kernel<<<numBlocks, blockSize>>> (scalar, a, c, size);
398- cudaError_t err = cudaGetLastError ();
399- if (err != cudaSuccess)
400- {
401- throw std::runtime_error (" Failed to launch rdivscalar kernel: " +
402- std::string (cudaGetErrorString (err)));
403- }
352+ throwcudaerror (" Failed to launch rdivscalar kernel" ,cudaGetLastError ());
404353 }
354+
405355 template void launch_rdivscalar<double >(const double scalar, const double *a, double *c, const int size);
406356 template void launch_rdivscalar<float >(const float scalar, const float *a, float *c, const int size);
407357 template void launch_rdivscalar<half>(const half scalar, const half *a, half *c, const int size);
@@ -421,17 +371,21 @@ namespace deepx::tensorfunc
421371 }
422372 }
423373
374+ template <>
375+ __global__ void invert_kernel<bool >(const bool *A, bool *C, const int size)
376+ {
377+ for (int idx = blockIdx .x * blockDim .x + threadIdx .x ; idx < size; idx += blockDim .x * gridDim .x )
378+ {
379+ C[idx] = !A[idx];
380+ }
381+ }
382+
424383 template <typename T>
425384 void launch_invert (const T *a, T *c, const int size)
426385 {
427386 auto [numBlocks, blockSize] = BestDims (size);
428387 invert_kernel<<<numBlocks, blockSize>>> (a, c, size);
429- cudaError_t err = cudaGetLastError ();
430- if (err != cudaSuccess)
431- {
432- throw std::runtime_error (" Failed to launch invert kernel: " +
433- std::string (cudaGetErrorString (err)));
434- }
388+ throwcudaerror (" Failed to launch invert kernel" ,cudaGetLastError ());
435389 }
436390 template void launch_invert<int64_t >(const int64_t *a, int64_t *c, const int size);
437391 template void launch_invert<int32_t >(const int32_t *a, int32_t *c, const int size);
0 commit comments