Skip to content

Commit beefa70

Browse files
authored
rsubscalar:增加rsubscalar (#61)
1 parent 652f13b commit beefa70

File tree

17 files changed

+305
-19
lines changed

17 files changed

+305
-19
lines changed

doc/excuter/op-mem-cuda/list.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@
5757
| maxscalar | miaobyte | T3=max(T1, scalar) | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
5858
| tan | miaobyte | T3=tan(T1) | tan(tensor<float64|float32> A)->(tensor<float64|float32> C) |
5959
| sin | miaobyte | T3=sin(T1) | sin(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
60+
| less | miaobyte | mask=compare(T1, T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
61+
| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor<float64|float32> A, var<float64|int32> scalar)->(tensor<float64|float32> C) |
62+
| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
6063
| divscalar | miaobyte | T3=T1/scalar | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
6164
| log | miaobyte | T3=log(T1) | log(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
6265
| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
@@ -67,8 +70,6 @@
6770
| minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
6871
| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var<float32|int32> scalar, tensor<float64|float32> A)->(tensor<float64|float32> C) |
6972
| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
70-
| less | miaobyte | mask=compare(T1, T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
71-
| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor<float64|float32> A, var<float64|int32> scalar)->(tensor<float64|float32> C) |
7273
| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor<any> a)->(tensor<any> b) |
7374
| add | cublas | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
7475
| add | miaobyte | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |

doc/excuter/op-mem-ompsimd/list.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@
5858
| maxscalar | miaobyte | T3=max(T1,scalar) | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
5959
| tan | miaobyte | T3=tan(T1) | tan(tensor<any> A)->(tensor<any> C) |
6060
| sin | miaobyte | T3=sin(T1) | sin(tensor<any> A)->(tensor<any> C) |
61+
| less | miaobyte | mask=less(T1,T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
62+
| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
63+
| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var<any> scalar, tensor<any> a)->(tensor<any> c) |
6164
| divscalar | miaobyte | T3=T1/scalar | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
6265
| log | miaobyte | T3=log(T1) | log(tensor<any> A)->(tensor<any> C) |
6366
| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) |
@@ -68,8 +71,6 @@
6871
| minscalar | miaobyte | T3=min(T1,scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
6972
| rpowscalar | miaobyte | T3=scalar^T1 | rpowscalar(var<float32> scalar, tensor<any> A)->(tensor<any> C) |
7073
| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
71-
| less | miaobyte | mask=less(T1,T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
72-
| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
7374
| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor<any> A)->(tensor<any> C) |
7475
| add | cblas | T3=T1+T2 | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) |
7576
| add | miaobyte | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |

excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ namespace deepx::tensorfunc
5858
subDispatcher<Author, T>::sub(A, B, C);
5959
}
6060

61+
// A-scalar=>C
6162
template <typename Author, typename T>
6263
struct subscalarDispatcher
6364
{
@@ -66,20 +67,34 @@ namespace deepx::tensorfunc
6667
throw NotImplementError("subscalar");
6768
}
6869
};
69-
70-
// A-scalar=>C
7170
template <typename Author, typename T>
7271
void subscalar(const Tensor<T> &input, const T value, Tensor<T> &output)
7372
{
7473
subscalarDispatcher<Author, T>::subscalar(input, value, output);
7574
}
7675

76+
77+
78+
// scalar-A=>C
// Primary dispatcher template for "scalar minus tensor". Its only member is
// declared deleted, so an (Author, T) pair is usable only when a
// specialization of rsubscalarDispatcher provides rsubscalar.
79+
template <typename Author, typename T>
80+
struct rsubscalarDispatcher
81+
{
82+
static void rsubscalar(const T value, const Tensor<T> &input, Tensor<T> &output) = delete;
83+
};
84+
template <typename Author, typename T>
85+
void rsubscalar(const T value, const Tensor<T> &input, Tensor<T> &output)
86+
{
87+
rsubscalarDispatcher<Author, T>::rsubscalar(value, input, output);
88+
}
89+
90+
7791
// Primary dispatcher template for tensor-times-tensor. Its only member is
// declared deleted, so an (Author, T) pair is usable only when a
// specialization of mulDispatcher provides mul.
template <typename Author, typename T>
7892
struct mulDispatcher
7993
{
8094
static void mul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C) = delete;
8195
};
8296

97+
8398
// A*B=>C
8499
template <typename Author, typename T>
85100
void mul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C)

excuter/op-mem-cuda/src/client/tfs.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,16 @@ namespace deepx::tf
232232
{
233233
Param("C", DataCategory::Tensor, Precision::Any),
234234
})));
235+
tffactory.add_tf(std::make_shared<RSubScalar<miaobyte>>(vector<Param>(
236+
{
237+
Param("scalar", DataCategory::Var, Precision::Any),
238+
Param("A", DataCategory::Tensor, Precision::Any),
239+
}),
240+
vector<Param>(
241+
{
242+
Param("C", DataCategory::Tensor, Precision::Any),
243+
})));
244+
235245
tffactory.add_tf(std::make_shared<Mul<miaobyte>>(vector<Param>(
236246
{
237247
Param("A", DataCategory::Tensor, Precision::Any),

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,37 @@ namespace deepx::tensorfunc
225225
template void launch_subscalar<int16_t>(const int16_t *a, const int16_t scalar, int16_t *c, const int size);
226226
template void launch_subscalar<int8_t>(const int8_t *a, const int8_t scalar, int8_t *c, const int size);
227227

228+
// rsubscalar
229+
template <typename T>
230+
__global__ void rsubscalar_kernel(const T scalar, const T* A, T* C,const int size){
231+
int stride = blockDim.x * gridDim.x;
232+
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
233+
{
234+
C[idx] = scalar - A[idx];
235+
}
236+
}
237+
238+
template <typename T>
239+
void launch_rsubscalar(const T scalar, const T* a, T* c,const int size){
240+
auto [numBlocks, blockSize] = BestDims(size);
241+
rsubscalar_kernel<<<numBlocks, blockSize>>>(scalar, a, c, size);
242+
cudaError_t err = cudaGetLastError();
243+
if (err != cudaSuccess)
244+
{
245+
throw std::runtime_error("Failed to launch rsubscalar kernel: "+std::string(cudaGetErrorString(err)));
246+
}
247+
}
248+
// Explicit instantiations of launch_rsubscalar, one per supported element type:
// double, float, half, nv_bfloat16, int64_t, int32_t, int16_t and int8_t.
template void launch_rsubscalar<double>(const double scalar, const double* a, double* c,const int size);
249+
template void launch_rsubscalar<float>(const float scalar, const float* a, float* c,const int size);
250+
template void launch_rsubscalar<half>(const half scalar, const half* a, half* c,const int size);
251+
template void launch_rsubscalar<nv_bfloat16>(const nv_bfloat16 scalar, const nv_bfloat16* a, nv_bfloat16* c,const int size);
252+
template void launch_rsubscalar<int64_t>(const int64_t scalar, const int64_t* a, int64_t* c,const int size);
253+
template void launch_rsubscalar<int32_t>(const int32_t scalar, const int32_t* a, int32_t* c,const int size);
254+
template void launch_rsubscalar<int16_t>(const int16_t scalar, const int16_t* a, int16_t* c,const int size);
255+
template void launch_rsubscalar<int8_t>(const int8_t scalar, const int8_t* a, int8_t* c,const int size);
256+
257+
258+
228259
// mul
229260
template <typename T>
230261
__global__ void mul_kernel(const T *A, const T *B, T *C, const int size)

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,15 @@ namespace deepx::tensorfunc
4444

4545
template <typename T>
4646
void launch_subscalar(const T* a, const T scalar, T* c,const int size);
47-
47+
48+
49+
// rsubscalar
// Declaration of the device kernel for scalar-minus-tensor; parameters are the
// scalar, the input buffer A, the output buffer C and the element count.
50+
template <typename T>
51+
__global__ void rsubscalar_kernel(const T scalar, const T* A, T* C,const int size);
52+
53+
// Declaration of the host-side launcher for rsubscalar_kernel; takes the same
// scalar, input pointer, output pointer and element count.
template <typename T>
54+
void launch_rsubscalar(const T scalar, const T* a, T* c,const int size);
55+
4856
// mul
4957
template <typename T>
5058
__global__ void mul_kernel(const T* A, const T* B, T* C,const int size);

excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,18 @@ namespace deepx::tensorfunc
6969
}
7070
};
7171

72+
template <typename T>
73+
struct rsubscalarDispatcher<miaobyte, T>
74+
{
75+
static void rsubscalar(const T scalar, const Tensor<T> &A, Tensor<T> &C)
76+
{
77+
if (A.shape.size != C.shape.size) {
78+
throw TensorShapeError("rsubscalar");
79+
}
80+
launch_rsubscalar(scalar, A.data, C.data, A.shape.size);
81+
}
82+
};
83+
7284
template <typename T>
7385
struct mulDispatcher<miaobyte, T>
7486
{

excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -636,6 +636,75 @@ namespace deepx::tf
636636
}
637637
};
638638

639+
// rsubscalar
640+
template <typename Author>
641+
class RSubScalar : public TF
642+
{
643+
public:
644+
RSubScalar(const vector<Param> &args, const vector<Param> &returns)
645+
{
646+
this->name = "rsubscalar";
647+
this->metadata.author = Author::name();
648+
this->tftype = "elementwise";
649+
this->args = args;
650+
this->returns = returns;
651+
}
652+
653+
string math_formula() const override
654+
{
655+
return "T3=scalar-T1";
656+
}
657+
shared_ptr<TF> clone() const override
658+
{
659+
return make_shared<RSubScalar<Author>>(*this);
660+
}
661+
int run(shared_ptr<MemBase> mem, string &error) override
662+
{
663+
if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error))
664+
{
665+
return 1;
666+
}
667+
Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
668+
Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
669+
if (a_type != c_type)
670+
{
671+
error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type);
672+
return 1;
673+
}
674+
switch (a_type)
675+
{
676+
case Precision::Float64:
677+
tensorfunc::rsubscalar<Author, double>(this->getvar<double>(1, mem), *mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<double>(this->returns[0].textvalue));
678+
break;
679+
case Precision::Float32:
680+
tensorfunc::rsubscalar<Author, float>(this->getvar<float>(1, mem), *mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<float>(this->returns[0].textvalue));
681+
break;
682+
case Precision::Float16:
683+
tensorfunc::rsubscalar<Author, half>(this->getvar<half>(1, mem), *mem->gettensor<half>(this->args[0].textvalue), *mem->gettensor<half>(this->returns[0].textvalue));
684+
break;
685+
case Precision::BFloat16:
686+
tensorfunc::rsubscalar<Author, nv_bfloat16>(this->getvar<nv_bfloat16>(1, mem), *mem->gettensor<nv_bfloat16>(this->args[0].textvalue), *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue));
687+
break;
688+
case Precision::Int64:
689+
tensorfunc::rsubscalar<Author, int32_t>(this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
690+
break;
691+
case Precision::Int32:
692+
tensorfunc::rsubscalar<Author, int32_t>(this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
693+
break;
694+
case Precision::Int16:
695+
tensorfunc::rsubscalar<Author, int16_t>(this->getvar<int16_t>(1, mem), *mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int16_t>(this->returns[0].textvalue));
696+
break;
697+
case Precision::Int8:
698+
tensorfunc::rsubscalar<Author, int8_t>(this->getvar<int8_t>(1, mem), *mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int8_t>(this->returns[0].textvalue));
699+
break;
700+
default:
701+
error = "Unsupported dtype: " + precision_str(a_type);
702+
return 1;
703+
}
704+
return 0;
705+
}
706+
};
707+
639708
template <typename Author>
640709
class Mul : public TF
641710
{

excuter/op-mem-ompsimd/src/client/tfs.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,18 @@ namespace deepx::tf
246246
{
247247
Param("c", DataCategory::Tensor, Precision::Any),
248248
})));
249+
// rsubscalar author=miaobyte
250+
tffactory.add_tf(std::make_shared<RSubScalar<miaobyte>>(vector<Param>(
251+
{
252+
Param("scalar", DataCategory::Var, Precision::Any),
253+
Param("a", DataCategory::Tensor, Precision::Any),
254+
}),
255+
vector<Param>(
256+
{
257+
Param("c", DataCategory::Tensor, Precision::Any),
258+
})));
259+
260+
249261
// mul author=miaobyte
250262
tffactory.add_tf(std::make_shared<Mul<miaobyte>>(vector<Param>(
251263
{

excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,27 @@ namespace deepx::tensorfunc
194194
}
195195
};
196196

197+
template <typename T>
198+
struct rsubscalarDispatcher<miaobyte, T>
199+
{
200+
static void rsubscalar(const T scalar, const Tensor<T> &A, Tensor<T> &C)
201+
{
202+
elementwise_A_b_C<T>(A, scalar, C,
203+
// 标量操作
204+
[](const T &a,const T &scalar, T &c)
205+
{ c = scalar - a; },
206+
// SIMD操作
207+
[]( const T *a,const T scalar, T *c, size_t size)
208+
{
209+
const ScalableTag<T> tag;
210+
auto vec1 = Load(tag, a);
211+
auto vec_scalar = Set(tag, scalar);
212+
auto vec_result = Sub(vec_scalar, vec1);
213+
Store(vec_result, tag, c);
214+
});
215+
}
216+
};
217+
197218
// 添加 mul 的模板特化实现
198219
template <typename T>
199220
struct mulDispatcher<miaobyte, T>

0 commit comments

Comments
 (0)