rsubscalar:增加rsubscalar (#61)

miaobyte · web-flow · commit beefa7073ec3 · 2025-05-18T17:45:50.000+08:00
diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md
@@ -57,6 +57,9 @@
 | maxscalar | miaobyte | T3=max(T1, scalar) | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
 | tan | miaobyte | T3=tan(T1) | tan(tensor<float64|float32> A)->(tensor<float64|float32> C) |
 | sin | miaobyte | T3=sin(T1) | sin(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
+| less | miaobyte | mask=compare(T1, T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
+| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor<float64|float32> A, var<float64|int32> scalar)->(tensor<float64|float32> C) |
+| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
 | divscalar | miaobyte | T3=scalar/T1 | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
 | log | miaobyte | T3=log(T1) | log(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
 | addscalar | miaobyte | T3=T1+scalar | addscalar(tensor<any> A, var<any> b)->(tensor<any> C) |
@@ -67,8 +70,6 @@
 | minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
 | rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var<float32|int32> scalar, tensor<float64|float32> A)->(tensor<float64|float32> C) |
 | rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
-| less | miaobyte | mask=compare(T1, T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
-| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor<float64|float32> A, var<float64|int32> scalar)->(tensor<float64|float32> C) |
 | todtype |  none  | T3(dtypeA)->T1(dtypeB) | todtype(tensor<any> a)->(tensor<any> b) |
 | add | cublas | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
 | add | miaobyte | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md
@@ -58,6 +58,9 @@
 | maxscalar | miaobyte | T3=max(T1,scalar) | maxscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
 | tan | miaobyte | T3=tan(T1) | tan(tensor<any> A)->(tensor<any> C) |
 | sin | miaobyte | T3=sin(T1) | sin(tensor<any> A)->(tensor<any> C) |
+| less | miaobyte | mask=less(T1,T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
+| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
+| rsubscalar | miaobyte | T3=scalar-T1 | rsubscalar(var<any> scalar, tensor<any> a)->(tensor<any> c) |
 | divscalar | miaobyte | T3=T1/scalar | divscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
 | log | miaobyte | T3=log(T1) | log(tensor<any> A)->(tensor<any> C) |
 | addscalar | miaobyte | T3=T1+scalar | addscalar(tensor<any> a, var<any> scalar)->(tensor<any> c) |
@@ -68,8 +71,6 @@
 | minscalar | miaobyte | T3=min(T1,scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
 | rpowscalar | miaobyte | T3=scalar^T1 | rpowscalar(var<float32> scalar, tensor<any> A)->(tensor<any> C) |
 | rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
-| less | miaobyte | mask=less(T1,T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
-| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
 | todtype |  none  | T3(dtypeA)->T1(dtypeB) | todtype(tensor<any> A)->(tensor<any> C) |
 | add | cblas | T3=T1+T2 | add(tensor<float64|float32> a, tensor<float64|float32> b)->(tensor<float64|float32> c) |
 | add | miaobyte | T3=T1+T2 | add(tensor<any> a, tensor<any> b)->(tensor<any> c) |
diff --git a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp
@@ -58,6 +58,7 @@ namespace deepx::tensorfunc
         subDispatcher<Author, T>::sub(A, B, C);
     }
 
+    // A-scalar=>C
     template <typename Author, typename T>
     struct subscalarDispatcher
     {
@@ -66,20 +67,34 @@ namespace deepx::tensorfunc
             throw NotImplementError("subscalar");
         }
     };
-
-    // A-scalar=>C
     template <typename Author, typename T>
     void subscalar(const Tensor<T> &input, const T value, Tensor<T> &output)
     {
         subscalarDispatcher<Author, T>::subscalar(input, value, output);
     }
 
+
+
+    //scalar-A=>C
+    template <typename Author, typename T>
+    struct rsubscalarDispatcher
+    {
+        static void rsubscalar(const T value, const Tensor<T> &input, Tensor<T> &output) = delete;
+    };
+    template <typename Author, typename T>
+    void rsubscalar(const T value, const Tensor<T> &input, Tensor<T> &output)
+    {
+        rsubscalarDispatcher<Author, T>::rsubscalar(value, input, output);
+    }
+
+
     template <typename Author, typename T>
     struct mulDispatcher
     {
         static void mul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C) = delete;
     };
 
+
     // A*B=>C
     template <typename Author, typename T>
     void mul(const Tensor<T> &A, const Tensor<T> &B, Tensor<T> &C)
diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp
@@ -232,6 +232,16 @@ namespace deepx::tf
                                                                    {
                                                                        Param("C", DataCategory::Tensor, Precision::Any),
                                                                    })));
+        tffactory.add_tf(std::make_shared<RSubScalar<miaobyte>>(vector<Param>(
+                                                                   {
+                                                                       Param("scalar", DataCategory::Var, Precision::Any),
+                                                                       Param("A", DataCategory::Tensor, Precision::Any),
+                                                                   }),
+                                                               vector<Param>(
+                                                                   {
+                                                                       Param("C", DataCategory::Tensor, Precision::Any),
+                                                                   })));
+                                                                   
         tffactory.add_tf(std::make_shared<Mul<miaobyte>>(vector<Param>(
                                                              {
                                                                  Param("A", DataCategory::Tensor, Precision::Any),
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu
@@ -225,6 +225,37 @@ namespace deepx::tensorfunc
     template void launch_subscalar<int16_t>(const int16_t *a, const int16_t scalar, int16_t *c, const int size);
     template void launch_subscalar<int8_t>(const int8_t *a, const int8_t scalar, int8_t *c, const int size);
 
+    // rsubscalar
+    template <typename T>
+    __global__ void rsubscalar_kernel(const T scalar, const T* A, T* C,const int size){
+        int stride = blockDim.x * gridDim.x;
+        for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
+        {
+            C[idx] = scalar - A[idx];
+        }   
+    }
+
+    template <typename T>
+    void launch_rsubscalar(const T scalar, const T* a, T* c,const int size){
+        auto [numBlocks, blockSize] = BestDims(size);
+        rsubscalar_kernel<<<numBlocks, blockSize>>>(scalar, a, c, size);
+        cudaError_t err = cudaGetLastError();
+        if (err != cudaSuccess)
+        {
+            throw std::runtime_error("Failed to launch rsubscalar kernel: "+std::string(cudaGetErrorString(err)));
+        }
+    }
+    template void launch_rsubscalar<double>(const double scalar, const double* a, double* c,const int size);
+    template void launch_rsubscalar<float>(const float scalar, const float* a, float* c,const int size);
+    template void launch_rsubscalar<half>(const half scalar, const half* a, half* c,const int size);
+    template void launch_rsubscalar<nv_bfloat16>(const nv_bfloat16 scalar, const nv_bfloat16* a, nv_bfloat16* c,const int size);
+    template void launch_rsubscalar<int64_t>(const int64_t scalar, const int64_t* a, int64_t* c,const int size);
+    template void launch_rsubscalar<int32_t>(const int32_t scalar, const int32_t* a, int32_t* c,const int size);
+    template void launch_rsubscalar<int16_t>(const int16_t scalar, const int16_t* a, int16_t* c,const int size);
+    template void launch_rsubscalar<int8_t>(const int8_t scalar, const int8_t* a, int8_t* c,const int size);
+
+ 
+
     // mul
     template <typename T>
     __global__ void mul_kernel(const T *A, const T *B, T *C, const int size)
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh
@@ -44,7 +44,15 @@ namespace deepx::tensorfunc
 
     template <typename T>
     void launch_subscalar(const T* a, const T scalar, T* c,const int size);
- 
+    
+
+    // rsubscalar
+    template <typename T>
+    __global__ void rsubscalar_kernel(const T scalar, const T* A, T* C,const int size);
+
+    template <typename T>
+    void launch_rsubscalar(const T scalar, const T* a, T* c,const int size);
+
     // mul
     template <typename T>
     __global__ void mul_kernel(const T* A, const T* B, T* C,const int size);
diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp
@@ -69,6 +69,18 @@ namespace deepx::tensorfunc
         }
     };  
 
+    template <typename T>
+    struct rsubscalarDispatcher<miaobyte, T>
+    {
+        static void rsubscalar(const T scalar, const Tensor<T> &A, Tensor<T> &C)
+        {
+            if (A.shape.size != C.shape.size) { 
+                throw TensorShapeError("rsubscalar");
+            }
+            launch_rsubscalar(scalar, A.data, C.data, A.shape.size);
+        }
+    };  
+    
     template <typename T>
     struct mulDispatcher<miaobyte, T>
     {
diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp
@@ -636,6 +636,75 @@ namespace deepx::tf
         }
     };
 
+    // rsubscalar
+    template <typename Author>
+    class RSubScalar : public TF
+    {
+    public:
+        RSubScalar(const vector<Param> &args, const vector<Param> &returns)
+        {
+            this->name = "rsubscalar";
+            this->metadata.author = Author::name();
+            this->tftype = "elementwise";
+            this->args = args;
+            this->returns = returns;
+        }
+
+        string math_formula() const override
+        {
+            return "T3=scalar-T1";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<RSubScalar<Author>>(*this);
+        }
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error))
+            {
+                return 1;
+            }
+            Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
+            Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (a_type != c_type)
+            {
+                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type);
+                return 1;
+            }
+            switch (a_type)
+            {
+            case Precision::Float64:
+                tensorfunc::rsubscalar<Author, double>(this->getvar<double>(1, mem), *mem->gettensor<double>(this->args[0].textvalue), *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                tensorfunc::rsubscalar<Author, float>(this->getvar<float>(1, mem), *mem->gettensor<float>(this->args[0].textvalue), *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Float16:
+                tensorfunc::rsubscalar<Author, half>(this->getvar<half>(1, mem), *mem->gettensor<half>(this->args[0].textvalue), *mem->gettensor<half>(this->returns[0].textvalue));
+                break;
+            case Precision::BFloat16:
+                tensorfunc::rsubscalar<Author, nv_bfloat16>(this->getvar<nv_bfloat16>(1, mem), *mem->gettensor<nv_bfloat16>(this->args[0].textvalue), *mem->gettensor<nv_bfloat16>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64:
+                tensorfunc::rsubscalar<Author, int32_t>(this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                tensorfunc::rsubscalar<Author, int32_t>(this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(this->args[0].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                tensorfunc::rsubscalar<Author, int16_t>(this->getvar<int16_t>(1, mem), *mem->gettensor<int16_t>(this->args[0].textvalue), *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                tensorfunc::rsubscalar<Author, int8_t>(this->getvar<int8_t>(1, mem), *mem->gettensor<int8_t>(this->args[0].textvalue), *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            default:
+                error = "Unsupported dtype: " + precision_str(a_type);
+                return 1;
+            }
+            return 0;
+        }
+    };
+ 
     template <typename Author>
     class Mul : public TF
     {
diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp
@@ -246,6 +246,18 @@ namespace deepx::tf
                                                                    {
                                                                        Param("c", DataCategory::Tensor, Precision::Any),
                                                                    })));
+        // rsubscalar author=miaobyte
+        tffactory.add_tf(std::make_shared<RSubScalar<miaobyte>>(vector<Param>(
+                                                                   {
+                                                                       Param("scalar", DataCategory::Var, Precision::Any),
+                                                                       Param("a", DataCategory::Tensor, Precision::Any),
+                                                                   }),
+                                                               vector<Param>(
+                                                                   {
+                                                                       Param("c", DataCategory::Tensor, Precision::Any),
+                                                                   })));
+
+
         // mul author=miaobyte
         tffactory.add_tf(std::make_shared<Mul<miaobyte>>(vector<Param>(
                                                              {
diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp
@@ -194,6 +194,27 @@ namespace deepx::tensorfunc
         }
     };
 
+    template <typename T>
+    struct rsubscalarDispatcher<miaobyte, T>
+    {
+        static void rsubscalar(const T scalar, const Tensor<T> &A, Tensor<T> &C)
+    {
+        elementwise_A_b_C<T>(A, scalar, C,
+                             // 标量操作
+                             [](const T &a,const T &scalar, T &c)
+                             { c = scalar - a; },
+                             // SIMD操作
+                             []( const T *a,const T scalar, T *c, size_t size)
+                             {
+                                const ScalableTag<T> tag;
+                                auto vec1 = Load(tag, a);
+                                auto vec_scalar = Set(tag, scalar);
+                                auto vec_result = Sub(vec_scalar, vec1);
+                                Store(vec_result, tag, c);
+                             });
+        }
+    };
+
     // 添加 mul 的模板特化实现
     template <typename T>
     struct mulDispatcher<miaobyte, T>
diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp
@@ -495,6 +495,67 @@ namespace deepx::tf
             return 0;
         }
     };
+    
+    template <typename Author>
+    class RSubScalar : public TF
+    {
+    public:
+        RSubScalar(vector<Param> args, vector<Param> returns)
+        {
+            this->name = "rsubscalar";
+            this->metadata.author = Author::name();
+            this->tftype = "elementwise";
+            this->args = args;
+            this->returns = returns;
+        }
+        string math_formula() const override
+        {
+            return "T3=scalar-T1";
+        }
+        shared_ptr<TF> clone() const override
+        {
+            return make_shared<RSubScalar<Author>>(*this);
+        }
+        int run(shared_ptr<MemBase> mem, string &error) override
+        {
+            if (!checktensors({this->args[1].textvalue,this->returns[0].textvalue}, mem, error)!=0)
+            {
+                return 1;
+            }
+            Precision a_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
+            Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
+            if (a_type != c_type)
+            {
+                error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(c_type);
+                return 1;
+            }
+            switch (a_type)
+            {
+            case Precision::Float64:
+                tensorfunc::rsubscalar<Author, double>(this->getvar<double>(1, mem), *mem->gettensor<double>(this->args[1].textvalue), *mem->gettensor<double>(this->returns[0].textvalue));
+                break;
+            case Precision::Float32:
+                tensorfunc::rsubscalar<Author, float>(this->getvar<float>(1, mem), *mem->gettensor<float>(this->args[1].textvalue), *mem->gettensor<float>(this->returns[0].textvalue));
+                break;
+            case Precision::Int64:
+                tensorfunc::rsubscalar<Author, int64_t>(this->getvar<int64_t>(1, mem), *mem->gettensor<int64_t>(this->args[1].textvalue), *mem->gettensor<int64_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int32:
+                tensorfunc::rsubscalar<Author, int32_t>(this->getvar<int32_t>(1, mem), *mem->gettensor<int32_t>(this->args[1].textvalue), *mem->gettensor<int32_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int16:
+                tensorfunc::rsubscalar<Author, int16_t>(this->getvar<int16_t>(1, mem), *mem->gettensor<int16_t>(this->args[1].textvalue), *mem->gettensor<int16_t>(this->returns[0].textvalue));
+                break;
+            case Precision::Int8:
+                tensorfunc::rsubscalar<Author, int8_t>(this->getvar<int8_t>(1, mem), *mem->gettensor<int8_t>(this->args[1].textvalue), *mem->gettensor<int8_t>(this->returns[0].textvalue));
+                break;
+            default:
+                error = "Unsupported dtype: " + precision_str(a_type);
+                return 1;
+            }
+            return 0;
+        }
+    };
  
     template <typename Author>
     class Mul : public TF
diff --git a/front/py/deepx/nn/functional/authormap.py b/front/py/deepx/nn/functional/authormap.py
@@ -11,6 +11,7 @@
         'addscalar':'miaobyte',
         'sub':'miaobyte',
         'subscalar':'miaobyte',
+        'rsubscalar':'miaobyte',
         'mul':'miaobyte',
         'mulscalar':'miaobyte',
         'div':'miaobyte',
diff --git a/front/py/deepx/nn/functional/leaffunc_elementwise.py b/front/py/deepx/nn/functional/leaffunc_elementwise.py
diff --git a/front/py/deepx/nn/functional/rtf_elementwise.py b/front/py/deepx/nn/functional/rtf_elementwise.py
diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py
diff --git a/front/py/deepx/tensor/elementwise.py b/front/py/deepx/tensor/elementwise.py
diff --git a/front/py/examples/2_ir/2_elementwise_add.py b/front/py/examples/2_ir/2_elementwise_add.py