Skip to content

Commit b2d66fd

Browse files
authored
_compute_llama3_parameters (#53)
* matmul: add benchmark support * rope: implement _compute_default_rope_parameters * rope: work-in-progress implementation of _compute_llama3_parameters
1 parent f7019aa commit b2d66fd

File tree

18 files changed

+209
-83
lines changed

18 files changed

+209
-83
lines changed

doc/excuter/op-mem-cuda/list.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
| cos | miaobyte | T3=cos(T1) | cos(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
6666
| notequalscalar | miaobyte | T1!=scalar->mask | notequalscalar(tensor<any> A, var<any> scalar, var<float32> epsilon)->(tensor<bool> mask) |
6767
| minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
68-
| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var<float64|int32> scalar, tensor<float64|float32> A)->(tensor<float64|float32> C) |
68+
| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var<float32|int32> scalar, tensor<float64|float32> A)->(tensor<float64|float32> C) |
6969
| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
7070
| less | miaobyte | mask=compare(T1, T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
7171
| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor<float64|float32> A, var<float64|int32> scalar)->(tensor<float64|float32> C) |

excuter/op-mem-cuda/src/client/main.cpp

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,6 @@ int main()
2828
deepx::tf::TfFactory tf_factory;
2929
register_all(tf_factory);
3030

31-
32-
3331
// 将op table输出到markdown文件
3432
string docdir = "../../../doc/excuter/op-mem-cuda/";
3533
std::ofstream md_file(docdir + "list.md");
@@ -68,13 +66,30 @@ int main()
6866
{
6967
opresp.error("op" + op.name + " not found");
7068
server.resp(opresp.to_string());
71-
cerr<<opresp.message<<endl;
69+
cerr << opresp.message << endl;
7270
continue;
7371
}
7472
(*src).init(op.name, op.args, op.returns);
73+
7574
memmutex.lock();
7675
opresp.start_at = chrono::system_clock::now();
77-
int ret = (*src).run(mem,opresp.message);
76+
int ret = 0;
77+
if ((*src).metadata.benchmark.repeat > 1)
78+
{
79+
for (int i = 0; i < (*src).metadata.benchmark.repeat; i++)
80+
{
81+
ret = (*src).run(mem, opresp.message);
82+
if (ret != 0)
83+
{
84+
break;
85+
}
86+
}
87+
}
88+
else
89+
{
90+
ret = (*src).run(mem, opresp.message);
91+
}
92+
7893
memmutex.unlock();
7994
if (ret != 0)
8095
{

excuter/op-mem-cuda/src/client/tfs.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ namespace deepx::tf
317317
// rpowscalar
318318
tffactory.add_tf(std::make_shared<RpowScalar<miaobyte>>(vector<Param>(
319319
{
320-
Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Int32),
320+
Param("scalar", DataCategory::Var, Precision::Float32 | Precision::Int32),
321321
Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32),
322322
}),
323323
vector<Param>(

excuter/op-mem-ompsimd/src/client/main.cpp

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ int main()
2828
client::udpserver server(8080);
2929
deepx::tf::TfFactory tf_factory;
3030
register_all(tf_factory);
31-
31+
3232
// 将op table输出到markdown文件
3333
string docdir = "../../../doc/excuter/op-mem-ompsimd/";
3434
std::ofstream md_file(docdir + "list.md");
@@ -72,14 +72,28 @@ int main()
7272
(*src).init(op.name, op.args, op.returns);
7373
memmutex.lock();
7474
opresp.start_at = chrono::system_clock::now();
75-
76-
int ret = (*src).run(mem,opresp.message);
75+
int ret = 0;
76+
if ((*src).metadata.benchmark.repeat > 1)
77+
{
78+
for (int i = 0; i < (*src).metadata.benchmark.repeat; i++)
79+
{
80+
ret = (*src).run(mem, opresp.message);
81+
if (ret != 0)
82+
{
83+
break;
84+
}
85+
}
86+
}
87+
else
88+
{
89+
ret = (*src).run(mem, opresp.message);
90+
}
7791
memmutex.unlock();
7892
if (ret != 0)
7993
{
8094
opresp.error(opresp.message);
8195
server.resp(opresp.to_string());
82-
cerr<<opresp.message<<endl;
96+
cerr << opresp.message << endl;
8397
continue;
8498
}
8599
opresp.finish("");

excuter/op-mem-ompsimd/src/client/tfs.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,7 @@ namespace deepx::tf
338338
// rpowscalar author=miaobyte
339339
tffactory.add_tf(std::make_shared<RpowScalar<miaobyte>>(vector<Param>(
340340
{
341-
Param("scalar", DataCategory::Var, Precision::Any),
341+
Param("scalar", DataCategory::Var, Precision::Float32),
342342
Param("A", DataCategory::Tensor, Precision::Any),
343343
}),
344344
vector<Param>(

excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,24 +15,29 @@ namespace deepx::tensorfunc
1515
throw std::invalid_argument("A.shape could matmul with B.shape");
1616
}
1717
//TODO
18-
//这里如果对二维矩阵运算,则omp并行不起来,因为C.shape.dim() - 2刚好=0
19-
C.shape.rangeParallel(C.shape.dim() - 2, [&](const std::vector<int> &indices)
20-
{
21-
int aIdx=A.shape.linearat(indices);
22-
int bIdx=B.shape.linearat(indices);
23-
int cIdx=C.shape.linearat(indices);
24-
int m=A.shape[-2];
25-
int k=A.shape[-1];
26-
int n=B.shape[-1];
27-
for(int i=0;i<m;i++){
28-
for(int j=0;j<n;j++){
29-
T sum=0;
30-
for(int l=0;l<k;l++){
31-
sum+=A.data[aIdx+i*k+l]*B.data[bIdx+l*n+j];
32-
}
33-
C.data[cIdx+i*n+j]=sum;
34-
}
35-
} });
18+
//这里需要进一步优化
19+
C.shape.rangeParallel(C.shape.dim(), [&A,&B,&C](const int idx,const std::vector<int> &indices,ThreadLocalVectors &tlv) {
20+
21+
// int m=A.shape[-2];
22+
int k=A.shape[-1];
23+
// int n=B.shape[-1];
24+
25+
std::copy(indices.begin(),indices.end()-2,tlv.get(0).begin());
26+
tlv.get(0)[indices.size()-2]=A.shape[-2];
27+
tlv.get(0)[indices.size()-1]=indices[-1];
28+
int aIdx=A.shape.linearat(tlv.get(0));
29+
std::copy(indices.begin(),indices.end()-2,tlv.get(1).begin());
30+
tlv.get(1)[indices.size()-2]=0;
31+
tlv.get(1)[indices.size()-1]=indices[-2];
32+
int bIdx=B.shape.linearat(tlv.get(1));
33+
int bstride=k;
34+
35+
T sum=0;
36+
for(int l=0;l<k;l++){
37+
sum+=A.data[aIdx+l]+B.data[bIdx+l*bstride];
38+
}
39+
C.data[idx]=sum;
40+
},{A.shape.dim(),B.shape.dim()});
3641
}
3742
};
3843

excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp

Lines changed: 13 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#ifndef DEEPX_TF_MATMUL_HPP
22
#define DEEPX_TF_MATMUL_HPP
3-
3+
44
#include "deepx/tf/tf.hpp"
55
#include "deepx/dtype.hpp"
66
#include "deepx/dtype_ompsimd.hpp"
@@ -21,7 +21,7 @@ namespace deepx::tf
2121
this->args = args;
2222
this->returns = returns;
2323
}
24-
24+
2525
string math_formula() const override
2626
{
2727
return "T3=T1 @ T2";
@@ -30,7 +30,17 @@ namespace deepx::tf
3030
{
3131
return make_shared<MatMul<Author>>(*this);
3232
}
33-
int compute(shared_ptr<MemBase> mem, Precision a_type,string &error){
33+
34+
int run(shared_ptr<MemBase> mem, string &error) override
35+
{
36+
Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
37+
Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
38+
Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
39+
if (a_type != b_type || a_type != c_type)
40+
{
41+
error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
42+
return 1;
43+
}
3444
switch (a_type)
3545
{
3646
case Precision::Float64:
@@ -57,30 +67,6 @@ namespace deepx::tf
5767
}
5868
return 0;
5969
}
60-
int run(shared_ptr<MemBase> mem, string &error) override
61-
{
62-
Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
63-
Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
64-
Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
65-
if (a_type != b_type || a_type != c_type)
66-
{
67-
error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
68-
return 1;
69-
}
70-
if (metadata.benchmark.repeat > 0)
71-
{
72-
for (int i = 0; i < metadata.benchmark.repeat; i++)
73-
{
74-
if (compute(mem, a_type, error))
75-
{
76-
return 1;
77-
}
78-
}
79-
}else{
80-
return compute(mem, a_type, error);
81-
}
82-
return 0;
83-
}
8470
};
8571
}
8672

front/py/deepx/nn/functional/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
"sqrt","pow","exp","log",
2626
"min","max",
2727
"less","greater","equal","notequal",
28-
"switch",
28+
"switch","where",
2929
"todtype",
3030
"invert",
3131
"matmul",

front/py/deepx/nn/functional/elementwise.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,7 @@ def bool(input:Tensor)->Tensor:
5454
from .leaffunc_elementwise import todtype
5555
dest=newtensor(input.shape,dtype='bool',name=input.name)
5656
return todtype(input,dest)
57+
58+
def where(condition:Tensor,x:Tensor,y:Tensor)->Tensor:
59+
from .leaffunc_elementwise import switch_func
60+
return switch_func((x,y),condition)

front/py/deepx/nn/functional/leaffunc_matmul.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from .leaffunc_life import newtensor
55
from .authormap import defaultauthor
66

7-
def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',bench:tuple[int,int]=None)->Tensor:
7+
def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',bench:int=None)->Tensor:
88
outtensor=out
99
if isinstance(out,str) or out is None:
1010
outshape=Shape.matmul(a.shape,b.shape)

0 commit comments

Comments (0)