summaryrefslogtreecommitdiff
path: root/sc/source/core
diff options
context:
space:
mode:
authorWei Wei <weiwei@multicorewareinc.com>2013-11-15 17:33:19 -0600
committerI-Jui (Ray) Sung <ray@multicorewareinc.com>2013-11-15 18:02:20 -0600
commit1e3bc2925c0ec1b03d6ae7cf3f281b0df3ec88d3 (patch)
tree1c9b6da1792ebc37dc9933fd9b410d8b5d220ace /sc/source/core
parent501bc66c780ab8fde801eeedc1f7c89762050713 (diff)
GPU Calc: implemented parallel reduction for SUMIFS
For now only works for fixed and sliding fixed-sized windows. Change-Id: I25e3f893a86d0e1723ae1e1633ffeeee93926b8d Signed-off-by: I-Jui (Ray) Sung <ray@multicorewareinc.com>
Diffstat (limited to 'sc/source/core')
-rw-r--r--sc/source/core/opencl/formulagroupcl.cxx163
-rw-r--r--sc/source/core/opencl/op_math.cxx225
-rw-r--r--sc/source/core/opencl/op_math.hxx4
-rw-r--r--sc/source/core/opencl/opbase.cxx43
-rw-r--r--sc/source/core/opencl/opbase.hxx6
5 files changed, 365 insertions, 76 deletions
diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx
index e4b6bfbd4286..79c33ae09caa 100644
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -419,42 +419,51 @@ public:
bIsStartFixed = mpDVR->IsStartFixed();
bIsEndFixed = mpDVR->IsEndFixed();
}
+ virtual bool NeedParallelReduction(void) const
+ {
+ return GetWindowSize()> 100 &&
+ ( (GetStartFixed() && GetEndFixed()) ||
+ (!GetStartFixed() && !GetEndFixed()) ) ;
+ }
virtual void GenSlidingWindowFunction(std::stringstream &ss) {
- std::string name = Base::GetName();
- ss << "__kernel void "<<name;
- ss << "_reduction(__global double* A, "
- "__global double *result,int arrayLength,int windowSize){\n";
- ss << " double tmp, current_result = 0.0;\n";
- ss << " int writePos = get_group_id(1);\n";
- ss << " int offset = get_group_id(1);\n";
- ss << " int lidx = get_local_id(0);\n";
- ss << " __local double shm_buf[256];\n";
- ss << " if (arrayLength == windowSize)\n";
- ss << " offset = 0;\n";
- ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
- ss << " int loop = arrayLength/512 + 1;\n";
- ss << " for (int l=0; l<loop; l++){\n";
- ss << " tmp = 0.0;\n";
- ss << " int loopOffset = l*512;\n";
- ss << " if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n";
- ss << " tmp = A[loopOffset + lidx + offset] + "
- "A[loopOffset + lidx + offset + 256];\n";
- ss << " else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n";
- ss << " tmp = A[loopOffset + lidx + offset];\n";
- ss << " shm_buf[lidx] = tmp;\n";
- ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
- ss << " for (int i = 128; i >0; i/=2) {\n";
- ss << " if (lidx < i)\n";
- ss << " shm_buf[lidx] += shm_buf[lidx + i];\n";
- ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
- ss << " }\n";
- ss << " if (lidx == 0)\n";
- ss << " current_result += shm_buf[0];\n";
- ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
- ss << " }\n";
- ss << " if (lidx == 0)\n";
- ss << " result[writePos] = current_result;\n";
- ss << "}\n";
+ if (dynamic_cast<OpSum*>(mpCodeGen.get()))
+ {
+ std::string name = Base::GetName();
+ ss << "__kernel void "<<name;
+ ss << "_reduction(__global double* A, "
+ "__global double *result,int arrayLength,int windowSize){\n";
+ ss << " double tmp, current_result = 0.0;\n";
+ ss << " int writePos = get_group_id(1);\n";
+ ss << " int offset = get_group_id(1);\n";
+ ss << " int lidx = get_local_id(0);\n";
+ ss << " __local double shm_buf[256];\n";
+ ss << " if (arrayLength == windowSize)\n";
+ ss << " offset = 0;\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " int loop = arrayLength/512 + 1;\n";
+ ss << " for (int l=0; l<loop; l++){\n";
+ ss << " tmp = 0.0;\n";
+ ss << " int loopOffset = l*512;\n";
+ ss << " if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n";
+ ss << " tmp = A[loopOffset + lidx + offset] + "
+ "A[loopOffset + lidx + offset + 256];\n";
+ ss << " else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n";
+ ss << " tmp = A[loopOffset + lidx + offset];\n";
+ ss << " shm_buf[lidx] = tmp;\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " for (int i = 128; i >0; i/=2) {\n";
+ ss << " if (lidx < i)\n";
+ ss << " shm_buf[lidx] += shm_buf[lidx + i];\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+ ss << " if (lidx == 0)\n";
+ ss << " current_result += shm_buf[0];\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+ ss << " if (lidx == 0)\n";
+ ss << " result[writePos] = current_result;\n";
+ ss << "}\n";
+ }
}
@@ -573,11 +582,16 @@ public:
if (CL_SUCCESS != err)
throw OpenCLError(err);
// reproduce the reduction function name
- std::string kernelName = Base::GetName() + "_reduction";
+ std::string kernelName;
+ if (dynamic_cast<OpSum*>(mpCodeGen.get()))
+ kernelName = Base::GetName() + "_reduction";
+ else throw Unhandled();
+
cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err);
if (err != CL_SUCCESS)
throw OpenCLError(err);
// set kernel arg of reduction kernel
+ // TODO(Wei Wei): use unique name for kernel
err = clSetKernelArg(redKernel, 0, sizeof(cl_mem),
(void *)&(Base::mpClmem));
if (CL_SUCCESS != err)
@@ -621,6 +635,14 @@ public:
}
}
+ size_t GetArrayLength(void) const {return mpDVR->GetArrayLength(); }
+
+ size_t GetWindowSize(void) const {return mpDVR->GetRefRowSize(); }
+
+ size_t GetStartFixed(void) const {return bIsStartFixed; }
+
+ size_t GetEndFixed(void) const {return bIsEndFixed; }
+
protected:
bool bIsStartFixed, bIsEndFixed;
const formula::DoubleVectorRefToken *mpDVR;
@@ -1001,6 +1023,75 @@ public:
{
i += (*it)->Marshal(k, argno + i, nVectorWidth, pProgram);
}
+ if (OpSumIfs *OpSumCodeGen = dynamic_cast<OpSumIfs*>(mpCodeGen.get()))
+ {
+ assert(mpClmem == NULL);
+ // Obtain cl context
+ KernelEnv kEnv;
+ OpenclDevice::setKernelEnv(&kEnv);
+ cl_int err;
+ DynamicKernelSlidingArgument<DynamicKernelArgument> *slidingArgPtr =
+ dynamic_cast< DynamicKernelSlidingArgument<DynamicKernelArgument> *>
+ (mvSubArguments[0].get());
+ cl_mem mpClmem2;
+
+ if (OpSumCodeGen->NeedReductionKernel())
+ {
+ assert(slidingArgPtr);
+ size_t nInput = slidingArgPtr -> GetArrayLength();
+ size_t nCurWindowSize = slidingArgPtr -> GetWindowSize();
+ std::vector<cl_mem> vclmem;
+
+ for (SubArgumentsType::iterator it = mvSubArguments.begin(), e= mvSubArguments.end(); it!=e;
+ ++it)
+ {
+ vclmem.push_back((*it)->GetCLBuffer());
+ }
+ mpClmem2 = clCreateBuffer(kEnv.mpkContext, CL_MEM_READ_WRITE,
+ sizeof(double)*nVectorWidth, NULL, &err);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+
+ std::string kernelName = "SumIfs_reduction";
+ cl_kernel redKernel = clCreateKernel(pProgram, kernelName.c_str(), &err);
+ if (err != CL_SUCCESS)
+ throw OpenCLError(err);
+
+ // set kernel arg of reduction kernel
+ for (size_t j=0; j< vclmem.size(); j++){
+ err = clSetKernelArg(redKernel, j, sizeof(cl_mem),
+ (void *)&vclmem[j]);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+ }
+ err = clSetKernelArg(redKernel, vclmem.size(), sizeof(cl_mem), (void *)&mpClmem2);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+
+ err = clSetKernelArg(redKernel, vclmem.size()+1, sizeof(cl_int), (void*)&nInput);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+
+ err = clSetKernelArg(redKernel, vclmem.size()+2, sizeof(cl_int), (void*)&nCurWindowSize);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+ // set work group size and execute
+ size_t global_work_size[] = {256, (size_t)nVectorWidth };
+ size_t local_work_size[] = {256, 1};
+ err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, NULL,
+ global_work_size, local_work_size, 0, NULL, NULL);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+ err = clFinish(kEnv.mpkCmdQueue);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+
+ // Pass mpClmem2 to the "real" kernel
+ err = clSetKernelArg(k, argno, sizeof(cl_mem), (void *)&mpClmem2);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+ }
+ }
return i;
}
diff --git a/sc/source/core/opencl/op_math.cxx b/sc/source/core/opencl/op_math.cxx
index 5e5c7494bc2c..30eb759303f5 100644
--- a/sc/source/core/opencl/op_math.cxx
+++ b/sc/source/core/opencl/op_math.cxx
@@ -448,6 +448,142 @@ void OpSumIfs::GenSlidingWindowFunction(std::stringstream &ss,
size_t nCurWindowSize = pCurDVR->GetArrayLength() <
pCurDVR->GetRefRowSize() ? pCurDVR->GetArrayLength():
pCurDVR->GetRefRowSize() ;
+
+ mNeedReductionKernel = vSubArguments[0]->NeedParallelReduction();
+ if (mNeedReductionKernel)
+ {
+ // generate reduction functions
+ ss << "__kernel void ";
+ ss << "SumIfs_reduction( ";
+ for (unsigned i = 0; i < vSubArguments.size(); i++)
+ {
+ if (i)
+ ss << ",";
+ vSubArguments[i]->GenSlidingWindowDecl(ss);
+ }
+ ss << ", __global double *result,int arrayLength,int windowSize";
+
+ ss << ")\n{\n";
+ ss << " double tmp =0;\n";
+ ss << " int i ;\n";
+
+ GenTmpVariables(ss,vSubArguments);
+ ss << " double current_result = 0.0;\n";
+ ss << " int writePos = get_group_id(1);\n";
+ if (pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed())
+ ss << " int offset = 0;\n";
+ else if (!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed())
+ ss << " int offset = get_group_id(1);\n";
+ else
+ throw Unhandled();
+ // actually unreachable
+ ss << " int lidx = get_local_id(0);\n";
+ ss << " __local double shm_buf[256];\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " int loop = arrayLength/512 + 1;\n";
+ ss << " for (int l=0; l<loop; l++){\n";
+ ss << " tmp = 0.0;\n";
+ ss << " int loopOffset = l*512;\n";
+
+ ss << " int p1 = loopOffset + lidx + offset, p2 = p1 + 256;\n";
+ ss << " if (p2 < min(offset + windowSize, arrayLength)) {\n";
+ ss << " tmp0 = 0.0;\n";
+ int mm=0;
+ std::string p1 = "p1";
+ std::string p2 = "p2";
+ for(unsigned j=1;j<vSubArguments.size();j+=2,mm++)
+ {
+ CheckSubArgumentIsNan2(ss,vSubArguments,j,p1);
+ CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p1);
+ ss << "";
+ ss <<" if(isequal(";
+ ss <<"tmp";
+ ss <<j;
+ ss <<" , ";
+ ss << "tmp";
+ ss << j+1;
+ ss << "))";
+ ss << "{\n";
+ }
+ CheckSubArgumentIsNan2(ss,vSubArguments,0,p1);
+ ss << " tmp += tmp0;\n";
+ for(unsigned j=1;j<vSubArguments.size();j+=2,mm--)
+ {
+ for(int n = 0;n<mm+1;n++)
+ {
+ ss << " ";
+ }
+ ss<< "}\n\n";
+ }
+ mm=0;
+ for(unsigned j=1;j<vSubArguments.size();j+=2,mm++)
+ {
+ CheckSubArgumentIsNan2(ss,vSubArguments,j,p2);
+ CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p2);
+ ss <<" if(isequal(";
+ ss <<"tmp";
+ ss <<j;
+ ss <<" , ";
+ ss << "tmp";
+ ss << j+1;
+ ss << ")){\n";
+ }
+ CheckSubArgumentIsNan2(ss,vSubArguments,0,p2);
+ ss << " tmp += tmp0;\n";
+ for(unsigned j=1;j< vSubArguments.size();j+=2,mm--)
+ {
+ for(int n = 0;n<mm+1;n++)
+ {
+ ss << " ";
+ }
+ ss<< "}\n";
+ }
+ ss << " }\n";
+
+ ss << " else if (p1 < min(arrayLength, offset + windowSize)) {\n";
+ mm=0;
+ for(unsigned j=1;j<vSubArguments.size();j+=2,mm++)
+ {
+ CheckSubArgumentIsNan2(ss,vSubArguments,j,p1);
+ CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p1);
+
+ ss <<" if(isequal(";
+ ss <<"tmp";
+ ss <<j;
+ ss <<" , ";
+ ss << "tmp";
+ ss << j+1;
+ ss << ")){\n";
+ }
+ CheckSubArgumentIsNan2(ss,vSubArguments,0,p1);
+ ss << " tmp += tmp0;\n";
+ for(unsigned j=1;j<vSubArguments.size();j+=2,mm--)
+ {
+ for(int n = 0;n<mm+1;n++)
+ {
+ ss << " ";
+ }
+ ss<< "}\n\n";
+ }
+
+ ss << " }\n";
+ ss << " shm_buf[lidx] = tmp;\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " for (int i = 128; i >0; i/=2) {\n";
+ ss << " if (lidx < i)\n";
+ ss << " shm_buf[lidx] += shm_buf[lidx + i];\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+ ss << " if (lidx == 0)\n";
+ ss << " current_result += shm_buf[0];\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+
+ ss << " if (lidx == 0)\n";
+ ss << " result[writePos] = current_result;\n";
+ ss << "}\n";
+ }// finish generate reduction code
+ // generate functions as usual
ss << "\ndouble " << sSymName;
ss << "_"<< BinFuncName() <<"(";
for (unsigned i = 0; i < vSubArguments.size(); i++)
@@ -459,48 +595,57 @@ void OpSumIfs::GenSlidingWindowFunction(std::stringstream &ss,
ss << ")\n {\n";
ss <<" int gid0=get_global_id(0);\n";
ss << " double tmp =0;\n";
- ss << " int i ;\n";
- GenTmpVariables(ss,vSubArguments);
- ss << " for (i = ";
- if (!pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed()) {
- ss << "gid0; i < "<< nCurWindowSize <<"; i++)\n";
- } else if (pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) {
- ss << "0; i < gid0+"<< nCurWindowSize <<"; i++)\n";
- } else {
- ss << "0; i < "<< nCurWindowSize <<"; i++)\n";
- }
- ss << " {\n";
- if(!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed())
- {
- ss<< " int doubleIndex =i+gid0;\n";
- }else
- {
- ss<< " int doubleIndex =i;\n";
- }
- ss<< " int singleIndex =gid0;\n";
- int m=0;
- for(unsigned j=1;j<vSubArguments.size();j+=2,m++)
- {
- CheckSubArgumentIsNan(ss,vSubArguments,j);
- CheckSubArgumentIsNan(ss,vSubArguments,j+1);
- ss <<" if(isequal(";
- ss <<"tmp";
- ss <<j;
- ss <<" , ";
- ss << "tmp";
- ss << j+1;
- ss << ")){\n";
- }
- CheckSubArgumentIsNan(ss,vSubArguments,0);
- ss << " tmp += tmp0;\n";
- for(unsigned j=1;j<=vSubArguments.size();j+=2,m--)
- {
- for(int n = 0;n<m+1;n++)
+ if (!mNeedReductionKernel)
+ {
+ ss << " int i ;\n";
+ GenTmpVariables(ss,vSubArguments);
+ ss << " for (i = ";
+ if (!pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed()) {
+ ss << "gid0; i < "<< nCurWindowSize <<"; i++)\n";
+ } else if (pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) {
+ ss << "0; i < gid0+"<< nCurWindowSize <<"; i++)\n";
+ } else {
+ ss << "0; i < "<< nCurWindowSize <<"; i++)\n";
+ }
+ ss << " {\n";
+ if(!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed())
{
- ss << " ";
+ ss<< " int doubleIndex =i+gid0;\n";
+ }else
+ {
+ ss<< " int doubleIndex =i;\n";
}
- ss<< "}\n";
- }
+ ss<< " int singleIndex =gid0;\n";
+ int m=0;
+ for(unsigned j=1;j<vSubArguments.size();j+=2,m++)
+ {
+ CheckSubArgumentIsNan(ss,vSubArguments,j);
+ CheckSubArgumentIsNan(ss,vSubArguments,j+1);
+ ss <<" if(isequal(";
+ ss <<"tmp";
+ ss <<j;
+ ss <<" , ";
+ ss << "tmp";
+ ss << j+1;
+ ss << ")){\n";
+ }
+ CheckSubArgumentIsNan(ss,vSubArguments,0);
+ ss << " tmp += tmp0;\n";
+ for(unsigned j=1;j<=vSubArguments.size();j+=2,m--)
+ {
+ for(int n = 0;n<m+1;n++)
+ {
+ ss << " ";
+ }
+ ss<< "}\n";
+ }
+ }
+ if (mNeedReductionKernel)
+ {
+ ss << "tmp =";
+ vSubArguments[0]->GenDeclRef(ss);
+ ss << "[gid0];\n";
+ }
ss << "return tmp;\n";
ss << "}";
}
diff --git a/sc/source/core/opencl/op_math.hxx b/sc/source/core/opencl/op_math.hxx
index 01cbc82fee40..7081b00b952f 100644
--- a/sc/source/core/opencl/op_math.hxx
+++ b/sc/source/core/opencl/op_math.hxx
@@ -33,9 +33,13 @@ public:
class OpSumIfs: public CheckVariables
{
public:
+ OpSumIfs(void): CheckVariables(), mNeedReductionKernel(false) {}
virtual void GenSlidingWindowFunction(std::stringstream &ss,
const std::string sSymName, SubArguments &vSubArguments);
virtual std::string BinFuncName(void) const { return "SumIfs"; }
+ bool NeedReductionKernel(void) const { return mNeedReductionKernel; }
+protected:
+ bool mNeedReductionKernel;
};
class OpCosh: public Normal
{
diff --git a/sc/source/core/opencl/opbase.cxx b/sc/source/core/opencl/opbase.cxx
index 699833c3c285..07425dfe245e 100644
--- a/sc/source/core/opencl/opbase.cxx
+++ b/sc/source/core/opencl/opbase.cxx
@@ -156,6 +156,49 @@ void CheckVariables::CheckSubArgumentIsNan( std::stringstream & ss,
ss << vSubArguments[i]->GenSlidingWindowDeclRef();
ss<<";\n";
}
+
+void CheckVariables::CheckSubArgumentIsNan2( std::stringstream & ss,
+ SubArguments &vSubArguments, int argumentNum, std::string p)
+{
+ int i = argumentNum;
+ if(vSubArguments[i]->GetFormulaToken()->GetType() == formula::svDouble)
+ {
+ ss <<" tmp";
+ ss <<i;
+ ss << "=";
+ vSubArguments[i]->GenDeclRef(ss);
+ ss<<";\n";
+ return;
+ }
+
+#ifdef ISNAN
+ ss<< " tmp";
+ ss<< i;
+ ss<< "= fsum(";
+ vSubArguments[i]->GenDeclRef(ss);
+ if(vSubArguments[i]->GetFormulaToken()->GetType() ==
+ formula::svDoubleVectorRef)
+ ss<<"["<< p.c_str()<< "]";
+ else if(vSubArguments[i]->GetFormulaToken()->GetType() ==
+ formula::svSingleVectorRef)
+ ss<<"[get_group_id(1)]";
+ ss<<", 0);\n";
+ return;
+#endif
+ ss <<" tmp";
+ ss <<i;
+ ss << "=";
+ vSubArguments[i]->GenDeclRef(ss);
+ if(vSubArguments[i]->GetFormulaToken()->GetType() ==
+ formula::svDoubleVectorRef)
+ ss<<"["<< p.c_str()<< "]";
+ else if(vSubArguments[i]->GetFormulaToken()->GetType() ==
+ formula::svSingleVectorRef)
+ ss<<"[get_group_id(1)]";
+
+ ss<<";\n";
+}
+
void CheckVariables::CheckAllSubArgumentIsNan(
std::stringstream & ss, SubArguments & vSubArguments)
{
diff --git a/sc/source/core/opencl/opbase.hxx b/sc/source/core/opencl/opbase.hxx
index 41e45877e2e8..6b475df42a6b 100644
--- a/sc/source/core/opencl/opbase.hxx
+++ b/sc/source/core/opencl/opbase.hxx
@@ -104,6 +104,9 @@ public:
virtual void DumpInlineFun(std::set<std::string>& ,
std::set<std::string>& ) const {}
const std::string& GetName(void) const { return mSymName; }
+ cl_mem GetCLBuffer(void) const {return mpClmem; }
+ virtual bool NeedParallelReduction(void) const { return false; }
+
protected:
const std::string mSymName;
FormulaTreeNodeRef mFormulaTree;
@@ -157,6 +160,9 @@ public:
SubArguments &vSubArguments, int argumentNum);
void CheckAllSubArgumentIsNan(std::stringstream &ss,
SubArguments &vSubArguments);
+ // only check isNan
+ void CheckSubArgumentIsNan2(std::stringstream &ss,
+ SubArguments &vSubArguments, int argumentNum, std::string p);
};
}}