summaryrefslogtreecommitdiff
path: root/sc/source/core/opencl
diff options
context:
space:
mode:
authorJing Xian <jingxian@multicorewareinc.com>2013-07-11 11:35:54 +0100
committerKohei Yoshida <kohei.yoshida@gmail.com>2013-07-24 23:29:41 -0400
commite8e9f24d917ef2c0aaef3e4c637df00abaacb916 (patch)
tree2fa5030fa00cb719f7e6746f9fef2f754acc0018 /sc/source/core/opencl
parent21f9cb6f5f774d918a33f72d981385c619270943 (diff)
add min/max/delta kernels and misc. cleanup / bug fixing.
Conflicts: sc/source/core/opencl/openclwrapper.cxx Change-Id: I4a752dcb407a89f8da19886131c186b44c7e2a8e
Diffstat (limited to 'sc/source/core/opencl')
-rw-r--r--sc/source/core/opencl/formulagroupcl.cxx70
-rw-r--r--sc/source/core/opencl/oclkernels.hxx70
-rw-r--r--sc/source/core/opencl/openclwrapper.cxx44
3 files changed, 121 insertions, 63 deletions
diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx
index c24f13a91a4e..76a1d2e606d3 100644
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -116,16 +116,19 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
memset(rResult,0,rowSize);
float * fpOclSrcData = NULL; // Point to the input data from CPU
uint * npOclStartPos = NULL; // The first position for calculation,for example,the A1 in (=MAX(A1:A100))
- uint * npOclEndPos = NULL; // The last position for calculation,for example, the A100 in (=MAX(A1:A100))
- float * fpLeftData = NULL; // Left input for binary operator(+,-,*,/),for example,(=leftData+rightData)
+ uint * npOclEndPos = NULL; // The last position for calculation,for example, the A100 in (=MAX(A1:A100))
+ float * fpLeftData = NULL; // Left input for binary operator(+,-,*,/),for example,(=leftData+rightData)
float * fpRightData = NULL; // Right input for binary operator(+,-,*,/),for example,(=leftData/rightData)
// The rightData can't be zero for "/"
static OclCalc ocl_calc;
- // Don't know how large the size will be applied previously, so create them as the rowSize or 65536
- // Don't know which formulae will be used previously, so create buffers for different formulae used probably
- ocl_calc.CreateBuffer(fpOclSrcData,npOclStartPos,npOclEndPos,rowSize);
- ocl_calc.CreateBuffer(fpLeftData,fpRightData,rowSize);
- //printf("pptrr is %d,%d,%d\n",fpOclSrcData,npOclStartPos,npOclEndPos);
+ if(ocl_calc.GetOpenclState())
+ {
+ // Don't know how large the size will be applied previously, so create them as the rowSize or 65536
+ // Don't know which formulae will be used previously, so create buffers for different formulae used probably
+ ocl_calc.CreateBuffer(fpOclSrcData,npOclStartPos,npOclEndPos,rowSize);
+ ocl_calc.CreateBuffer(fpLeftData,fpRightData,rowSize);
+ //printf("pptrr is %d,%d,%d\n",fpOclSrcData,npOclStartPos,npOclEndPos);
+ }
///////////////////////////////////////////////////////////////////////////////////////////
// Until we implement group calculation for real, decompose the group into
@@ -157,10 +160,11 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
nRowEnd += i;
size_t nRowSize = nRowEnd - nRowStart + 1;
ScMatrixRef pMat(new ScMatrix(nColSize, nRowSize, 0.0));
-
- npOclStartPos[i] = nRowStart; // record the start position
- npOclEndPos[i] = nRowEnd; // record the end position
-
+ if(ocl_calc.GetOpenclState())
+ {
+ npOclStartPos[i] = nRowStart; // record the start position
+ npOclEndPos[i] = nRowEnd; // record the end position
+ }
for (size_t nCol = 0; nCol < nColSize; ++nCol)
{
const double* pArray = rArrays[nCol];
@@ -169,12 +173,14 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
fprintf(stderr,"Error: pArray is NULL!\n");
return false;
}
-
- for( size_t u=0; u<rowSize; u++ )
+ if(ocl_calc.GetOpenclState())
{
- // Many video cards can't support double type in kernel, so need transfer the double to float
- fpOclSrcData[u] = (float)pArray[u];
- //fprintf(stderr,"fpOclSrcData[%d] is %f.\n",u,fpOclSrcData[u]);
+ for( size_t u=nRowStart; u<=nRowEnd; u++ )
+ {
+ // Many video cards can't support double type in kernel, so need transfer the double to float
+ fpOclSrcData[u] = (float)pArray[u];
+ //fprintf(stderr,"fpOclSrcData[%d] is %f.\n",u,fpOclSrcData[u]);
+ }
}
for (size_t nRow = 0; nRow < nRowSize; ++nRow)
@@ -199,22 +205,23 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
ScFormulaCell* pDest = rDoc.GetFormulaCell(aTmpPos);
if (!pDest)
return false;
-
- const formula::FormulaToken *pCur = aCode2.First();
- aCode2.Reset();
- while( ( pCur = aCode2.Next() ) != NULL )
+ if(ocl_calc.GetOpenclState())
{
- OpCode eOp = pCur->GetOpCode();
- if(eOp==0)
+ const formula::FormulaToken *pCur = aCode2.First();
+ aCode2.Reset();
+ while( ( pCur = aCode2.Next() ) != NULL )
{
- if(nCount3%2==0)
- fpLeftData[nCount1++] = (float)pCur->GetDouble();
- else
- fpRightData[nCount2++] = (float)pCur->GetDouble();
- nCount3++;
- }
- else if( eOp!=ocOpen && eOp!=ocClose )
- nOclOp = eOp;
+ OpCode eOp = pCur->GetOpCode();
+ if(eOp==0)
+ {
+ if(nCount3%2==0)
+ fpLeftData[nCount1++] = (float)pCur->GetDouble();
+ else
+ fpRightData[nCount2++] = (float)pCur->GetDouble();
+ nCount3++;
+ }
+ else if( eOp!=ocOpen && eOp!=ocClose )
+ nOclOp = eOp;
// if(count1>0){//dbg
// fprintf(stderr,"leftData is %f.\n",leftData[count1-1]);
@@ -224,11 +231,12 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
// fprintf(stderr,"rightData is %f.\n",rightData[count2-1]);
// count2--;
// }
+ }
}
if(!getenv("SC_GPU")||!ocl_calc.GetOpenclState())
{
- fprintf(stderr,"ccCPU flow...\n\n");
+ //fprintf(stderr,"ccCPU flow...\n\n");
generateRPNCode(rDoc, aTmpPos, aCode2);
ScInterpreter aInterpreter(pDest, &rDoc, aTmpPos, aCode2);
aInterpreter.Interpret();
diff --git a/sc/source/core/opencl/oclkernels.hxx b/sc/source/core/opencl/oclkernels.hxx
index e13c24a3a158..c231dbd76b3c 100644
--- a/sc/source/core/opencl/oclkernels.hxx
+++ b/sc/source/core/opencl/oclkernels.hxx
@@ -162,32 +162,66 @@ __kernel void oclFormulaMinverse(__global float *data,
}
// Double precision is a requirement of spreadsheets
-#if 0
-#if defined(cl_khr_fp64) // Khronos extension
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-#elif defined(cl_amd_fp64) // AMD extension
-#pragma OPENCL EXTENSION cl_amd_fp64 : enable
-#endif
-typedef double fp_t;
-#else
-typedef float fp_t;
-#endif
-
-__kernel void oclAverageDelta(__global fp_t *values, __global fp_t *subtract, __global int start, __global int end, __global fp_t *output)
+// cl_khr_fp64: Khronos extension
+// cl_amd_fp64: AMD extension
+\n#if 0 \n
+\n#if defined(cl_khr_fp64) \n
+\n#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n
+\n#elif defined(cl_amd_fp64) \n
+\n#pragma OPENCL EXTENSION cl_amd_fp64 : enable \n
+\n#endif \n
+\ntypedef double fp_t; \n
+\n#else \n
+\ntypedef float fp_t; \n
+\n#endif \n
+
+__kernel void oclAverageDelta(__global fp_t *values, __global fp_t *subtract, uint start, uint end, __global fp_t *output)
{
const unsigned int id = get_global_id(0);
// Average
- int i;
- fp_t sum = 0.0;
- for(i = start; i < end; i++)
- sum += values[i];
- fp_t val = sum/(end-start);
+ fp_t fSum = 0.0f;
+ for(int i = start; i < end; i++)
+ fSum += values[i];
+ fp_t fVal = fSum/(end-start);
// Subtract & output
- output[id] = val - subtract[id];
+ output[id] = fVal - subtract[id];
}
+__kernel void oclMaxDelta(__global fp_t *values, __global fp_t *subtract, uint start, uint end, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+
+ // Max
+ float fMaxVal = values[start];
+ for(int i=start+1;i < end;i++)
+ {
+ if(values[i]>fMaxVal)
+ fMaxVal = values[i];
+ }
+
+ // Subtract & output
+ output[id] = fMaxVal - subtract[id];
+}
+
+__kernel void oclMinDelta(__global fp_t *values, __global fp_t *subtract, uint start, uint end, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+
+ // Min
+ float fMinVal = values[start];
+ for(int i=start+1;i < end;i++)
+ {
+ if(values[i]<fMinVal)
+ fMinVal = values[i];
+ }
+
+ // Subtract & output
+ output[id] = fMinVal - subtract[id];
+}
+
+
);
#endif // USE_EXTERNAL_KERNEL
diff --git a/sc/source/core/opencl/openclwrapper.cxx b/sc/source/core/opencl/openclwrapper.cxx
index 584d533baabc..2cf8812b6f66 100644
--- a/sc/source/core/opencl/openclwrapper.cxx
+++ b/sc/source/core/opencl/openclwrapper.cxx
@@ -106,6 +106,8 @@ int OpenclDevice::RegistOpenclKernel()
AddKernelConfig(11, (const char*) "oclSignedMul");
AddKernelConfig(12, (const char*) "oclSignedDiv");
AddKernelConfig(13, (const char*) "oclAverageDelta");
+ AddKernelConfig(14, (const char*) "OclMaxDelta");
+ AddKernelConfig(15, (const char*) "OclMinDelta");
return 0;
}
@@ -497,18 +499,17 @@ int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo, const char *buildOption) {
//char options[512];
// create a cl program executable for all the devices specified
+ printf("BuildProgram.\n");
if (!gpuInfo->mnIsUserCreated) {
status = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
buildOption, NULL, NULL);
- CHECK_OPENCL(status)
} else {
status = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
buildOption, NULL, NULL);
- CHECK_OPENCL(status)
}
- printf("BuildProgram.\n");
if (status != CL_SUCCESS) {
+ printf ("BuildProgram error!\n");
if (!gpuInfo->mnIsUserCreated) {
status = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx],
gpuInfo->mpArryDevsID[0], CL_PROGRAM_BUILD_LOG, 0, NULL,
@@ -676,7 +677,7 @@ int OpenclDevice::InitOpenclRunEnv(GPUEnv *gpuInfo)
&numDevices);
if (status != CL_SUCCESS) {
- return 1;
+ continue;
}
if (numDevices) {
@@ -684,6 +685,8 @@ int OpenclDevice::InitOpenclRunEnv(GPUEnv *gpuInfo)
}
}
}
+ if(status!=CL_SUCCESS)
+ return 1;
free(platforms);
}
if (NULL == gpuInfo->mpPlatformID) {
@@ -2100,7 +2103,7 @@ static cl_mem allocateDoubleBuffer(KernelEnv &rEnv, const double *_pValues,
nElements * sizeof(double), NULL, pStatus);
fp_t *pValues = (fp_t *)clEnqueueMapBuffer(rEnv.mpkCmdQueue,xValues,CL_TRUE,CL_MAP_WRITE,0,
nElements * sizeof(fp_t),0,NULL,NULL,NULL);
- for(size_t i=0;i<nElements;i++)
+ for(int i=0;i<(int)nElements;i++)
pValues[i] = (fp_t)_pValues[i];
clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL);
@@ -2111,15 +2114,23 @@ static cl_mem allocateDoubleBuffer(KernelEnv &rEnv, const double *_pValues,
double *OclCalc::OclSimpleDeltaOperation(OpCode eOp, const double *pOpArray,
const double *pSubtractSingle, size_t nElements)
{
- KernelEnv kEnv;
SetKernelEnv(&kEnv);
// select a kernel: cut & paste coding is utterly evil.
- const char *kernelName;
+ const char *kernelName = NULL;
switch (eOp) {
+ case ocAdd:
+ case ocSub:
+ fprintf(stderr,"ocSub is %d\n",ocSub);
+ case ocMul:
+ case ocDiv:
+ ; // FIXME: fallthrough for now
case ocMax:
+ kernelName = "oclMaxDelta";
+ break;
case ocMin:
- ; // FIXME: fallthrough for now
+ kernelName = "oclMinDelta";
+ break;
case ocAverage:
kernelName = "oclAverageDelta";
break;
@@ -2131,15 +2142,20 @@ double *OclCalc::OclSimpleDeltaOperation(OpCode eOp, const double *pOpArray,
cl_int clStatus;
size_t global_work_size[1];
- kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kernelName, &clStatus);
+ kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram, kernelName, &clStatus);
CHECK_OPENCL(clStatus);
+ if (!kEnv.mpkKernel)
+ {
+ fprintf(stderr, "could not clCreateKernel '%s'\n", kernelName);
+ return NULL;
+ }
// Ugh - horrible redundant copying ...
cl_mem valuesCl = allocateDoubleBuffer(kEnv, pOpArray, nElements, &clStatus);
cl_mem subtractCl = allocateDoubleBuffer(kEnv, pSubtractSingle, nElements, &clStatus);
- cl_int start = 0;
- cl_int end = (cl_int) nElements;
+ cl_uint start = 0;
+ cl_uint end = (cl_uint) nElements;
cl_mem outputCl = clCreateBuffer(kEnv.mpkContext,
CL_MEM_READ_WRITE,
nElements * sizeof(fp_t),
@@ -2153,10 +2169,10 @@ double *OclCalc::OclSimpleDeltaOperation(OpCode eOp, const double *pOpArray,
clStatus = clSetKernelArg(kEnv.mpkKernel, 1, sizeof(cl_mem),
(void *)&subtractCl);
CHECK_OPENCL(clStatus);
- clStatus = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_int),
+ clStatus = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_uint),
(void *)&start);
CHECK_OPENCL(clStatus);
- clStatus = clSetKernelArg(kEnv.mpkKernel, 3, sizeof(cl_int),
+ clStatus = clSetKernelArg(kEnv.mpkKernel, 3, sizeof(cl_uint),
(void *)&end);
CHECK_OPENCL(clStatus);
clStatus = clSetKernelArg(kEnv.mpkKernel, 4, sizeof(cl_mem),
@@ -2177,7 +2193,7 @@ double *OclCalc::OclSimpleDeltaOperation(OpCode eOp, const double *pOpArray,
fp_t *pOutput = (fp_t *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,outputCl,CL_TRUE,
CL_MAP_READ,0,nElements*sizeof(fp_t),
0,NULL,NULL,NULL);
- for(size_t i = 0; i < nElements; i++)
+ for(int i = 0; i < (int)nElements; i++)
pResult[i] = (double)pOutput[i];
clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,outputCl,pOutput,0,NULL,NULL);