4 files changed, 196 insertions, 67 deletions
diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx
index d92a4719f5d9..8bc0224b874f 100755
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -310,11 +310,8 @@ bool FormulaGroupInterpreterGroundwater::interpret(ScDocument& rDoc, const ScAdd
     OpCode eOp; // type of operation: ocAverage, ocMax, ocMin
     const double *pArrayToSubtractOneElementFrom;
     const double *pGroundWaterDataArray;
-    size_t        nGroundWaterDataArrayLen;
 
     // Output:
-    double *pResult = new double[xGroup->mnLength];
-    RETURN_IF_FAIL(pResult != NULL, "buffer alloc failed");
     std::vector<double> aMatrixContent;
 
     const formula::FormulaToken *p;
@@ -338,36 +335,13 @@ bool FormulaGroupInterpreterGroundwater::interpret(ScDocument& rDoc, const ScAdd
 
             p = rCode.NextNoSpaces();
             RETURN_IF_FAIL(p != NULL, "no function argument");
-            if (p->GetType() == formula::svDoubleVectorRef)
-            {
-                // FIXME: this is what I would expect; but table1.cxx's
-                // ScColumn::ResolveStaticReference as called from
-                // GroupTokenConverter::convert returns an ScMatrixToken un-conditionally
-                const formula::DoubleVectorRefToken* pDvr = static_cast<const formula::DoubleVectorRefToken*>(p);
-                const std::vector<const double*>& rArrays = pDvr->GetArrays();
-                RETURN_IF_FAIL(rArrays.size() == 1, "unexpectedly large double ref array");
-                RETURN_IF_FAIL(pDvr->GetArrayLength() == (size_t)xGroup->mnLength, "wrong double ref length");
-                RETURN_IF_FAIL(pDvr->IsStartFixed() && pDvr->IsEndFixed(), "non-fixed ranges )");
-                pGroundWaterDataArray = rArrays[0];
-                nGroundWaterDataArrayLen = xGroup->mnLength;
-            }
-            else
-            {
-                RETURN_IF_FAIL(p->GetType() == formula::svMatrix, "unexpected fn. param type");
-                const ScMatrixToken *pMatTok = static_cast<const ScMatrixToken *>(p);
-                pMatTok->GetMatrix()->GetDoubleArray( aMatrixContent );
-                // FIXME: horrible hackery: the legacy / excel shared formula oddness,
-                // such that the 1st entry is not truly shared, making these a different
-                // shape.
-                if (aMatrixContent.size() > (size_t)xGroup->mnLength + 1)
-                {
-                    fprintf(stderr, "Error size range mismatch: %ld vs %ld\n",
-                            (long)aMatrixContent.size(), (long)xGroup->mnLength);
-                    return false;
-                }
-                pGroundWaterDataArray = &aMatrixContent[0];
-                nGroundWaterDataArrayLen = aMatrixContent.size();
-            }
+            RETURN_IF_FAIL(p->GetType() == formula::svDoubleVectorRef, "wrong type of fn argument");
+            const formula::DoubleVectorRefToken* pDvr = static_cast<const formula::DoubleVectorRefToken*>(p);
+            const std::vector<const double*>& rArrays = pDvr->GetArrays();
+            RETURN_IF_FAIL(rArrays.size() == 1, "unexpectedly large double ref array");
+            RETURN_IF_FAIL(pDvr->GetArrayLength() == (size_t)xGroup->mnLength, "wrong double ref length");
+            RETURN_IF_FAIL(pDvr->IsStartFixed() && pDvr->IsEndFixed(), "non-fixed ranges )");
+            pGroundWaterDataArray = rArrays[0];
 
             p = rCode.NextNoSpaces();
             RETURN_IF_FAIL(p != NULL && p->GetOpCode() == ocClose, "missing closing )");
@@ -400,17 +374,22 @@ bool FormulaGroupInterpreterGroundwater::interpret(ScDocument& rDoc, const ScAdd
     //   =AVERAGE(L$6:L$7701) - L6
     // we would get:
     //   eOp => ocAverage
-    //   pGroundWaterDataArray => contains L$6:L$7701
-    //   pGroundWaterDataArrayLen => 7701 - 6 + 1
-    //   pArrayToSubtractOneElementFrom => contains L$5:L$7701 (overlapping)
+    //   pGroundWaterDataArray => contains L$5:L$7701
+    //   pArrayToSubtractOneElementFrom => contains L$5:L$7701 (ie. a copy)
     //   length of this array -> xGroup->mnLength
 
     fprintf (stderr, "Calculate !\n");
 
+    double *pResult = ocl_calc.OclSimpleDeltaOperation( eOp, pGroundWaterDataArray,
+                                                        pArrayToSubtractOneElementFrom,
+                                                        (size_t) xGroup->mnLength );
+    RETURN_IF_FAIL(pResult != NULL, "buffer alloc / calculaton failed");
+
     // Insert the double data, in rResult[i] back into the document
     rDoc.SetFormulaResults(rTopPos, pResult, xGroup->mnLength);
 
     delete [] pResult;
+
     SAL_DEBUG ("exit cleanly !");
     return true;
 }
diff --git a/sc/source/core/opencl/oclkernels.hxx b/sc/source/core/opencl/oclkernels.hxx
index 6c9012641b54..e13c24a3a158 100755
--- a/sc/source/core/opencl/oclkernels.hxx
+++ b/sc/source/core/opencl/oclkernels.hxx
@@ -142,7 +142,6 @@ __kernel void oclFormulaAverage(__global float *input,__global int *start,__glob
     for(i = start[id];i<=end[id];i++)
         sum += input[i];
     output[id] = sum / (end[id]-start[id]+1);
-
 }
 
 //Sumproduct
@@ -162,6 +161,33 @@ __kernel void oclFormulaMinverse(__global float *data,
 
 }
 
+// Double precision is a requirement of spreadsheets
+#if 0
+#if defined(cl_khr_fp64)  // Khronos extension
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#elif defined(cl_amd_fp64)  // AMD extension
+#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#endif
+typedef double fp_t;
+#else
+typedef float fp_t;
+#endif
+
+__kernel void oclAverageDelta(__global fp_t *values, __global fp_t *subtract, __global int start, __global int end, __global fp_t *output)
+{
+    const unsigned int id = get_global_id(0);
+
+    // Average
+    int i;
+    fp_t sum = 0.0;
+    for(i = start; i < end; i++)
+        sum += values[i];
+    fp_t val = sum/(end-start);
+
+    // Subtract & output
+    output[id] = val - subtract[id];
+}
+
 );
 
 #endif // USE_EXTERNAL_KERNEL
diff --git a/sc/source/core/opencl/openclwrapper.cxx b/sc/source/core/opencl/openclwrapper.cxx
index 597f37097ce8..3030a2e8b703 100755
--- a/sc/source/core/opencl/openclwrapper.cxx
+++ b/sc/source/core/opencl/openclwrapper.cxx
@@ -76,16 +76,17 @@ int OpenclDevice::ReleaseOpenclRunEnv() {
 }
 ///////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////
-inline int OpenclDevice::AddKernelConfig(int kCount, const char *kName) {
+inline int OpenclDevice::AddKernelConfig(int kCount, const char *kName)
+{
     strcpy(gpuEnv.mArrykernelNames[kCount], kName);
     gpuEnv.mnKernelCount++;
     return 0;
 }
 
-int OpenclDevice::RegistOpenclKernel() {
-    if (!gpuEnv.mnIsUserCreated) {
+int OpenclDevice::RegistOpenclKernel()
+{
+    if (!gpuEnv.mnIsUserCreated)
         memset(&gpuEnv, 0, sizeof(gpuEnv));
-    }
 
     gpuEnv.mnFileCount = 0; //argc;
     gpuEnv.mnKernelCount = 0UL;
@@ -100,17 +101,22 @@ int OpenclDevice::RegistOpenclKernel() {
     AddKernelConfig(7, (const char*) "oclFormulaSumproduct");
     AddKernelConfig(8, (const char*) "oclFormulaMinverse");
 
-    AddKernelConfig(9,    (const char*) "oclSignedAdd");
+    AddKernelConfig(9,  (const char*) "oclSignedAdd");
     AddKernelConfig(10, (const char*) "oclSignedSub");
     AddKernelConfig(11, (const char*) "oclSignedMul");
     AddKernelConfig(12, (const char*) "oclSignedDiv");
+    AddKernelConfig(13, (const char*) "oclAverageDelta");
+
     return 0;
 }
-OpenclDevice::OpenclDevice(){
+
+OpenclDevice::OpenclDevice()
+{
     //InitEnv();
 }
 
-OpenclDevice::~OpenclDevice() {
+OpenclDevice::~OpenclDevice()
+{
     //ReleaseOpenclRunEnv();
 }
 
@@ -122,13 +128,15 @@ int OpenclDevice::SetKernelEnv(KernelEnv *envInfo)
 
     return 1;
 }
-int OpenclDevice::CheckKernelName(KernelEnv *envInfo,const char *kernelName){
+
+int OpenclDevice::CheckKernelName(KernelEnv *envInfo,const char *kernelName)
+{
     //printf("CheckKernelName,total count of kernels...%d\n", gpuEnv.kernelCount);
     int kCount;
     for(kCount=0; kCount < gpuEnv.mnKernelCount; kCount++) {
         if(strcasecmp(kernelName, gpuEnv.mArrykernelNames[kCount]) == 0) {
-        printf("match  %s kernel right\n",kernelName);
-        break;
+            printf("match %s kernel right\n",kernelName);
+            break;
         }
     }
     envInfo->mpkKernel = gpuEnv.mpArryKernels[kCount];
@@ -141,7 +149,8 @@ int OpenclDevice::CheckKernelName(KernelEnv *envInfo,const char *kernelName){
     return 1;
 }
 
-int OpenclDevice::ConvertToString(const char *filename, char **source) {
+int OpenclDevice::ConvertToString(const char *filename, char **source)
+{
     int file_size;
     size_t result;
     FILE *file = NULL;
@@ -174,8 +183,9 @@ int OpenclDevice::ConvertToString(const char *filename, char **source) {
     return 0;
 }
 
-int OpenclDevice::BinaryGenerated(const char * clFileName, FILE ** fhandle) {
-        unsigned int i = 0;
+int OpenclDevice::BinaryGenerated(const char * clFileName, FILE ** fhandle)
+{
+    unsigned int i = 0;
     cl_int status;
     char *str = NULL;
     FILE *fd = NULL;
@@ -208,7 +218,8 @@ int OpenclDevice::BinaryGenerated(const char * clFileName, FILE ** fhandle) {
 }
 
 int OpenclDevice::WriteBinaryToFile(const char* fileName, const char* birary,
-        size_t numBytes) {
+        size_t numBytes)
+{
     FILE *output = NULL;
     output = fopen(fileName, "wb");
     if (output == NULL) {
@@ -223,7 +234,8 @@ int OpenclDevice::WriteBinaryToFile(const char* fileName, const char* birary,
 }
 
 int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
-        const char * clFileName) {
+                                             const char * clFileName)
+{
      unsigned int i = 0;
     cl_int status;
     size_t *binarySizes, numDevices;
@@ -319,10 +331,10 @@ int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
     return 1;
 }
 
-int OpenclDevice::InitOpenclAttr(OpenCLEnv * env) {
-    if (gpuEnv.mnIsUserCreated) {
+int OpenclDevice::InitOpenclAttr(OpenCLEnv * env)
+{
+    if (gpuEnv.mnIsUserCreated)
         return 1;
-    }
 
     gpuEnv.mpContext    = env->mpOclContext;
     gpuEnv.mpPlatformID = env->mpOclPlatformID;
@@ -334,21 +346,24 @@ int OpenclDevice::InitOpenclAttr(OpenCLEnv * env) {
     return 0;
 }
 
-int OpenclDevice::CreateKernel(char * kernelname, KernelEnv * env) {
+int OpenclDevice::CreateKernel(char * kernelname, KernelEnv * env)
+{
     int status;
 
-    env->mpkKernel     = clCreateKernel(gpuEnv.mpArryPrograms[0], kernelname, &status);
+    env->mpkKernel   = clCreateKernel(gpuEnv.mpArryPrograms[0], kernelname, &status);
     env->mpkContext  = gpuEnv.mpContext;
     env->mpkCmdQueue = gpuEnv.mpCmdQueue;
     return status != CL_SUCCESS ? 1 : 0;
 }
 
-int OpenclDevice::ReleaseKernel(KernelEnv * env) {
+int OpenclDevice::ReleaseKernel(KernelEnv * env)
+{
     int status = clReleaseKernel(env->mpkKernel);
     return status != CL_SUCCESS ? 1 : 0;
 }
 
-int OpenclDevice::ReleaseOpenclEnv(GPUEnv *gpuInfo) {
+int OpenclDevice::ReleaseOpenclEnv(GPUEnv *gpuInfo)
+{
     int i = 0;
     int status = 0;
 
@@ -378,7 +393,8 @@ int OpenclDevice::ReleaseOpenclEnv(GPUEnv *gpuInfo) {
 }
 
 int OpenclDevice::RunKernelWrapper(cl_kernel_function function,
-        const char * kernelName, void **usrdata) {
+                                   const char * kernelName, void **usrdata)
+{
     printf("oclwrapper:RunKernel_wrapper...\n");
     if (RegisterKernelWrapper(kernelName, function) != 1) {
         fprintf(stderr,
@@ -389,8 +405,9 @@ int OpenclDevice::RunKernelWrapper(cl_kernel_function function,
 }
 
 int OpenclDevice::CachedOfKernerPrg(const GPUEnv *gpuEnvCached,
-        const char * clFileName) {
-  int i;
+                                    const char * clFileName)
+{
+    int i;
     for (i = 0; i < gpuEnvCached->mnFileCount; i++) {
         if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) {
             if (gpuEnvCached->mpArryPrograms[i] != NULL) {
@@ -574,6 +591,7 @@ int OpenclDevice::RunKernel(const char *kernelName, void **userdata) {
     }
     return 0;
 }
+
 int OpenclDevice::InitOpenclRunEnv(int argc, const char *buildOptionKernelfiles)
 {
     int status = 0;
@@ -1007,6 +1025,7 @@ int OclCalc::OclHostFormulaMax(double *srcData,int *start,int *end,double *outpu
     CHECK_OPENCL(clStatus);
     return 0;
 }
+
 int OclCalc::OclHostFormulaMin(double *srcData,int *start,int *end,double *output,int size)
 {
     KernelEnv kEnv;
@@ -1590,7 +1609,6 @@ int OclCalc::OclHostFormulaMin32Bits(float *fpSrcData,uint *npStartPos,uint *npE
     clStatus = clReleaseMemObject(outputCl);
     CHECK_OPENCL(clStatus);
     return 0;
-
 }
 
 int OclCalc::OclHostFormulaAverage32Bits(float *fpSrcData,uint *npStartPos,uint *npEndPos,double *output,int size)
@@ -2066,4 +2084,111 @@ int OclCalc::OclHostFormulaSumProduct(float *dpSrcData,int *npStart,int *npEnd,f
 }
 #endif
 
+#if 0
+typedef double fp_;
+#else
+typedef float fp_t;
+#endif
+
+// FIXME: should be templatised in <double> - double buffering [sic] rocks
+static cl_mem allocateDoubleBuffer(KernelEnv &rEnv, const double *_pValues,
+                                   size_t nElements, cl_int *pStatus)
+{
+    // Ugh - horrible redundant copying ...
+    cl_mem xValues = clCreateBuffer(rEnv.mpkContext,(cl_mem_flags) (CL_MEM_READ_WRITE),
+                                    nElements * sizeof(double), NULL, pStatus);
+    fp_t *pValues = (fp_t *)clEnqueueMapBuffer(rEnv.mpkCmdQueue,xValues,CL_TRUE,CL_MAP_WRITE,0,
+                                                   nElements * sizeof(fp_t),0,NULL,NULL,NULL);
+    for(int i=0;i<nElements;i++)
+        pValues[i] = (fp_t)_pValues[i];
+
+    clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL);
+
+    return xValues;
+}
+
+double *OclCalc::OclSimpleDeltaOperation(OpCode eOp, const double *pOpArray,
+                                         const double *pSubtractSingle, size_t nElements)
+{
+    KernelEnv kEnv;
+
+    // select a kernel: cut & paste coding is utterly evil.
+    const char *kernelName;
+    switch (eOp) {
+    case ocMax:
+    case ocMin:
+        ; // FIXME: fallthrough for now
+    case ocAverage:
+        kernelName = "oclAverageDelta";
+        break;
+    default:
+        assert(false);
+    }
+    CheckKernelName(&kEnv,kernelName);
+
+    cl_int clStatus;
+    size_t global_work_size[1];
+
+    kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kernelName, &clStatus);
+
+    // Ugh - horrible redundant copying ...
+    cl_mem valuesCl   = allocateDoubleBuffer(kEnv, pOpArray, nElements, &clStatus);
+    cl_mem subtractCl = allocateDoubleBuffer(kEnv, pSubtractSingle, nElements, &clStatus);
+
+    cl_int start = 0;
+    cl_int end = (cl_int) nElements;
+    cl_mem outputCl = clCreateBuffer(kEnv.mpkContext,
+                                     CL_MEM_READ_WRITE,
+                                     nElements * sizeof(fp_t),
+                                     NULL,
+                                     &clStatus);
+    CHECK_OPENCL(clStatus);
+
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 0, sizeof(cl_mem),
+                              (void *)&valuesCl);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 1, sizeof(cl_mem),
+                              (void *)&subtractCl);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_mem),
+                              (void *)&start);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 3, sizeof(cl_mem),
+                              (void *)&end);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 4, sizeof(cl_mem),
+                              (void *)&outputCl);
+    CHECK_OPENCL(clStatus);
+
+    fprintf(stderr, "prior to enqueue range kernel\n");
+
+    global_work_size[0] = nElements;
+    clStatus = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
+                                      NULL, global_work_size, NULL, 0, NULL, NULL);
+    CHECK_OPENCL(clStatus);
+
+    double *pResult = new double[nElements];
+    if(!pResult)
+        return NULL; // leak.
+
+    fp_t *pOutput = (fp_t *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,outputCl,CL_TRUE,
+                                               CL_MAP_READ,0,nElements*sizeof(fp_t),
+                                               0,NULL,NULL,NULL);
+    for(int i = 0; i < nElements; i++)
+        pResult[i] = (double)pOutput[i];
+
+    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,outputCl,pOutput,0,NULL,NULL);
+
+    clStatus = clFinish(kEnv.mpkCmdQueue);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseKernel(kEnv.mpkKernel);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseMemObject(valuesCl);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseMemObject(subtractCl);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseMemObject(outputCl);
+    CHECK_OPENCL(clStatus);
+
+    fprintf(stderr, "completed opencl delta operation\n");
+
+    return pResult;
+}
+
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/core/opencl/openclwrapper.hxx b/sc/source/core/opencl/openclwrapper.hxx
index 3e87f8445e7d..a0c132a2c49f 100755
--- a/sc/source/core/opencl/openclwrapper.hxx
+++ b/sc/source/core/opencl/openclwrapper.hxx
@@ -11,10 +11,8 @@
 #define _OPENCL_WRAPPER_H_
 
 #include <config_features.h>
-
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
+#include <formula/opcode.hxx>
+#include <cassert>
 #include <CL/cl.h>
 #endif
 
@@ -212,6 +210,7 @@ public:
     int OclHostFormulaMax32Bits(float *fpSrcData,uint *npStartPos,uint *npEndPos,double *output,int outputSize);
     int OclHostFormulaMin32Bits(float *fpSrcData,uint *npStartPos,uint *npEndPos,double *output,int outputSize);
     int OclHostFormulaAverage32Bits(float *fpSrcData,uint *npStartPos,uint *npEndPos,double *output,int outputSize);
+    double *OclSimpleDeltaOperation(OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements);
 
     //int OclHostFormulaCount(int *startPos,int *endPos,float *output,int outputSize);
     //int OclHostFormulaSum(float *srcData,int *startPos,int *endPos,float *output,int outputSize);