A kernel can be run multiple times with different args; handle that.

The stub function becomes argument-independent, and the args are built for each KernelEvent.
author: Denis Steckelmacher <steckdenis@yahoo.fr> 2011-08-17 16:04:30 +0200
committer: Denis Steckelmacher <steckdenis@yahoo.fr> 2011-08-17 16:04:30 +0200
commit: 5b6894ddc065a818d70d579a80a4c132265e03aa (patch)
tree: b8cc83daae5ccbb0d7631bf06e4d209301a3b08e
parent: dcec4a2fd162926c0331a85a32b43de4db415bc0 (diff)
5 files changed, 258 insertions, 183 deletions
diff --git a/src/core/cpu/builtins.cpp b/src/core/cpu/builtins.cpp
index d0cff0c..9051f09 100644
--- a/src/core/cpu/builtins.cpp
+++ b/src/core/cpu/builtins.cpp
@@ -194,7 +194,7 @@ void CPUKernelWorkGroup::barrier(unsigned int flags)
         next->context.uc_stack.ss_size = p_stack_size;
 
         // Tell it to run the kernel function
-        makecontext(&next->context, p_kernel_func_addr, 0);
+        makecontext(&next->context, (void (*)())p_kernel_func_addr, 1, p_args);
     }
 
     // Switch to the next context
diff --git a/src/core/cpu/kernel.cpp b/src/core/cpu/kernel.cpp
index 33e0cdd..b9171b7 100644
--- a/src/core/cpu/kernel.cpp
+++ b/src/core/cpu/kernel.cpp
@@ -11,9 +11,6 @@
 
 #include <llvm/Function.h>
 #include <llvm/Constants.h>
-#include <llvm/ADT/APInt.h>
-#include <llvm/ADT/APFloat.h>
-#include <llvm/Support/Casting.h>
 #include <llvm/Instructions.h>
 #include <llvm/LLVMContext.h>
 #include <llvm/Module.h>
@@ -26,23 +23,6 @@
 
 using namespace Coal;
 
-static llvm::Constant *getPointerConstant(llvm::LLVMContext &C,
-                                          llvm::Type *type,
-                                          void *value)
-{
-    llvm::Constant *rs = 0;
-
-    if (sizeof(void *) == 4)
-        rs = llvm::ConstantInt::get(llvm::Type::getInt32Ty(C), (uint64_t)value);
-    else
-        rs = llvm::ConstantInt::get(llvm::Type::getInt64Ty(C), (uint64_t)value);
-
-    // Cast to kernel's pointer type
-    rs = llvm::ConstantExpr::getIntToPtr(rs, type);
-
-    return rs;
-}
-
 CPUKernel::CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function)
 : DeviceKernel(), p_device(device), p_kernel(kernel), p_function(function),
   p_call_function(0)
@@ -136,12 +116,40 @@ CPUDevice *CPUKernel::device() const
     return p_device;
 }
 
-llvm::Function *CPUKernel::callFunction(std::vector<void *> &freeLocal)
+// Round k up to a power of two (0 yields 1). From Wikipedia : http://www.wikipedia.org/wiki/Power_of_two#Algorithm_to_round_up_to_power_of_two
+template <class T>
+T next_power_of_two(T k) {
+        if (k == 0)
+                return 1;
+        T v = k - 1;
+        for (size_t shift = 1; shift < sizeof(T) * 8; shift <<= 1)
+                v |= v >> shift;    // Smear the highest set bit into every lower position
+        return v + 1;
+}
+
+// Return the first position at or after `offset` that is aligned on type_len rounded up
+// to a power of two, and move `offset` past that slot so the next value goes after it.
+size_t CPUKernel::typeOffset(size_t &offset, size_t type_len)
+{
+    size_t rs = offset;
+    // Align offset to type_len; the mask clears the low bits an aligned position must not have
+    type_len = next_power_of_two(type_len);
+    size_t mask = ~(type_len - 1);
+
+    // The parentheses are required: != binds tighter than &, so the bare form tests rs & 1
+    while ((rs & mask) != rs)
+        rs++;
+
+    offset = rs + type_len; // Where to try to place the next value
+    return rs;
+}
+
+llvm::Function *CPUKernel::callFunction()
 {
     pthread_mutex_lock(&p_call_function_mutex);
 
     // If we can reuse the same function between work groups, do it
-    if (!p_kernel->needsLocalAllocation() && p_call_function)
+    if (p_call_function)
     {
         llvm::Function *rs = p_call_function;
         pthread_mutex_unlock(&p_call_function_mutex);
@@ -149,171 +157,114 @@ llvm::Function *CPUKernel::callFunction(std::vector<void *> &freeLocal)
         return rs;
     }
 
-    // Create a LLVM function that calls the kernels with its arguments
-    // Code inspired from llvm/lib/ExecutionEngine/JIT/JIT.cpp
-    // Copyright The LLVM Compiler Infrastructure
-    llvm::FunctionType *k_func_type = p_function->getFunctionType();
-    llvm::FunctionType *f_type =
-        llvm::FunctionType::get(p_function->getReturnType(), false);
-    llvm::Function *stub = llvm::Function::Create(f_type,
-                                                  llvm::Function::InternalLinkage,
-                                                "", p_function->getParent());
+    /* Create a stub function in the form of
+     *
+     * void stub(void *args) {
+     *     kernel(*(int *)((char *)args + 0),
+     *            *(float **)((char *)args + sizeof(int)),
+     *            *(sampler_t *)((char *)args + sizeof(int) + sizeof(float *)));
+     * }
+     *
+     * In LLVM, it is exprimed in the form of :
+     *
+     * @stub(i8* args) {
+     *     kernel(
+     *         load(i32* bitcast(i8* getelementptr(i8* args, i64 0), i32*)),
+     *         load(float** bitcast(i8* getelementptr(i8* args, i64 4), float**)),
+     *         ...
+     *     );
+     * }
+     */
+    llvm::FunctionType *kernel_function_type = p_function->getFunctionType();
+    llvm::FunctionType *stub_function_type = llvm::FunctionType::get(
+        p_function->getReturnType(),
+        llvm::Type::getInt8PtrTy(
+            p_function->getContext()),
+        false);
+    llvm::Function *stub_function = llvm::Function::Create(
+        stub_function_type,
+        llvm::Function::InternalLinkage,
+        "",
+        p_function->getParent());
 
     // Insert a basic block
-    llvm::BasicBlock *block = llvm::BasicBlock::Create(p_function->getContext(),
-                                                       "", stub);
+    llvm::BasicBlock *basic_block = llvm::BasicBlock::Create(
+        p_function->getContext(),
+        "",
+        stub_function);
 
+    // Create the function arguments
+    llvm::Argument &stub_arg = stub_function->getArgumentList().front();
     llvm::SmallVector<llvm::Value *, 8> args;
+    size_t args_offset = 0;
 
-    // Add each kernel arg to args
-    for (unsigned int i=0; i<p_kernel->numArgs(); ++i)
+    for (unsigned int i=0; i<kernel_function_type->getNumParams(); ++i)
     {
-        const Kernel::Arg &a = p_kernel->arg(i);
-        llvm::Constant *arg_constant = 0;
-
-        // To handle vectors (float4, etc)
-        llvm::SmallVector<llvm::Constant *, 4> vec_elements;
-
-        // Explore the vector elements
-        for (unsigned short k=0; k<a.vecDim(); ++k)
-        {
-            const void *value = a.value(k);
-            llvm::Constant *C = 0;
-
-            switch (a.kind())
-            {
-                case Kernel::Arg::Int8:
-                    C = llvm::ConstantInt::get(stub->getContext(),
-                                               llvm::APInt(8, *(uint8_t *)value));
-                    break;
-
-                case Kernel::Arg::Int16:
-                    C = llvm::ConstantInt::get(stub->getContext(),
-                                               llvm::APInt(16, *(uint16_t *)value));
-                    break;
-
-                case Kernel::Arg::Int32:
-                case Kernel::Arg::Sampler:
-                    C = llvm::ConstantInt::get(stub->getContext(),
-                                               llvm::APInt(32, *(uint32_t *)value));
-                    break;
-
-                case Kernel::Arg::Int64:
-                    C = llvm::ConstantInt::get(stub->getContext(),
-                                               llvm::APInt(64, *(uint64_t *)value));
-                    break;
-
-                case Kernel::Arg::Float:
-                    C = llvm::ConstantFP::get(stub->getContext(),
-                                              llvm::APFloat(*(float *)value));
-                    break;
-
-                case Kernel::Arg::Double:
-                    C = llvm::ConstantFP::get(stub->getContext(),
-                                              llvm::APFloat(*(double *)value));
-                    break;
-
-                case Kernel::Arg::Buffer:
-                {
-                    MemObject *buffer = *(MemObject **)value;
-
-                    if (a.file() == Kernel::Arg::Local)
-                    {
-                        // Alloc a buffer and pass it to the kernel
-                        void *local_buffer = std::malloc(a.allocAtKernelRuntime());
-                        C = getPointerConstant(stub->getContext(),
-                                               k_func_type->getParamType(i),
-                                               local_buffer);
-
-                        freeLocal.push_back(local_buffer);
-                    }
-                    else
-                    {
-                        if (!buffer)
-                        {
-                            // We can do that, just send NULL
-                            C = llvm::ConstantPointerNull::get(
-                                    llvm::cast<llvm::PointerType>(
-                                        k_func_type->getParamType(i)));
-                        }
-                        else
-                        {
-                            // Get the CPU buffer, allocate it and get its pointer
-                            CPUBuffer *cpubuf =
-                                (CPUBuffer *)buffer->deviceBuffer(p_device);
-                            void *buf_ptr = 0;
-
-                            buffer->allocate(p_device);
-
-                            buf_ptr = cpubuf->data();
-
-                            C = getPointerConstant(stub->getContext(),
-                                                   k_func_type->getParamType(i),
-                                                   buf_ptr);
-                        }
-                    }
-
-                    break;
-                }
-
-                case Kernel::Arg::Image2D:
-                case Kernel::Arg::Image3D:
-                {
-                    Image2D *image = *(Image2D **)value;
-                    image->allocate(p_device);
-
-                    // Assign a pointer to the image object, the intrinsic functions
-                    // will handle them
-                    C = getPointerConstant(stub->getContext(),
-                                           k_func_type->getParamType(i),
-                                           (void *)image);
-                    break;
-                }
-
-                default:
-                    break;
-            }
-
-            // Add the vector element
-            vec_elements.push_back(C);
-        }
-
-        // If the arg was a vector, handle it
-        if (a.vecDim() == 1)
-        {
-            arg_constant = vec_elements.front();
-        }
-        else
-        {
-            arg_constant = llvm::ConstantVector::get(vec_elements);
-        }
-
-        // Append the arg
-        args.push_back(arg_constant);
+        llvm::Type *param_type = kernel_function_type->getParamType(i);
+        llvm::Type *param_type_ptr = param_type->getPointerTo(); // We'll use pointers to the value
+        const Kernel::Arg &arg = p_kernel->arg(i);
+
+        // Calculate the size of the arg
+        size_t arg_size = arg.valueSize() * arg.vecDim();
+
+        // Get where to place this argument
+        size_t arg_offset = typeOffset(args_offset, arg_size);
+
+        // %1 = getelementptr(args, $arg_offset);
+        llvm::Value *getelementptr = llvm::GetElementPtrInst::CreateInBounds(
+            &stub_arg,
+            llvm::ConstantInt::get(stub_function->getContext(),
+                                   llvm::APInt(64, arg_offset)),
+            "",
+            basic_block);
+
+        // %2 = bitcast(%1, $param_type_ptr)
+        llvm::Value *bitcast = new llvm::BitCastInst(
+            getelementptr,
+            param_type_ptr,
+            "",
+            basic_block);
+
+        // %3 = load(%2)
+        llvm::Value *load = new llvm::LoadInst(
+            bitcast,
+            "",
+            false,
+            arg_size,   // We ensure that an argument is always aligned on its size, it enables things like fast movaps
+            basic_block);
+
+        // We have the value, send it to the function
+        args.push_back(load);
     }
 
     // Create the call instruction
-    llvm::CallInst *call_inst = llvm::CallInst::Create(p_function, args, "", block);
+    llvm::CallInst *call_inst = llvm::CallInst::Create(
+        p_function,
+        args,
+        "",
+        basic_block);
     call_inst->setCallingConv(p_function->getCallingConv());
     call_inst->setTailCall();
 
     // Create a return instruction to end the stub
-    llvm::ReturnInst::Create(stub->getContext(), block);
+    llvm::ReturnInst::Create(
+        p_function->getContext(),
+        basic_block);
 
     // Retain the function if it can be reused
-    if (!p_kernel->needsLocalAllocation())
-        p_call_function = stub;
+    p_call_function = stub_function;
 
     pthread_mutex_unlock(&p_call_function_mutex);
 
-    return stub;
+    return stub_function;
 }
 
 /*
  * CPUKernelEvent
  */
 CPUKernelEvent::CPUKernelEvent(CPUDevice *device, KernelEvent *event)
-: p_device(device), p_event(event), p_current_wg(0), p_finished_wg(0)
+: p_device(device), p_event(event), p_current_wg(0), p_finished_wg(0),
+  p_kernel_args(0)
 {
     // Mutex
     pthread_mutex_init(&p_mutex, 0);
@@ -336,6 +287,9 @@ CPUKernelEvent::CPUKernelEvent(CPUDevice *device, KernelEvent *event)
 CPUKernelEvent::~CPUKernelEvent()
 {
     pthread_mutex_destroy(&p_mutex);
+
+    if (p_kernel_args)
+        std::free(p_kernel_args);
 }
 
 bool CPUKernelEvent::reserve()
@@ -386,6 +340,16 @@ CPUKernelWorkGroup *CPUKernelEvent::takeInstance()
     return wg;
 }
 
+void *CPUKernelEvent::kernelArgs() const
+{
+    return p_kernel_args; // Block stored by cacheKernelArgs(); 0 until one has been cached (see constructor)
+}
+
+void CPUKernelEvent::cacheKernelArgs(void *args)
+{
+    p_kernel_args = args; // The destructor std::free()s this pointer; any previously stored block is overwritten, not freed
+}
+
 /*
  * CPUKernelWorkGroup
  */
@@ -419,12 +383,107 @@ CPUKernelWorkGroup::~CPUKernelWorkGroup()
     p_cpu_event->workGroupFinished();
 }
 
+// Build the argument block handed to the kernel stub: every argument is stored at the
+// offset CPUKernel::typeOffset() assigns to it. Buffers for Arg::Local arguments are
+// malloc'ed here and appended to locals_to_free. Returns NULL if the block cannot be allocated.
+void *CPUKernelWorkGroup::callArgs(std::vector<void *> &locals_to_free)
+{
+    if (p_cpu_event->kernelArgs() && !p_kernel->kernel()->hasLocals())
+    {
+        // We have cached the args and can reuse them
+        return p_cpu_event->kernelArgs();
+    }
+
+    // We need to create them from scratch; this first pass only computes the block size
+    size_t args_size = 0;
+
+    for (unsigned int i=0; i<p_kernel->kernel()->numArgs(); ++i)
+    {
+        const Kernel::Arg &arg = p_kernel->kernel()->arg(i);
+        CPUKernel::typeOffset(args_size, arg.valueSize() * arg.vecDim());
+    }
+
+    void *rs = std::malloc(args_size);
+    if (!rs)
+        return NULL; // Allocation failed ("false" is not a null pointer constant in C++11)
+
+    size_t arg_offset = 0;
+
+    for (unsigned int i=0; i<p_kernel->kernel()->numArgs(); ++i)
+    {
+        const Kernel::Arg &arg = p_kernel->kernel()->arg(i);
+        size_t size = arg.valueSize() * arg.vecDim();
+        size_t offset = CPUKernel::typeOffset(arg_offset, size);
+
+        // Where to place the argument
+        unsigned char *target = (unsigned char *)rs;
+        target += offset;
+
+        // We may have to perform some changes in the values (buffers, etc)
+        switch (arg.kind())
+        {
+            case Kernel::Arg::Buffer:
+            {
+                MemObject *buffer = *(MemObject **)arg.data();
+
+                if (arg.file() == Kernel::Arg::Local)
+                {
+                    // Alloc a buffer and pass it to the kernel
+                    void *local_buffer = std::malloc(arg.allocAtKernelRuntime());
+                    locals_to_free.push_back(local_buffer);
+                    *(void **)target = local_buffer;
+                }
+                else
+                {
+                    if (!buffer)
+                    {
+                        // We can do that, just send NULL
+                        *(void **)target = NULL;
+                    }
+                    else
+                    {
+                        // Get the CPU buffer, allocate it and get its pointer
+                        CPUBuffer *cpubuf =
+                            (CPUBuffer *)buffer->deviceBuffer(p_kernel->device());
+                        void *buf_ptr = 0;
+
+                        buffer->allocate(p_kernel->device());
+                        buf_ptr = cpubuf->data();
+
+                        *(void **)target = buf_ptr;
+                    }
+                }
+
+                break;
+            }
+            case Kernel::Arg::Image2D:
+            case Kernel::Arg::Image3D:
+            {
+                // We need to ensure the image is allocated
+                Image2D *image = *(Image2D **)arg.data();
+                image->allocate(p_kernel->device());
+
+                // Fall through to the memcpy
+            }
+            default:
+                // Simply copy the arg's data into the buffer
+                std::memcpy(target, arg.data(), size);
+                break;
+        }
+    }
+
+    // Cache the arguments if we can do so
+    if (!p_kernel->kernel()->hasLocals())
+        p_cpu_event->cacheKernelArgs(rs);
+
+    return rs;
+}
+
 bool CPUKernelWorkGroup::run()
 {
     // Get the kernel function to call
-    bool free_after = p_kernel->kernel()->needsLocalAllocation();
-    std::vector<void *> local_to_free;
-    llvm::Function *kernel_func = p_kernel->callFunction(local_to_free);
+    std::vector<void *> locals_to_free;
+    llvm::Function *kernel_func = p_kernel->callFunction();
 
     if (!kernel_func)
         return false;
@@ -432,7 +491,11 @@ bool CPUKernelWorkGroup::run()
     Program *p = (Program *)p_kernel->kernel()->parent();
     CPUProgram *prog = (CPUProgram *)(p->deviceDependentProgram(p_kernel->device()));
 
-    p_kernel_func_addr = (void(*)())prog->jit()->getPointerToFunction(kernel_func);
+    p_kernel_func_addr =
+        (void(*)(void *))prog->jit()->getPointerToFunction(kernel_func);
+
+    // Get the arguments
+    p_args = callArgs(locals_to_free);
 
     // Tell the builtins this thread will run a kernel work group
     setThreadLocalWorkGroup(this);
@@ -446,7 +509,7 @@ bool CPUKernelWorkGroup::run()
     do
     {
         // Simply call the "call function", it and the builtins will do the rest
-        p_kernel_func_addr();
+        p_kernel_func_addr(p_args);
     } while (!p_had_barrier &&
              !incVec(p_work_dim, p_dummy_context.local_id, p_max_local_id));
 
@@ -468,16 +531,15 @@ bool CPUKernelWorkGroup::run()
         }
     }
 
-    // We may have some cleanup to do
-    if (free_after)
+    // Free the allocated locals
+    if (p_kernel->kernel()->hasLocals())
     {
-        for (size_t i=0; i<local_to_free.size(); ++i)
+        for (size_t i=0; i<locals_to_free.size(); ++i)
         {
-            std::free(local_to_free[i]);
+            std::free(locals_to_free[i]);
         }
 
-        // Bye function
-        kernel_func->eraseFromParent();
+        std::free(p_args);
     }
 
     return true;
diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h
index 14607bf..f28ca87 100644
--- a/src/core/cpu/kernel.h
+++ b/src/core/cpu/kernel.h
@@ -41,7 +41,8 @@ class CPUKernel : public DeviceKernel
         CPUDevice *device() const;
 
         llvm::Function *function() const;
-        llvm::Function *callFunction(std::vector<void *> &freeLocal);
+        llvm::Function *callFunction();
+        static size_t typeOffset(size_t &offset, size_t type_len);
 
     private:
         CPUDevice *p_device;
@@ -60,6 +61,7 @@ class CPUKernelWorkGroup
                            const size_t *work_group_index);
         ~CPUKernelWorkGroup();
 
+        void *callArgs(std::vector<void *> &locals_to_free);
         bool run();
 
         // Native functions
@@ -85,7 +87,8 @@ class CPUKernelWorkGroup
                p_max_local_id[MAX_WORK_DIMS],
                p_global_id_start_offset[MAX_WORK_DIMS];
 
-        void (*p_kernel_func_addr)();
+        void (*p_kernel_func_addr)(void *);
+        void *p_args;
 
         // Machinery to have barrier() working
         struct Context
@@ -115,6 +118,9 @@ class CPUKernelEvent
         bool finished(); /*!< All the work groups have finished */
         CPUKernelWorkGroup *takeInstance(); /*!< Must be called exactly one time after reserve(). Unlocks the event */
 
+        void *kernelArgs() const;
+        void cacheKernelArgs(void *args);
+
         void workGroupFinished();
 
     private:
@@ -124,6 +130,7 @@ class CPUKernelEvent
                p_max_work_groups[MAX_WORK_DIMS];
         size_t p_current_wg, p_finished_wg, p_num_wg;
         pthread_mutex_t p_mutex;
+        void *p_kernel_args;
 };
 
 }
diff --git a/src/core/kernel.cpp b/src/core/kernel.cpp
index 0c81937..0d64c92 100644
--- a/src/core/kernel.cpp
+++ b/src/core/kernel.cpp
@@ -17,7 +17,7 @@
 
 using namespace Coal;
 Kernel::Kernel(Program *program)
-: Object(Object::T_Kernel, program), p_local_args(false)
+: Object(Object::T_Kernel, program), p_has_locals(false)
 {
     // TODO: Say a kernel is attached to the program (that becomes unalterable)
 
@@ -102,7 +102,7 @@ cl_int Kernel::addFunction(DeviceInterface *device, llvm::Function *function,
 
             // If it's a __local argument, we'll have to allocate memory at run time
             if (file == Arg::Local)
-                p_local_args = true;
+                p_has_locals = true;
 
             kind = Arg::Buffer;
 
@@ -284,9 +284,9 @@ bool Kernel::argsSpecified() const
     return true;
 }
 
-bool Kernel::needsLocalAllocation() const
+bool Kernel::hasLocals() const
 {
-    return p_local_args;
+    return p_has_locals;
 }
 
 DeviceKernel *Kernel::deviceDependentKernel(DeviceInterface *device) const
@@ -518,3 +518,8 @@ const void *Kernel::Arg::value(unsigned short index) const
 
     return (const void *)data;
 }
+
+const void *Kernel::Arg::data() const
+{
+    return p_data; // Start of the stored argument bytes; callArgs() copies valueSize() * vecDim() bytes from here
+}
diff --git a/src/core/kernel.h b/src/core/kernel.h
index 84b2cf5..7301b99 100644
--- a/src/core/kernel.h
+++ b/src/core/kernel.h
@@ -69,6 +69,7 @@ class Kernel : public Object
                 bool defined() const;
                 size_t allocAtKernelRuntime() const;
                 const void *value(unsigned short index) const;
+                const void *data() const;
 
             private:
                 unsigned short p_vec_dim;
@@ -89,7 +90,7 @@ class Kernel : public Object
         DeviceKernel *deviceDependentKernel(DeviceInterface *device) const;
 
         bool argsSpecified() const;
-        bool needsLocalAllocation() const;  /*!< One or more arguments is __local */
+        bool hasLocals() const;
 
         cl_int info(cl_kernel_info param_name,
                     size_t param_value_size,
@@ -103,7 +104,7 @@ class Kernel : public Object
 
     private:
         std::string p_name;
-        bool p_local_args;
+        bool p_has_locals;
 
         struct DeviceDependent
         {
author	Denis Steckelmacher <steckdenis@yahoo.fr>	2011-08-17 16:04:30 +0200
committer	Denis Steckelmacher <steckdenis@yahoo.fr>	2011-08-17 16:04:30 +0200
commit	5b6894ddc065a818d70d579a80a4c132265e03aa (patch)
tree	b8cc83daae5ccbb0d7631bf06e4d209301a3b08e
parent	dcec4a2fd162926c0331a85a32b43de4db415bc0 (diff)