From 5b6894ddc065a818d70d579a80a4c132265e03aa Mon Sep 17 00:00:00 2001 From: Denis Steckelmacher Date: Wed, 17 Aug 2011 16:04:30 +0200 Subject: A kernel can be run multiple times with different args, handle that. The stub function becomes arg-independent, and the args are built for each KernelEvent. --- src/core/cpu/builtins.cpp | 2 +- src/core/cpu/kernel.cpp | 410 ++++++++++++++++++++++++++-------------------- src/core/cpu/kernel.h | 11 +- src/core/kernel.cpp | 13 +- src/core/kernel.h | 5 +- 5 files changed, 258 insertions(+), 183 deletions(-) diff --git a/src/core/cpu/builtins.cpp b/src/core/cpu/builtins.cpp index d0cff0c..9051f09 100644 --- a/src/core/cpu/builtins.cpp +++ b/src/core/cpu/builtins.cpp @@ -194,7 +194,7 @@ void CPUKernelWorkGroup::barrier(unsigned int flags) next->context.uc_stack.ss_size = p_stack_size; // Tell it to run the kernel function - makecontext(&next->context, p_kernel_func_addr, 0); + makecontext(&next->context, (void (*)())p_kernel_func_addr, 1, p_args); } // Switch to the next context diff --git a/src/core/cpu/kernel.cpp b/src/core/cpu/kernel.cpp index 33e0cdd..b9171b7 100644 --- a/src/core/cpu/kernel.cpp +++ b/src/core/cpu/kernel.cpp @@ -11,9 +11,6 @@ #include #include -#include -#include -#include #include #include #include @@ -26,23 +23,6 @@ using namespace Coal; -static llvm::Constant *getPointerConstant(llvm::LLVMContext &C, - llvm::Type *type, - void *value) -{ - llvm::Constant *rs = 0; - - if (sizeof(void *) == 4) - rs = llvm::ConstantInt::get(llvm::Type::getInt32Ty(C), (uint64_t)value); - else - rs = llvm::ConstantInt::get(llvm::Type::getInt64Ty(C), (uint64_t)value); - - // Cast to kernel's pointer type - rs = llvm::ConstantExpr::getIntToPtr(rs, type); - - return rs; -} - CPUKernel::CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function) : DeviceKernel(), p_device(device), p_kernel(kernel), p_function(function), p_call_function(0) @@ -136,12 +116,40 @@ CPUDevice *CPUKernel::device() const return p_device; } 
-llvm::Function *CPUKernel::callFunction(std::vector &freeLocal) +// From Wikipedia : http://www.wikipedia.org/wiki/Power_of_two#Algorithm_to_round_up_to_power_of_two +template +T next_power_of_two(T k) { + if (k == 0) + return 1; + k--; + for (int i=1; i> i; + return k+1; +} + +size_t CPUKernel::typeOffset(size_t &offset, size_t type_len) +{ + size_t rs = offset; + + // Align offset to stype_len + type_len = next_power_of_two(type_len); + size_t mask = ~(type_len - 1); + + while (rs & mask != rs) + rs++; + + // Where to try to place the next value + offset = rs + type_len; + + return rs; +} + +llvm::Function *CPUKernel::callFunction() { pthread_mutex_lock(&p_call_function_mutex); // If we can reuse the same function between work groups, do it - if (!p_kernel->needsLocalAllocation() && p_call_function) + if (p_call_function) { llvm::Function *rs = p_call_function; pthread_mutex_unlock(&p_call_function_mutex); @@ -149,171 +157,114 @@ llvm::Function *CPUKernel::callFunction(std::vector &freeLocal) return rs; } - // Create a LLVM function that calls the kernels with its arguments - // Code inspired from llvm/lib/ExecutionEngine/JIT/JIT.cpp - // Copyright The LLVM Compiler Infrastructure - llvm::FunctionType *k_func_type = p_function->getFunctionType(); - llvm::FunctionType *f_type = - llvm::FunctionType::get(p_function->getReturnType(), false); - llvm::Function *stub = llvm::Function::Create(f_type, - llvm::Function::InternalLinkage, - "", p_function->getParent()); + /* Create a stub function in the form of + * + * void stub(void *args) { + * kernel(*(int *)((char *)args + 0), + * *(float **)((char *)args + sizeof(int)), + * *(sampler_t *)((char *)args + sizeof(int) + sizeof(float *))); + * } + * + * In LLVM, it is exprimed in the form of : + * + * @stub(i8* args) { + * kernel( + * load(i32* bitcast(i8* getelementptr(i8* args, i64 0), i32*)), + * load(float** bitcast(i8* getelementptr(i8* args, i64 4), float**)), + * ... 
+ * ); + * } + */ + llvm::FunctionType *kernel_function_type = p_function->getFunctionType(); + llvm::FunctionType *stub_function_type = llvm::FunctionType::get( + p_function->getReturnType(), + llvm::Type::getInt8PtrTy( + p_function->getContext()), + false); + llvm::Function *stub_function = llvm::Function::Create( + stub_function_type, + llvm::Function::InternalLinkage, + "", + p_function->getParent()); // Insert a basic block - llvm::BasicBlock *block = llvm::BasicBlock::Create(p_function->getContext(), - "", stub); + llvm::BasicBlock *basic_block = llvm::BasicBlock::Create( + p_function->getContext(), + "", + stub_function); + // Create the function arguments + llvm::Argument &stub_arg = stub_function->getArgumentList().front(); llvm::SmallVector args; + size_t args_offset = 0; - // Add each kernel arg to args - for (unsigned int i=0; inumArgs(); ++i) + for (unsigned int i=0; igetNumParams(); ++i) { - const Kernel::Arg &a = p_kernel->arg(i); - llvm::Constant *arg_constant = 0; - - // To handle vectors (float4, etc) - llvm::SmallVector vec_elements; - - // Explore the vector elements - for (unsigned short k=0; kgetContext(), - llvm::APInt(8, *(uint8_t *)value)); - break; - - case Kernel::Arg::Int16: - C = llvm::ConstantInt::get(stub->getContext(), - llvm::APInt(16, *(uint16_t *)value)); - break; - - case Kernel::Arg::Int32: - case Kernel::Arg::Sampler: - C = llvm::ConstantInt::get(stub->getContext(), - llvm::APInt(32, *(uint32_t *)value)); - break; - - case Kernel::Arg::Int64: - C = llvm::ConstantInt::get(stub->getContext(), - llvm::APInt(64, *(uint64_t *)value)); - break; - - case Kernel::Arg::Float: - C = llvm::ConstantFP::get(stub->getContext(), - llvm::APFloat(*(float *)value)); - break; - - case Kernel::Arg::Double: - C = llvm::ConstantFP::get(stub->getContext(), - llvm::APFloat(*(double *)value)); - break; - - case Kernel::Arg::Buffer: - { - MemObject *buffer = *(MemObject **)value; - - if (a.file() == Kernel::Arg::Local) - { - // Alloc a buffer and pass 
it to the kernel - void *local_buffer = std::malloc(a.allocAtKernelRuntime()); - C = getPointerConstant(stub->getContext(), - k_func_type->getParamType(i), - local_buffer); - - freeLocal.push_back(local_buffer); - } - else - { - if (!buffer) - { - // We can do that, just send NULL - C = llvm::ConstantPointerNull::get( - llvm::cast( - k_func_type->getParamType(i))); - } - else - { - // Get the CPU buffer, allocate it and get its pointer - CPUBuffer *cpubuf = - (CPUBuffer *)buffer->deviceBuffer(p_device); - void *buf_ptr = 0; - - buffer->allocate(p_device); - - buf_ptr = cpubuf->data(); - - C = getPointerConstant(stub->getContext(), - k_func_type->getParamType(i), - buf_ptr); - } - } - - break; - } - - case Kernel::Arg::Image2D: - case Kernel::Arg::Image3D: - { - Image2D *image = *(Image2D **)value; - image->allocate(p_device); - - // Assign a pointer to the image object, the intrinsic functions - // will handle them - C = getPointerConstant(stub->getContext(), - k_func_type->getParamType(i), - (void *)image); - break; - } - - default: - break; - } - - // Add the vector element - vec_elements.push_back(C); - } - - // If the arg was a vector, handle it - if (a.vecDim() == 1) - { - arg_constant = vec_elements.front(); - } - else - { - arg_constant = llvm::ConstantVector::get(vec_elements); - } - - // Append the arg - args.push_back(arg_constant); + llvm::Type *param_type = kernel_function_type->getParamType(i); + llvm::Type *param_type_ptr = param_type->getPointerTo(); // We'll use pointers to the value + const Kernel::Arg &arg = p_kernel->arg(i); + + // Calculate the size of the arg + size_t arg_size = arg.valueSize() * arg.vecDim(); + + // Get where to place this argument + size_t arg_offset = typeOffset(args_offset, arg_size); + + // %1 = getelementptr(args, $arg_offset); + llvm::Value *getelementptr = llvm::GetElementPtrInst::CreateInBounds( + &stub_arg, + llvm::ConstantInt::get(stub_function->getContext(), + llvm::APInt(64, arg_offset)), + "", + basic_block); + + 
// %2 = bitcast(%1, $param_type_ptr) + llvm::Value *bitcast = new llvm::BitCastInst( + getelementptr, + param_type_ptr, + "", + basic_block); + + // %3 = load(%2) + llvm::Value *load = new llvm::LoadInst( + bitcast, + "", + false, + arg_size, // We ensure that an argument is always aligned on its size, it enables things like fast movaps + basic_block); + + // We have the value, send it to the function + args.push_back(load); } // Create the call instruction - llvm::CallInst *call_inst = llvm::CallInst::Create(p_function, args, "", block); + llvm::CallInst *call_inst = llvm::CallInst::Create( + p_function, + args, + "", + basic_block); call_inst->setCallingConv(p_function->getCallingConv()); call_inst->setTailCall(); // Create a return instruction to end the stub - llvm::ReturnInst::Create(stub->getContext(), block); + llvm::ReturnInst::Create( + p_function->getContext(), + basic_block); // Retain the function if it can be reused - if (!p_kernel->needsLocalAllocation()) - p_call_function = stub; + p_call_function = stub_function; pthread_mutex_unlock(&p_call_function_mutex); - return stub; + return stub_function; } /* * CPUKernelEvent */ CPUKernelEvent::CPUKernelEvent(CPUDevice *device, KernelEvent *event) -: p_device(device), p_event(event), p_current_wg(0), p_finished_wg(0) +: p_device(device), p_event(event), p_current_wg(0), p_finished_wg(0), + p_kernel_args(0) { // Mutex pthread_mutex_init(&p_mutex, 0); @@ -336,6 +287,9 @@ CPUKernelEvent::CPUKernelEvent(CPUDevice *device, KernelEvent *event) CPUKernelEvent::~CPUKernelEvent() { pthread_mutex_destroy(&p_mutex); + + if (p_kernel_args) + std::free(p_kernel_args); } bool CPUKernelEvent::reserve() @@ -386,6 +340,16 @@ CPUKernelWorkGroup *CPUKernelEvent::takeInstance() return wg; } +void *CPUKernelEvent::kernelArgs() const +{ + return p_kernel_args; +} + +void CPUKernelEvent::cacheKernelArgs(void *args) +{ + p_kernel_args = args; +} + /* * CPUKernelWorkGroup */ @@ -419,12 +383,107 @@ 
CPUKernelWorkGroup::~CPUKernelWorkGroup() p_cpu_event->workGroupFinished(); } +void *CPUKernelWorkGroup::callArgs(std::vector &locals_to_free) +{ + if (p_cpu_event->kernelArgs() && !p_kernel->kernel()->hasLocals()) + { + // We have cached the args and can reuse them + return p_cpu_event->kernelArgs(); + } + + // We need to create them from scratch + void *rs; + + size_t args_size = 0; + + for (unsigned int i=0; ikernel()->numArgs(); ++i) + { + const Kernel::Arg &arg = p_kernel->kernel()->arg(i); + CPUKernel::typeOffset(args_size, arg.valueSize() * arg.vecDim()); + } + + rs = std::malloc(args_size); + + if (!rs) + return false; + + size_t arg_offset = 0; + + for (unsigned int i=0; ikernel()->numArgs(); ++i) + { + const Kernel::Arg &arg = p_kernel->kernel()->arg(i); + size_t size = arg.valueSize() * arg.vecDim(); + size_t offset = CPUKernel::typeOffset(arg_offset, size); + + // Where to place the argument + unsigned char *target = (unsigned char *)rs; + target += offset; + + // We may have to perform some changes in the values (buffers, etc) + switch (arg.kind()) + { + case Kernel::Arg::Buffer: + { + MemObject *buffer = *(MemObject **)arg.data(); + + if (arg.file() == Kernel::Arg::Local) + { + // Alloc a buffer and pass it to the kernel + void *local_buffer = std::malloc(arg.allocAtKernelRuntime()); + locals_to_free.push_back(local_buffer); + *(void **)target = local_buffer; + } + else + { + if (!buffer) + { + // We can do that, just send NULL + *(void **)target = NULL; + } + else + { + // Get the CPU buffer, allocate it and get its pointer + CPUBuffer *cpubuf = + (CPUBuffer *)buffer->deviceBuffer(p_kernel->device()); + void *buf_ptr = 0; + + buffer->allocate(p_kernel->device()); + buf_ptr = cpubuf->data(); + + *(void **)target = buf_ptr; + } + } + + break; + } + case Kernel::Arg::Image2D: + case Kernel::Arg::Image3D: + { + // We need to ensure the image is allocated + Image2D *image = *(Image2D **)arg.data(); + image->allocate(p_kernel->device()); + + // Fall 
through to the memcpy + } + default: + // Simply copy the arg's data into the buffer + std::memcpy(target, arg.data(), size); + break; + } + } + + // Cache the arguments if we can do so + if (!p_kernel->kernel()->hasLocals()) + p_cpu_event->cacheKernelArgs(rs); + + return rs; +} + bool CPUKernelWorkGroup::run() { // Get the kernel function to call - bool free_after = p_kernel->kernel()->needsLocalAllocation(); - std::vector local_to_free; - llvm::Function *kernel_func = p_kernel->callFunction(local_to_free); + std::vector locals_to_free; + llvm::Function *kernel_func = p_kernel->callFunction(); if (!kernel_func) return false; @@ -432,7 +491,11 @@ bool CPUKernelWorkGroup::run() Program *p = (Program *)p_kernel->kernel()->parent(); CPUProgram *prog = (CPUProgram *)(p->deviceDependentProgram(p_kernel->device())); - p_kernel_func_addr = (void(*)())prog->jit()->getPointerToFunction(kernel_func); + p_kernel_func_addr = + (void(*)(void *))prog->jit()->getPointerToFunction(kernel_func); + + // Get the arguments + p_args = callArgs(locals_to_free); // Tell the builtins this thread will run a kernel work group setThreadLocalWorkGroup(this); @@ -446,7 +509,7 @@ bool CPUKernelWorkGroup::run() do { // Simply call the "call function", it and the builtins will do the rest - p_kernel_func_addr(); + p_kernel_func_addr(p_args); } while (!p_had_barrier && !incVec(p_work_dim, p_dummy_context.local_id, p_max_local_id)); @@ -468,16 +531,15 @@ bool CPUKernelWorkGroup::run() } } - // We may have some cleanup to do - if (free_after) + // Free the allocated locals + if (p_kernel->kernel()->hasLocals()) { - for (size_t i=0; ieraseFromParent(); + std::free(p_args); } return true; diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h index 14607bf..f28ca87 100644 --- a/src/core/cpu/kernel.h +++ b/src/core/cpu/kernel.h @@ -41,7 +41,8 @@ class CPUKernel : public DeviceKernel CPUDevice *device() const; llvm::Function *function() const; - llvm::Function *callFunction(std::vector &freeLocal); 
+ llvm::Function *callFunction(); + static size_t typeOffset(size_t &offset, size_t type_len); private: CPUDevice *p_device; @@ -60,6 +61,7 @@ class CPUKernelWorkGroup const size_t *work_group_index); ~CPUKernelWorkGroup(); + void *callArgs(std::vector &locals_to_free); bool run(); // Native functions @@ -85,7 +87,8 @@ class CPUKernelWorkGroup p_max_local_id[MAX_WORK_DIMS], p_global_id_start_offset[MAX_WORK_DIMS]; - void (*p_kernel_func_addr)(); + void (*p_kernel_func_addr)(void *); + void *p_args; // Machinery to have barrier() working struct Context @@ -115,6 +118,9 @@ class CPUKernelEvent bool finished(); /*!< All the work groups have finished */ CPUKernelWorkGroup *takeInstance(); /*!< Must be called exactly one time after reserve(). Unlocks the event */ + void *kernelArgs() const; + void cacheKernelArgs(void *args); + void workGroupFinished(); private: @@ -124,6 +130,7 @@ class CPUKernelEvent p_max_work_groups[MAX_WORK_DIMS]; size_t p_current_wg, p_finished_wg, p_num_wg; pthread_mutex_t p_mutex; + void *p_kernel_args; }; } diff --git a/src/core/kernel.cpp b/src/core/kernel.cpp index 0c81937..0d64c92 100644 --- a/src/core/kernel.cpp +++ b/src/core/kernel.cpp @@ -17,7 +17,7 @@ using namespace Coal; Kernel::Kernel(Program *program) -: Object(Object::T_Kernel, program), p_local_args(false) +: Object(Object::T_Kernel, program), p_has_locals(false) { // TODO: Say a kernel is attached to the program (that becomes unalterable) @@ -102,7 +102,7 @@ cl_int Kernel::addFunction(DeviceInterface *device, llvm::Function *function, // If it's a __local argument, we'll have to allocate memory at run time if (file == Arg::Local) - p_local_args = true; + p_has_locals = true; kind = Arg::Buffer; @@ -284,9 +284,9 @@ bool Kernel::argsSpecified() const return true; } -bool Kernel::needsLocalAllocation() const +bool Kernel::hasLocals() const { - return p_local_args; + return p_has_locals; } DeviceKernel *Kernel::deviceDependentKernel(DeviceInterface *device) const @@ -518,3 +518,8 @@ 
const void *Kernel::Arg::value(unsigned short index) const return (const void *)data; } + +const void *Kernel::Arg::data() const +{ + return p_data; +} diff --git a/src/core/kernel.h b/src/core/kernel.h index 84b2cf5..7301b99 100644 --- a/src/core/kernel.h +++ b/src/core/kernel.h @@ -69,6 +69,7 @@ class Kernel : public Object bool defined() const; size_t allocAtKernelRuntime() const; const void *value(unsigned short index) const; + const void *data() const; private: unsigned short p_vec_dim; @@ -89,7 +90,7 @@ class Kernel : public Object DeviceKernel *deviceDependentKernel(DeviceInterface *device) const; bool argsSpecified() const; - bool needsLocalAllocation() const; /*!< One or more arguments is __local */ + bool hasLocals() const; cl_int info(cl_kernel_info param_name, size_t param_value_size, @@ -103,7 +104,7 @@ class Kernel : public Object private: std::string p_name; - bool p_local_args; + bool p_has_locals; struct DeviceDependent { -- cgit v1.2.3