Diffstat (limited to 'src/core/cpu/builtins.cpp')
-rw-r--r-- | src/core/cpu/builtins.cpp | 211 |
1 files changed, 206 insertions, 5 deletions
diff --git a/src/core/cpu/builtins.cpp b/src/core/cpu/builtins.cpp
index 6fdab30..64eb945 100644
--- a/src/core/cpu/builtins.cpp
+++ b/src/core/cpu/builtins.cpp
@@ -1,6 +1,205 @@
+#include "builtins.h"
 #include "kernel.h"
+#include "../events.h"
+
+#include <sys/mman.h>
+#include <signal.h>
+
+#include <llvm/Function.h>
+#include <iostream>
+#include <cstring>
+
+using namespace Coal;
 
+/*
+ * TLS-related functions
+ */
 __thread Coal::CPUKernelWorkGroup *g_work_group;
+__thread void *work_items_data;
+__thread size_t work_items_size;
+
+void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current)
+{
+    g_work_group = current;
+}
+
+void *getWorkItemsData(size_t &size)
+{
+    size = work_items_size;
+    return work_items_data;
+}
+
+void setWorkItemsData(void *ptr, size_t size)
+{
+    work_items_data = ptr;
+    work_items_size = size;
+}
+
+/*
+ * Actual built-ins implementations
+ */
+cl_uint CPUKernelWorkGroup::getWorkDim() const
+{
+    return p_work_dim;
+}
+
+size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const
+{
+    if (dimindx > p_work_dim)
+        return 0;
+
+    return p_global_id_start_offset[dimindx] + p_current_context->local_id[dimindx];
+}
+
+size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const
+{
+    if (dimindx > p_work_dim)
+        return 1;
+
+    return p_event->global_work_size(dimindx);
+}
+
+size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const
+{
+    if (dimindx > p_work_dim)
+        return 1;
+
+    return p_event->local_work_size(dimindx);
+}
+
+size_t CPUKernelWorkGroup::getLocalID(cl_uint dimindx) const
+{
+    if (dimindx > p_work_dim)
+        return 0;
+
+    return p_current_context->local_id[dimindx];
+}
+
+size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const
+{
+    if (dimindx > p_work_dim)
+        return 1;
+
+    return (p_event->global_work_size(dimindx) /
+            p_event->local_work_size(dimindx));
+}
+
+size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const
+{
+    if (dimindx > p_work_dim)
+        return 0;
+
+    return p_index[dimindx];
+}
+
+size_t CPUKernelWorkGroup::getGlobalOffset(cl_uint dimindx) const
+{
+    if (dimindx > p_work_dim)
+        return 0;
+
+    return p_event->global_work_offset(dimindx);
+}
+
+void CPUKernelWorkGroup::barrier(unsigned int flags)
+{
+    p_had_barrier = true;
+
+    // Allocate or reuse TLS memory for the stacks (it isn't freed between
+    // the work groups, and even the kernels, so if we need less space than
+    // allocated, it's good)
+    if (!p_contexts)
+    {
+        if (p_current_work_item != 0)
+        {
+            // Completely abnormal, it means that not every work-items
+            // encounter the barrier
+            std::cerr << "*** Not every work-items of "
+                      << p_kernel->function()->getNameStr()
+                      << " calls barrier(); !"
+                      << std::endl;
+            return;
+        }
+
+        // Allocate or reuse the stacks
+        size_t contexts_size;
+        p_contexts = getWorkItemsData(contexts_size);
+        size_t needed_size = p_num_work_items * (p_stack_size + sizeof(Context));
+
+        if (!p_contexts || contexts_size < needed_size)
+        {
+            // We must allocate a new space
+            if (p_contexts)
+                munmap(p_contexts, contexts_size);
+
+            p_contexts = mmap(0, needed_size, PROT_EXEC | PROT_READ | PROT_WRITE, /* People say a stack must be executable */
+                              MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+
+            setWorkItemsData(p_contexts, contexts_size);
+        }
+
+        // Now that we have a real main context, initialize it
+        p_current_context = getContextAddr(0);
+        p_current_context->initialized = 1;
+        std::memset(p_current_context->local_id, 0, p_work_dim * sizeof(size_t));
+
+        getcontext(&p_current_context->context);
+    }
+
+    // Take the next context
+    p_current_work_item++;
+    if (p_current_work_item == p_num_work_items) p_current_work_item = 0;
+
+    Context *next = getContextAddr(p_current_work_item);
+    Context *main = getContextAddr(0);  // The context not created with makecontext
+
+    // If the next context isn't initialized, initialize it.
+    // Note: mmap zeroes the memory, so next->initialized == 0 if it isn't initialized
+    if (next->initialized == 0)
+    {
+        next->initialized = 1;
+
+        // local-id of next is the one of the current context, but incVec'ed
+        std::memcpy(next->local_id, p_current_context->local_id,
+                    MAX_WORK_DIMS * sizeof(size_t));
+
+        incVec(p_work_dim, next->local_id, p_max_local_id);
+
+        // Initialize the next context
+        if (getcontext(&next->context) != 0)
+            return;
+
+        // Get its stack. It is located a next + sizeof(Context)
+        char *stack = (char *)next;
+        stack += sizeof(Context);
+
+        next->context.uc_link = &main->context;
+        next->context.uc_stack.ss_sp = stack;
+        next->context.uc_stack.ss_size = p_stack_size;
+
+        // Tell it to run the kernel function
+        makecontext(&next->context, p_kernel_func_addr, 0);
+    }
+
+    // Switch to the next context
+    ucontext_t *cur = &p_current_context->context;
+    p_current_context = next;
+
+    swapcontext(cur, &next->context);
+
+    // When we return here, it means that all the other work items encountered
+    // a barrier and that we returned to this one. We can continue.
+}
+
+void CPUKernelWorkGroup::builtinNotFound(const std::string &name) const
+{
+    std::cout << "OpenCL: Non-existant builtin function " << name
+              << " found in kernel " << p_kernel->function()->getNameStr()
+              << '.' << std::endl;
+}
 
+/*
+ * Built-in functions
+ */
 static size_t get_global_id(cl_uint dimindx)
 {
@@ -42,14 +241,14 @@ static size_t get_global_offset(uint dimindx)
     return g_work_group->getGlobalOffset(dimindx);
 }
 
-/*
- * Utility functions
- */
-void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current)
+static void barrier(unsigned int flags)
 {
-    g_work_group = current;
+    g_work_group->barrier(flags);
 }
 
+/*
+ * Bridge between LLVM and us
+ */
 static void unimplemented_stub()
 {
 }
@@ -72,6 +271,8 @@ void *getBuiltin(const std::string &name)
         return (void *)&get_group_id;
     else if (name == "get_global_offset")
        return (void *)&get_global_offset;
+    else if (name == "barrier")
+        return (void *)&barrier;
 
     // Function not found
    g_work_group->builtinNotFound(name);
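
The barrier() added in this commit multiplexes all work-items of a group onto one CPU thread with the <ucontext.h> primitives: each work-item gets its own stack, and reaching barrier() swapcontext()s to the next item until every item has arrived. The following standalone sketch (not Coal code) shows only that switching mechanism; g_main, g_item and item_func are illustrative names chosen for the example.

// Minimal sketch of ucontext-based cooperative switching, the mechanism
// barrier() relies on. Two "work items": the caller and one side context.
#include <ucontext.h>
#include <iostream>
#include <vector>

static ucontext_t g_main;   // context of the caller (the "main" work item)
static ucontext_t g_item;   // context of the second work item

static void item_func()
{
    std::cout << "item: before barrier" << std::endl;
    // Yield back to the main context, as barrier() does with swapcontext()
    swapcontext(&g_item, &g_main);
    std::cout << "item: after barrier" << std::endl;
    // Returning ends this context; uc_link sends control back to g_main
}

int main()
{
    std::vector<char> stack(64 * 1024);     // private stack for the item

    getcontext(&g_item);                    // start from a valid context
    g_item.uc_link = &g_main;               // where to go when item_func returns
    g_item.uc_stack.ss_sp = stack.data();
    g_item.uc_stack.ss_size = stack.size();
    makecontext(&g_item, item_func, 0);     // run item_func on that stack

    std::cout << "main: switching to item" << std::endl;
    swapcontext(&g_main, &g_item);          // run the item until it "hits the barrier"
    std::cout << "main: item is waiting at the barrier" << std::endl;
    swapcontext(&g_main, &g_item);          // resume it past the barrier
    std::cout << "main: done" << std::endl;
    return 0;
}

Built with g++ on Linux, this prints the messages in the order main, item, main, item, main, mirroring how every work-item runs up to its barrier() call before any of them continues past it.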
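barrier() also implies a memory layout for the per-work-item data: a single mmap()ed block holds, for each work-item, a Context header immediately followed by that item's private stack (needed_size = p_num_work_items * (p_stack_size + sizeof(Context)), and the stack pointer is taken at next + sizeof(Context)). The sketch below illustrates that layout under stated assumptions; the Context struct and the getContextAddr() helper are reconstructions of how barrier() uses them, since their definitions are not part of this diff.

// Sketch of the [Context][stack][Context][stack]... layout implied above.
// Field set of Context and getContextAddr() are assumptions, not Coal code.
#include <sys/mman.h>
#include <ucontext.h>
#include <cstddef>
#include <cstdio>

static const unsigned MAX_WORK_DIMS = 3;

struct Context                      // mirrors the fields barrier() touches
{
    size_t local_id[MAX_WORK_DIMS];
    int initialized;
    ucontext_t context;
};

int main()
{
    size_t num_work_items = 4;
    size_t stack_size = 8192;
    size_t needed_size = num_work_items * (stack_size + sizeof(Context));

    // PROT_EXEC because the block doubles as the work items' stacks
    void *contexts = mmap(0, needed_size, PROT_EXEC | PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
    if (contexts == MAP_FAILED)
        return 1;

    // Hypothetical equivalent of getContextAddr(index)
    auto getContextAddr = [&](size_t index) -> Context * {
        char *base = (char *)contexts;
        return (Context *)(base + index * (sizeof(Context) + stack_size));
    };

    for (size_t i = 0; i < num_work_items; ++i)
    {
        Context *ctx = getContextAddr(i);
        char *stack = (char *)ctx + sizeof(Context);   // stack follows the header
        std::printf("item %zu: context at %p, stack at %p\n",
                    i, (void *)ctx, (void *)stack);
    }

    munmap(contexts, needed_size);
    return 0;
}

Because mmap() returns zeroed pages, a freshly allocated Context has initialized == 0, which is exactly the test barrier() uses to decide whether a work-item's context still has to be created with makecontext().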