diff options
Diffstat (limited to 'src/gallium/state_trackers/guda/guda.c')
-rw-r--r-- | src/gallium/state_trackers/guda/guda.c | 575 |
1 file changed, 575 insertions, 0 deletions
diff --git a/src/gallium/state_trackers/guda/guda.c b/src/gallium/state_trackers/guda/guda.c new file mode 100644 index 00000000000..92dc662bcb9 --- /dev/null +++ b/src/gallium/state_trackers/guda/guda.c @@ -0,0 +1,575 @@ + +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> + +#include <libelf.h> +#include <gelf.h> + +#include <crt/host_runtime.h> +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "pipe/p_state.h" +#include "pipe-loader/pipe_loader.h" +#include "util/list.h" + +/******************************************************************************/ +/* State tracker internal functions / global state */ +/******************************************************************************/ + +#define USE_FAKE_CONTEXT 0 + +#define UNUSED_BUFFER_OFFSET ~0 +struct cuda_ctx; + +struct cuda_dev { + struct pipe_loader_device *pipe_dev; + struct pipe_screen *screen; +}; + +struct cuda_buffer { + struct list_head list; + unsigned offset; + struct pipe_resource *res; +}; + +enum cuda_arg_type { + CUDA_ARG_SCALAR, + CUDA_ARG_PTR +}; + +struct cuda_arg_info { + struct list_head list; + enum cuda_arg_type type; + unsigned argno; + unsigned offset; + unsigned size; +}; + +struct cuda_exec_state { + struct list_head list; + dim3 grid_dim; + dim3 block_dim; + size_t shared_mem; + cudaStream_t stream; + char *args; + unsigned arg_size; + struct cuda_arg_info arg_info; + void *compute_state; +}; + +struct cuda_ctx { + struct pipe_context *pipe_ctx; + struct cuda_dev *dev; + struct cuda_buffer buffers; + struct cuda_exec_state exec_state; +}; + +// FIXME Support more than one device function; +struct cuda_fatbin { + char *ptr; + char *dev_func_name; +} global_fatbin; + +struct cuda_dev **cuda_dev_list = NULL; + +// FIXME primary_ctx should be shared across all CPU threads. +struct cuda_ctx **primary_ctx_list = NULL; +struct cuda_ctx *current_ctx = NULL; +size_t current_dev = 0; + +#define CUDA_TRACE(fmt, ...) 
printf("%s: "fmt, __PRETTY_FUNCTION__, __VA_ARGS__) +#define CUDA_TRACE_NOARG(fmt) printf("%s: "fmt, __PRETTY_FUNCTION__) + +static int init_cuda_dev_list() { + struct pipe_loader_device **pipe_dev_list; + int n = pipe_loader_probe(NULL, 0); + int i,j; + + if (n == 0) { + return 0; + } + + pipe_dev_list = malloc(n * sizeof(struct pipe_loader_device*)); + cuda_dev_list = malloc(n * sizeof(struct cuda_dev*)); + + n = pipe_loader_probe(pipe_dev_list, n); + + for (i = 0, j = 0; i < n; i++) { + struct pipe_loader_device *pipe_dev = pipe_dev_list[i]; + struct pipe_screen *screen = pipe_loader_create_screen(pipe_dev); + struct cuda_dev *dev; + if (!screen || !screen->get_param(screen, PIPE_CAP_COMPUTE)) { + if (screen) { + screen->destroy(screen); + } + pipe_loader_release(&pipe_dev, 1); + continue; + } + + dev = malloc(sizeof(struct cuda_dev)); + dev->pipe_dev = pipe_dev; + dev->screen = screen; + cuda_dev_list[j] = dev; + } + + if (j == 0) { + free(pipe_dev_list); + free(cuda_dev_list); + pipe_dev_list = NULL; + cuda_dev_list = NULL; + } + return j; +} + +static void init_primary_ctx_list(int n) { + primary_ctx_list = calloc(n, sizeof(struct cuda_ctx*)); +} + +static struct cuda_ctx *create_cuda_ctx(struct cuda_dev *dev) { + struct cuda_ctx *ctx = malloc(sizeof(struct cuda_ctx)); + if (dev) + ctx->pipe_ctx = dev->screen->context_create(dev->screen, NULL, + PIPE_CONTEXT_COMPUTE_ONLY); + ctx->dev = dev; + memset(&ctx->exec_state, 0, sizeof(ctx->exec_state)); + list_inithead(&ctx->exec_state.list); + memset(&ctx->buffers, 0, sizeof(ctx->buffers)); + list_inithead(&ctx->buffers.list); + return ctx; +} + +static struct cuda_ctx *get_ctx() { +#if USE_FAKE_CONTEXT + if (!current_ctx) { + current_ctx = create_cuda_ctx(NULL); + } +#endif + if (current_ctx) { + return current_ctx; + } + + // There is no current context, so we need to use the primary context. 
+ if (!cuda_dev_list) { + // Is this the best pla + int n = init_cuda_dev_list(); + if (n == 0) { + return NULL; + } + init_primary_ctx_list(n); + } + + if (!primary_ctx_list[current_dev]) { + primary_ctx_list[current_dev] = create_cuda_ctx(cuda_dev_list[current_dev]); + } + + return primary_ctx_list[current_dev]; +} + +static struct cuda_exec_state *get_exec_state() { + struct cuda_ctx *ctx = get_ctx(); + if (!ctx) { + return NULL; + } + return list_first_entry(&ctx->exec_state.list, struct cuda_exec_state, list); +} + + +/******************************************************************************/ +/* Functions calls generated by clang */ +/******************************************************************************/ + +cudaError_t cudaSetupArgument(const void* arg, size_t size, size_t offset); +cudaError_t cudaLaunch(const void* func); +void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, + char *deviceFun, const char *deviceName, + int thread_limit, void *tid, void *bid, + void *bDim, void *gDim, int *wSize); +void** __cudaRegisterFatBinary(void *fatCubin); +extern void __cudaUnregisterFatBinary(void **fatCubinHandle); + + +cudaError_t cudaSetupArgument(const void* arg, size_t size, size_t offset) { + CUDA_TRACE("arg = %p size = %ld offset = %ld)\n", arg, size, offset); + struct cuda_exec_state *exec_state = get_exec_state(); + struct cuda_ctx *ctx = get_ctx(); + if (!exec_state) { + return cudaErrorNoDevice; + } + + if (offset + size > exec_state->arg_size) { + exec_state->arg_size *= 2; + if (!realloc(exec_state->args, exec_state->arg_size)) + return cudaErrorUnknown; + } + + list_for_each_entry(struct cuda_buffer, buf, &ctx->buffers.list, list) { + if (buf->res == *(void**)arg) { + CUDA_TRACE("Arg is a buffer: %p\n", *(void**)arg); + buf->offset = offset; + } + } + memcpy(exec_state->args + offset, arg, size); + return cudaSuccess; +} + +#if 0 +// This function extrace what I think is NV ISA for the given function +static void 
extract_binary(unsigned char *elf_bin, unsigned elf_size, const char *func_name) {
   Elf *elf = elf_memory((char *)elf_bin, elf_size);
   size_t section_str_index;
   elf_getshdrstrndx(elf, &section_str_index);

   for (Elf_Scn *section = elf_nextscn(elf, NULL); section; section = elf_nextscn(elf, section)) {
      GElf_Shdr header;
      if (gelf_getshdr(section, &header) != &header) {
         //XXX: Error here
      }

      if (strcmp(elf_strptr(elf, section_str_index, header.sh_name), ".symtab"))
         continue;

      Elf_Data *const symtab_data = elf_getdata(section, NULL);
      GElf_Sym symbol;
      GElf_Sym *s;
      unsigned i = 0;

      while ((s = gelf_getsym(symtab_data, i++, &symbol))) {
         Elf_Scn *symbol_scn;
         GElf_Shdr symbol_shdr;
         const char *name = elf_strptr(elf, header.sh_link, s->st_name);
         size_t symbol_offset;
         printf("func_name = %s symbol name = %s idx = %d\n", func_name, name, s->st_shndx);
         // FIXME: Not safe!
         if (strcmp(name, func_name))
            continue;

         symbol_scn = elf_getscn(elf, s->st_shndx);
         gelf_getshdr(symbol_scn, &symbol_shdr);
         symbol_offset = symbol_shdr.sh_offset + s->st_value;
         printf("size = %d\n", s->st_size);
         printf("offset = %d\n", symbol_offset);
         for (int j = 0; j < s->st_size; j++) {
            printf("%x", *(elf_bin + symbol_offset + j));
         }
      }
   }

}
#endif


/* Parse the ".nv.info.<func_name>" section of the fatbin ELF and append
 * one cuda_arg_info entry per kernel argument to `arg_info`.  Returns the
 * total argument block size in bytes (used as req_input_mem).
 * NOTE(review): the 16-byte header and per-entry layout were reverse
 * engineered — see the FIXMEs below. */
static int read_arginfo(char *elf_bin, unsigned elf_size,
                        const char *func_name, struct cuda_arg_info *arg_info) {

   Elf *elf = elf_memory(elf_bin, elf_size);
   size_t section_str_index;
   elf_getshdrstrndx(elf, &section_str_index);
   int req_input_mem = 0;

   for (Elf_Scn *section = elf_nextscn(elf, NULL); section; section = elf_nextscn(elf, section)) {
      GElf_Shdr header;
      Elf_Data *section_data = NULL;
      unsigned int *data;
      if (gelf_getshdr(section, &header) != &header) {
         //XXX: Error here
      }
      const char *section_name =
         elf_strptr(elf, section_str_index, header.sh_name);

      if (strncmp(section_name, ".nv.info.", 9) ||
          strcmp(section_name + 9, func_name))
         continue;

      section_data = elf_getdata(section, section_data);
      data = section_data->d_buf;

      // FIXME: I have no idea what the first 16 bytes are.
      data += 4;

      /* '<' rather than '!=' so a malformed section size cannot step the
       * cursor past the end and loop forever. */
      while ((char *)data < (char *)section_data->d_buf + header.sh_size) {
         unsigned dword2, dword3;
         struct cuda_arg_info *arg = malloc(sizeof(struct cuda_arg_info));
         if (!arg)
            break;
         list_inithead(&arg->list);
         list_add(&arg->list, &arg_info->list);

         // FIXME: I have no idea what the first 8 bytes are.
         data += 2;
         dword2 = *data++;
         dword3 = *data++;
         // FIXME: arg->type is never filled in.
         arg->argno = dword2 & 0xffff;
         arg->offset = dword2 >> 16;
         // FIXME: Not sure about this one.
         arg->size = 0x2 << ((dword3 >> 20) & 0xf);
         req_input_mem += arg->size;
         CUDA_TRACE("argno = %d offset = %d size = %d\n", arg->argno, arg->offset, arg->size);
      }
   }

   /* The original leaked the libelf handle. */
   elf_end(elf);
   return req_input_mem;
}

/* Smallest work dimensionality that still covers the launch. */
static int get_work_dim(dim3 block_dim, dim3 grid_dim) {
   if (block_dim.z * grid_dim.z > 1)
      return 3;
   if (block_dim.y * grid_dim.y > 1)
      return 2;
   return 1;
}

cudaError_t cudaLaunch(const void* func) {
   CUDA_TRACE("func = %p\n", func);
   // FIXME: I think need to use func to look up which function to execute. We
   // currently just execute the most recently registered function.
+ struct cuda_exec_state *exec_state = get_exec_state(); + struct cuda_ctx *ctx = get_ctx(); + struct cuda_arg_info arg_info; + if (!exec_state) { + return cudaErrorNoDevice; + } + + char *nv_fatbin = global_fatbin.ptr; + //uint32_t nv_fatbin_magic = *(uint32_t*)(nv_fatbin); + //uint32_t nv_fatbin_version = *(uint32_t*)(nv_fatbin + 4); + //uint64_t nv_fatbin_size = *(uint64_t *)(nv_fatbin + 8); + char *fatbin = (nv_fatbin + 16); + //uint32_t fatbin_flags = *(uint32_t*)(fatbin); + uint32_t elf_offset = *(uint32_t*)(fatbin + 4); + char *elf = fatbin + elf_offset; + uint32_t elf_size = *(uint32_t*)(fatbin + 8); + char *ptx_bin = elf + elf_size; + //uint32_t unknown = *(uint32_t*)(ptx_bin); + uint32_t ptx_text_offset = *(uint32_t*)(ptx_bin + 4); + char *ptx_text = ptx_bin + ptx_text_offset; + uint32_t ptx_text_size = *(uint32_t*)(ptx_bin + 8); + struct pipe_compute_state compute_state; + struct pipe_grid_info info; + + memset(&arg_info, 0, sizeof(arg_info)); + list_inithead(&arg_info.list); + // FIXME: Should we be compiling ptx in __cudaRegisterFunction() ? 
+ + // TOOO: compile_ptx_to_isa(); + for (unsigned i = 0; i < ptx_text_size; i++) { + printf("%c", ptx_text[i]); + } + + // TODO: Create compute_state and bind + memset(&compute_state, 0, sizeof(compute_state)); + compute_state.ir_type = PIPE_SHADER_IR_PTX; + compute_state.req_input_mem = read_arginfo(elf, elf_size, global_fatbin.dev_func_name, &arg_info); + compute_state.prog = malloc(ptx_text_size + 4); + *(uint32_t *)compute_state.prog = ptx_text_size; + memcpy(((uint32_t*)compute_state.prog) + 1, ptx_text, ptx_text_size); + + info.work_dim = get_work_dim(exec_state->block_dim, exec_state->grid_dim); + memcpy(info.block, &exec_state->block_dim, sizeof(info.block)); + memcpy(info.grid, &exec_state->grid_dim, sizeof(info.grid)); + info.pc = 0; + info.input = exec_state->args; + +#if !USE_FAKE_CONTEXT + exec_state->compute_state = + ctx->pipe_ctx->create_compute_state(ctx->pipe_ctx, &compute_state); + ctx->pipe_ctx->bind_compute_state(ctx->pipe_ctx, exec_state->compute_state); + // TODO: bind_sampler_states + // TODO: set_sampler_views + // TODO: set_compute_resources + list_for_each_entry(struct cuda_buffer, buf, &ctx->buffers.list, list) { + char *arg_start = exec_state->args + buf->offset; + ctx->pipe_ctx->set_global_binding(ctx->pipe_ctx, 0, 1, &buf->res, + (uint32_t**)&arg_start); + } + + CUDA_TRACE("dim = %d block_size = <%d, %d, %d> grid_size = <%d, %d, %d>" + " pc = %d input_size = %d\n", info.work_dim, info.block[0], info.block[1], + info.block[2], info.grid[0], info.grid[1], info.grid[2], + info.pc, compute_state.req_input_mem); + + ctx->pipe_ctx->launch_grid(ctx->pipe_ctx, &info); + // TODO: UBNIND + ctx->pipe_ctx->memory_barrier(ctx->pipe_ctx, PIPE_BARRIER_GLOBAL_BUFFER); +#else + list_for_each_entry(struct cuda_buffer, buf, &ctx->buffers.list, list) { + char *arg_start = exec_state->args + buf->offset; + *(struct pipe_resource**)arg_start = buf->res; + } + + CUDA_TRACE("dim = %d block_size = <%d, %d, %d> grid_size = <%d, %d, %d>" + " pc = %d input_size 
= %d\n", info.work_dim, info.block[0], info.block[1], + info.block[2], info.grid[0], info.grid[1], info.grid[2], + info.pc, compute_state.req_input_mem); + + CUDA_TRACE_NOARG("Inputs: \n"); + for (unsigned i = 0; i < compute_state.req_input_mem; i+=4) { + CUDA_TRACE("Dword %d: %x\n", i / 4, *(int*)(exec_state->args + i)); + } +#endif + + return cudaSuccess; +} + +void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, + char *deviceFun, const char *deviceName, + int thread_limit, void *tid, void *bid, + void *bDim, void *gDim, int *wSize) { + CUDA_TRACE("fatCubinHandle = %p hostFun = %s " + "deviceFun = %s deviceName = %s thread_limit = %d " + "tid = %p bid = %p bDim = %p gDim = %p wSize = %p\n", + fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, + tid, bid, bDim, gDim, wSize); + struct cuda_exec_state *exec_state = get_exec_state(); + if (!exec_state) { + return; + } + global_fatbin.dev_func_name = deviceFun; +} + +void** __cudaRegisterFatBinary(void *fatCubin) { + CUDA_TRACE("fatCubin = %p\n", fatCubin); + uint32_t magic = *(uint32_t*)fatCubin; + uint32_t version = *(uint32_t*)(fatCubin + 4); + char *nv_fatbin = *(char**)(fatCubin + 8); + + CUDA_TRACE("magic = %x version = %x nv_fatbin = %p\n", magic, version, nv_fatbin); + global_fatbin.ptr = nv_fatbin; + + // FIXME: What should we return here? 
+ return (void**)&global_fatbin.ptr; +} + + +void __cudaUnregisterFatBinary(void **fatCubinHandle) { + CUDA_TRACE("fatCubinHandle = %p\n", fatCubinHandle); +} + +/******************************************************************************/ +/* Functions calls called by users */ +/******************************************************************************/ +cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); +cudaError_t cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem /*__dv(0)*/, cudaStream_t stream /*__dv(0)*/); +cudaError_t cudaMalloc(void **devPtr, size_t size); +cudaError_t cudaDeviceSynchronize(void); +cudaError_t cudaDeviceReset(void); + +cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { + CUDA_TRACE("dst = %p src = %p count = %zd kind = %d\n", dst, src, count, kind); + struct cuda_ctx *ctx = get_ctx(); + struct pipe_transfer *transfer_src = NULL, *transfer_dst = NULL; + struct pipe_box box; + + if (!ctx || !ctx->dev) { + return cudaErrorNoDevice; + } + + memset(&box, 0, sizeof(box)); + box.width = count; + box.height = 1; + box.depth = 1; + + if (kind == cudaMemcpyHostToDevice || kind == cudaMemcpyDeviceToDevice) { + dst = ctx->pipe_ctx->transfer_map(ctx->pipe_ctx, + (struct pipe_resource *)dst, + 0, PIPE_TRANSFER_WRITE, &box, + &transfer_dst); + if (!dst) { + return cudaErrorInvalidDevicePointer; + } + } + + if (kind == cudaMemcpyDeviceToHost || kind == cudaMemcpyDeviceToDevice) { + src = ctx->pipe_ctx->transfer_map(ctx->pipe_ctx, + (struct pipe_resource *)src, + 0, PIPE_TRANSFER_READ, &box, + &transfer_src); + if (!src) { + return cudaErrorInvalidDevicePointer; + } + } + + memcpy(dst, src, count); + + if (transfer_src) { + ctx->pipe_ctx->transfer_unmap(ctx->pipe_ctx, transfer_src); + } + + if (transfer_dst) { + ctx->pipe_ctx->transfer_unmap(ctx->pipe_ctx, transfer_dst); + } + + return cudaSuccess; +} + +cudaError_t cudaConfigureCall(dim3 gridDim, dim3 
blockDim, size_t sharedMem , cudaStream_t stream) { + CUDA_TRACE("gridDim = <%d, %d, %d> blockDim = <%d, %d, %d> sharedMem = %zd stream = ?\n", + gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, + sharedMem); + struct cuda_ctx *ctx = get_ctx(); + struct cuda_exec_state *state; + + if (!ctx) { + return cudaErrorNoDevice; + } + + state = malloc(sizeof(struct cuda_exec_state)); + state->grid_dim = gridDim; + state->block_dim = blockDim; + state->shared_mem = sharedMem; + state->stream = stream; + state->arg_size = 1024; + state->args = malloc(state->arg_size); + list_inithead(&state->list); + + list_add(&state->list, &ctx->exec_state.list); + return 0; +} + +cudaError_t cudaMalloc(void **devPtr, size_t size) { + CUDA_TRACE("devPtr = %p size = %ld\n", devPtr, size); + struct cuda_ctx *ctx = get_ctx(); + struct pipe_resource info; + struct pipe_resource *res; + struct cuda_buffer *buf; + + if (!ctx) { + return cudaErrorNoDevice; + } + + info.width0 = size; + info.height0 = 1; + info.depth0 = 1; + info.array_size = 1; + info.target = PIPE_BUFFER; + info.bind = PIPE_BIND_SAMPLER_VIEW | + PIPE_BIND_COMPUTE_RESOURCE | + PIPE_BIND_GLOBAL; + +#if USE_FAKE_CONTEXT + (void)info; + res = malloc(sizeof(struct pipe_resource)); +#else + res = ctx->dev->screen->resource_create(ctx->dev->screen, &info); + if (!res) { + return cudaErrorMemoryAllocation; + } +#endif + + buf = malloc(sizeof(struct cuda_buffer)); + buf->res = res; + buf->offset = UNUSED_BUFFER_OFFSET; + list_add(&buf->list, &ctx->buffers.list); + + *devPtr = res; + + return cudaSuccess; +} + +cudaError_t cudaDeviceSynchronize(void) { + CUDA_TRACE_NOARG("\n"); + return 0; +} + +cudaError_t cudaDeviceReset(void) { + CUDA_TRACE_NOARG("\n"); + return 0; +} |