summaryrefslogtreecommitdiff
path: root/src/gallium/state_trackers/guda/guda.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/state_trackers/guda/guda.c')
-rw-r--r--src/gallium/state_trackers/guda/guda.c575
1 files changed, 575 insertions, 0 deletions
diff --git a/src/gallium/state_trackers/guda/guda.c b/src/gallium/state_trackers/guda/guda.c
new file mode 100644
index 00000000000..92dc662bcb9
--- /dev/null
+++ b/src/gallium/state_trackers/guda/guda.c
@@ -0,0 +1,575 @@
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include <libelf.h>
+#include <gelf.h>
+
+#include <crt/host_runtime.h>
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "pipe-loader/pipe_loader.h"
+#include "util/list.h"
+
+/******************************************************************************/
+/* State tracker internal functions / global state */
+/******************************************************************************/
+
/* When non-zero, skip real pipe-driver calls and use stub contexts/resources
 * (debugging aid; see the #if USE_FAKE_CONTEXT paths below). */
#define USE_FAKE_CONTEXT 0

/* Sentinel: this buffer has not (yet) been passed as a kernel argument. */
#define UNUSED_BUFFER_OFFSET ~0
struct cuda_ctx;

/* One compute-capable Gallium device (pipe-loader handle plus its screen). */
struct cuda_dev {
   struct pipe_loader_device *pipe_dev;
   struct pipe_screen *screen;
};

/* A device allocation made by cudaMalloc().  'offset' is the byte offset of
 * this buffer's handle inside the pending launch's argument block, or
 * UNUSED_BUFFER_OFFSET when the buffer is not referenced by it. */
struct cuda_buffer {
   struct list_head list;
   unsigned offset;
   struct pipe_resource *res;
};

enum cuda_arg_type {
   CUDA_ARG_SCALAR,
   CUDA_ARG_PTR
};

/* Metadata for one kernel argument, parsed out of the ".nv.info.<func>"
 * ELF section of the fat binary by read_arginfo(). */
struct cuda_arg_info {
   struct list_head list;
   enum cuda_arg_type type;
   unsigned argno;   /* argument index */
   unsigned offset;  /* byte offset within the kernel input block */
   unsigned size;    /* argument size in bytes */
};

/* Pending launch state: accumulated by cudaConfigureCall() and
 * cudaSetupArgument(), consumed by cudaLaunch(). */
struct cuda_exec_state {
   struct list_head list;
   dim3 grid_dim;
   dim3 block_dim;
   size_t shared_mem;    /* NOTE(review): recorded but not yet used by cudaLaunch() */
   cudaStream_t stream;  /* NOTE(review): recorded but not yet used */
   char *args;           /* flat kernel-argument block */
   unsigned arg_size;    /* allocated size of 'args' in bytes */
   struct cuda_arg_info arg_info;
   void *compute_state;  /* driver object from create_compute_state() */
};

/* Per-device context: a compute-only pipe context plus list heads for the
 * live buffers and queued launches (newest launch at the front). */
struct cuda_ctx {
   struct pipe_context *pipe_ctx;
   struct cuda_dev *dev;
   struct cuda_buffer buffers;        /* list head only; other fields unused */
   struct cuda_exec_state exec_state; /* list head only; other fields unused */
};

// FIXME Support more than one device function;
struct cuda_fatbin {
   char *ptr;           /* start of the NV fat binary image */
   char *dev_func_name; /* name of the most recently registered kernel */
} global_fatbin;

/* All compute-capable devices found by init_cuda_dev_list(); NULL until
 * first use. */
struct cuda_dev **cuda_dev_list = NULL;

// FIXME primary_ctx should be shared across all CPU threads.
struct cuda_ctx **primary_ctx_list = NULL;
struct cuda_ctx *current_ctx = NULL;
size_t current_dev = 0;

/* Trace helpers: prefix every message with the calling function's name. */
#define CUDA_TRACE(fmt, ...) printf("%s: "fmt, __PRETTY_FUNCTION__, __VA_ARGS__)
#define CUDA_TRACE_NOARG(fmt) printf("%s: "fmt, __PRETTY_FUNCTION__)
+
+static int init_cuda_dev_list() {
+ struct pipe_loader_device **pipe_dev_list;
+ int n = pipe_loader_probe(NULL, 0);
+ int i,j;
+
+ if (n == 0) {
+ return 0;
+ }
+
+ pipe_dev_list = malloc(n * sizeof(struct pipe_loader_device*));
+ cuda_dev_list = malloc(n * sizeof(struct cuda_dev*));
+
+ n = pipe_loader_probe(pipe_dev_list, n);
+
+ for (i = 0, j = 0; i < n; i++) {
+ struct pipe_loader_device *pipe_dev = pipe_dev_list[i];
+ struct pipe_screen *screen = pipe_loader_create_screen(pipe_dev);
+ struct cuda_dev *dev;
+ if (!screen || !screen->get_param(screen, PIPE_CAP_COMPUTE)) {
+ if (screen) {
+ screen->destroy(screen);
+ }
+ pipe_loader_release(&pipe_dev, 1);
+ continue;
+ }
+
+ dev = malloc(sizeof(struct cuda_dev));
+ dev->pipe_dev = pipe_dev;
+ dev->screen = screen;
+ cuda_dev_list[j] = dev;
+ }
+
+ if (j == 0) {
+ free(pipe_dev_list);
+ free(cuda_dev_list);
+ pipe_dev_list = NULL;
+ cuda_dev_list = NULL;
+ }
+ return j;
+}
+
+static void init_primary_ctx_list(int n) {
+ primary_ctx_list = calloc(n, sizeof(struct cuda_ctx*));
+}
+
+static struct cuda_ctx *create_cuda_ctx(struct cuda_dev *dev) {
+ struct cuda_ctx *ctx = malloc(sizeof(struct cuda_ctx));
+ if (dev)
+ ctx->pipe_ctx = dev->screen->context_create(dev->screen, NULL,
+ PIPE_CONTEXT_COMPUTE_ONLY);
+ ctx->dev = dev;
+ memset(&ctx->exec_state, 0, sizeof(ctx->exec_state));
+ list_inithead(&ctx->exec_state.list);
+ memset(&ctx->buffers, 0, sizeof(ctx->buffers));
+ list_inithead(&ctx->buffers.list);
+ return ctx;
+}
+
+static struct cuda_ctx *get_ctx() {
+#if USE_FAKE_CONTEXT
+ if (!current_ctx) {
+ current_ctx = create_cuda_ctx(NULL);
+ }
+#endif
+ if (current_ctx) {
+ return current_ctx;
+ }
+
+ // There is no current context, so we need to use the primary context.
+ if (!cuda_dev_list) {
+ // Is this the best pla
+ int n = init_cuda_dev_list();
+ if (n == 0) {
+ return NULL;
+ }
+ init_primary_ctx_list(n);
+ }
+
+ if (!primary_ctx_list[current_dev]) {
+ primary_ctx_list[current_dev] = create_cuda_ctx(cuda_dev_list[current_dev]);
+ }
+
+ return primary_ctx_list[current_dev];
+}
+
/*
 * Return the most recently configured launch (cudaConfigureCall() pushes
 * onto the front of the list), or NULL when no context is available.
 *
 * NOTE(review): when no launch has been configured the list is empty and
 * list_first_entry() yields a pointer derived from the list head itself
 * rather than NULL.  __cudaRegisterFunction() currently relies on that
 * non-NULL result to proceed, so the empty-list case is deliberately left
 * unguarded here — callers must not dereference the result before a
 * cudaConfigureCall() has happened.
 */
static struct cuda_exec_state *get_exec_state() {
   struct cuda_ctx *ctx = get_ctx();
   if (!ctx) {
      return NULL;
   }
   return list_first_entry(&ctx->exec_state.list, struct cuda_exec_state, list);
}
+
+
+/******************************************************************************/
+/* Functions calls generated by clang */
+/******************************************************************************/
+
+cudaError_t cudaSetupArgument(const void* arg, size_t size, size_t offset);
+cudaError_t cudaLaunch(const void* func);
+void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun,
+ char *deviceFun, const char *deviceName,
+ int thread_limit, void *tid, void *bid,
+ void *bDim, void *gDim, int *wSize);
+void** __cudaRegisterFatBinary(void *fatCubin);
+extern void __cudaUnregisterFatBinary(void **fatCubinHandle);
+
+
+cudaError_t cudaSetupArgument(const void* arg, size_t size, size_t offset) {
+ CUDA_TRACE("arg = %p size = %ld offset = %ld)\n", arg, size, offset);
+ struct cuda_exec_state *exec_state = get_exec_state();
+ struct cuda_ctx *ctx = get_ctx();
+ if (!exec_state) {
+ return cudaErrorNoDevice;
+ }
+
+ if (offset + size > exec_state->arg_size) {
+ exec_state->arg_size *= 2;
+ if (!realloc(exec_state->args, exec_state->arg_size))
+ return cudaErrorUnknown;
+ }
+
+ list_for_each_entry(struct cuda_buffer, buf, &ctx->buffers.list, list) {
+ if (buf->res == *(void**)arg) {
+ CUDA_TRACE("Arg is a buffer: %p\n", *(void**)arg);
+ buf->offset = offset;
+ }
+ }
+ memcpy(exec_state->args + offset, arg, size);
+ return cudaSuccess;
+}
+
#if 0
// Disabled scratch code: dumps what is presumed to be the NV machine code
// for func_name by locating its .symtab entry and hex-printing the symbol's
// bytes straight out of the in-memory ELF image.
static void extract_binary(unsigned char *elf_bin, unsigned elf_size, const char *func_name) {
   Elf *elf = elf_memory(elf_bin, elf_size);
   size_t section_str_index;
   elf_getshdrstrndx(elf, &section_str_index);

   // Walk every section looking for the symbol table.
   for (Elf_Scn *section = elf_nextscn(elf, NULL); section; section = elf_nextscn(elf, section)) {
      GElf_Shdr header;
      if (gelf_getshdr(section, &header) != &header) {
         //XXX: Error here
      }

      if (strcmp(elf_strptr(elf, section_str_index, header.sh_name), ".symtab"))
         continue;

      Elf_Data *const symtab_data = elf_getdata(section, NULL);
      GElf_Sym symbol;
      GElf_Sym *s;
      unsigned i = 0;

      while ((s = gelf_getsym(symtab_data, i++, &symbol))) {
         Elf_Scn *symbol_scn;
         GElf_Shdr symbol_shdr;
         // sh_link of .symtab is the index of its associated string table.
         const char *name = elf_strptr(elf, header.sh_link, s->st_name);
         size_t symbol_offset;
         printf("func_name = %s symbol name = %s idx = %d\n", func_name, name, s->st_shndx);
         // FIXME: Not safe!
         if (strcmp(name, func_name))
            continue;

         // File offset of the symbol = its section's file offset + st_value.
         symbol_scn = elf_getscn(elf, s->st_shndx);
         gelf_getshdr(symbol_scn, &symbol_shdr);
         symbol_offset = symbol_shdr.sh_offset + s->st_value;
         // NOTE(review): %d does not match the 64-bit st_size / size_t types
         // below; harmless while this block stays #if 0'd out.
         printf("size = %d\n", s->st_size);
         printf("offset = %d\n", symbol_offset);
         for (int j = 0; j < s->st_size; j++) {
            printf("%x", *(elf_bin + symbol_offset + j));
         }
      }
   }

}
#endif
+
+
+static int read_arginfo(char *elf_bin, unsigned elf_size,
+ const char *func_name, struct cuda_arg_info *arg_info) {
+
+ Elf *elf = elf_memory(elf_bin, elf_size);
+ size_t section_str_index;
+ elf_getshdrstrndx(elf, &section_str_index);
+ int req_input_mem = 0;
+
+ for (Elf_Scn *section = elf_nextscn(elf, NULL); section; section = elf_nextscn(elf, section)) {
+ GElf_Shdr header;
+ Elf_Data *section_data = NULL;
+ unsigned int *data;
+ if (gelf_getshdr(section, &header) != &header) {
+ //XXX: Error here
+ }
+ const char *section_name =
+ elf_strptr(elf, section_str_index, header.sh_name);
+
+ if (strncmp(section_name, ".nv.info.", 9) ||
+ strcmp(section_name + 9, func_name))
+ continue;
+
+ section_data = elf_getdata(section, section_data);
+ data = section_data->d_buf;
+
+ // FIXME: I have no idea what the first 16 bytes are.
+ data +=4;
+
+ while (data != section_data->d_buf + header.sh_size) {
+ unsigned dword2, dword3;
+ struct cuda_arg_info *arg = malloc(sizeof(struct cuda_arg_info));
+ list_inithead(&arg->list);
+ list_add(&arg->list, &arg_info->list);
+
+ // FIXME: I have no idea what the first 8 bytes are.
+ data+=2;
+ dword2 = *data++;
+ dword3 = *data++;
+ arg->argno = dword2 & 0xffff;
+ arg->offset = dword2 >> 16;
+ // FIXME: Not sure about this one.
+ arg->size = 0x2 << ((dword3 >> 20) & 0xf);
+ req_input_mem += arg->size;
+ CUDA_TRACE("argno = %d offset = %d size = %d\n", arg->argno, arg->offset, arg->size);
+ }
+ }
+ return req_input_mem;
+}
+
+static int get_work_dim(dim3 block_dim, dim3 grid_dim) {
+ if (block_dim.z * grid_dim.z > 1)
+ return 3;
+ if (block_dim.y * grid_dim.y > 1)
+ return 2;
+ return 1;
+}
+
+cudaError_t cudaLaunch(const void* func) {
+ CUDA_TRACE("func = %p\n", func);
+ // FIXME: I think need to use func to look up which function to execute. We
+ // currently just execute the most recently registered function.
+ struct cuda_exec_state *exec_state = get_exec_state();
+ struct cuda_ctx *ctx = get_ctx();
+ struct cuda_arg_info arg_info;
+ if (!exec_state) {
+ return cudaErrorNoDevice;
+ }
+
+ char *nv_fatbin = global_fatbin.ptr;
+ //uint32_t nv_fatbin_magic = *(uint32_t*)(nv_fatbin);
+ //uint32_t nv_fatbin_version = *(uint32_t*)(nv_fatbin + 4);
+ //uint64_t nv_fatbin_size = *(uint64_t *)(nv_fatbin + 8);
+ char *fatbin = (nv_fatbin + 16);
+ //uint32_t fatbin_flags = *(uint32_t*)(fatbin);
+ uint32_t elf_offset = *(uint32_t*)(fatbin + 4);
+ char *elf = fatbin + elf_offset;
+ uint32_t elf_size = *(uint32_t*)(fatbin + 8);
+ char *ptx_bin = elf + elf_size;
+ //uint32_t unknown = *(uint32_t*)(ptx_bin);
+ uint32_t ptx_text_offset = *(uint32_t*)(ptx_bin + 4);
+ char *ptx_text = ptx_bin + ptx_text_offset;
+ uint32_t ptx_text_size = *(uint32_t*)(ptx_bin + 8);
+ struct pipe_compute_state compute_state;
+ struct pipe_grid_info info;
+
+ memset(&arg_info, 0, sizeof(arg_info));
+ list_inithead(&arg_info.list);
+ // FIXME: Should we be compiling ptx in __cudaRegisterFunction() ?
+
+ // TOOO: compile_ptx_to_isa();
+ for (unsigned i = 0; i < ptx_text_size; i++) {
+ printf("%c", ptx_text[i]);
+ }
+
+ // TODO: Create compute_state and bind
+ memset(&compute_state, 0, sizeof(compute_state));
+ compute_state.ir_type = PIPE_SHADER_IR_PTX;
+ compute_state.req_input_mem = read_arginfo(elf, elf_size, global_fatbin.dev_func_name, &arg_info);
+ compute_state.prog = malloc(ptx_text_size + 4);
+ *(uint32_t *)compute_state.prog = ptx_text_size;
+ memcpy(((uint32_t*)compute_state.prog) + 1, ptx_text, ptx_text_size);
+
+ info.work_dim = get_work_dim(exec_state->block_dim, exec_state->grid_dim);
+ memcpy(info.block, &exec_state->block_dim, sizeof(info.block));
+ memcpy(info.grid, &exec_state->grid_dim, sizeof(info.grid));
+ info.pc = 0;
+ info.input = exec_state->args;
+
+#if !USE_FAKE_CONTEXT
+ exec_state->compute_state =
+ ctx->pipe_ctx->create_compute_state(ctx->pipe_ctx, &compute_state);
+ ctx->pipe_ctx->bind_compute_state(ctx->pipe_ctx, exec_state->compute_state);
+ // TODO: bind_sampler_states
+ // TODO: set_sampler_views
+ // TODO: set_compute_resources
+ list_for_each_entry(struct cuda_buffer, buf, &ctx->buffers.list, list) {
+ char *arg_start = exec_state->args + buf->offset;
+ ctx->pipe_ctx->set_global_binding(ctx->pipe_ctx, 0, 1, &buf->res,
+ (uint32_t**)&arg_start);
+ }
+
+ CUDA_TRACE("dim = %d block_size = <%d, %d, %d> grid_size = <%d, %d, %d>"
+ " pc = %d input_size = %d\n", info.work_dim, info.block[0], info.block[1],
+ info.block[2], info.grid[0], info.grid[1], info.grid[2],
+ info.pc, compute_state.req_input_mem);
+
+ ctx->pipe_ctx->launch_grid(ctx->pipe_ctx, &info);
+ // TODO: UBNIND
+ ctx->pipe_ctx->memory_barrier(ctx->pipe_ctx, PIPE_BARRIER_GLOBAL_BUFFER);
+#else
+ list_for_each_entry(struct cuda_buffer, buf, &ctx->buffers.list, list) {
+ char *arg_start = exec_state->args + buf->offset;
+ *(struct pipe_resource**)arg_start = buf->res;
+ }
+
+ CUDA_TRACE("dim = %d block_size = <%d, %d, %d> grid_size = <%d, %d, %d>"
+ " pc = %d input_size = %d\n", info.work_dim, info.block[0], info.block[1],
+ info.block[2], info.grid[0], info.grid[1], info.grid[2],
+ info.pc, compute_state.req_input_mem);
+
+ CUDA_TRACE_NOARG("Inputs: \n");
+ for (unsigned i = 0; i < compute_state.req_input_mem; i+=4) {
+ CUDA_TRACE("Dword %d: %x\n", i / 4, *(int*)(exec_state->args + i));
+ }
+#endif
+
+ return cudaSuccess;
+}
+
+void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun,
+ char *deviceFun, const char *deviceName,
+ int thread_limit, void *tid, void *bid,
+ void *bDim, void *gDim, int *wSize) {
+ CUDA_TRACE("fatCubinHandle = %p hostFun = %s "
+ "deviceFun = %s deviceName = %s thread_limit = %d "
+ "tid = %p bid = %p bDim = %p gDim = %p wSize = %p\n",
+ fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit,
+ tid, bid, bDim, gDim, wSize);
+ struct cuda_exec_state *exec_state = get_exec_state();
+ if (!exec_state) {
+ return;
+ }
+ global_fatbin.dev_func_name = deviceFun;
+}
+
+void** __cudaRegisterFatBinary(void *fatCubin) {
+ CUDA_TRACE("fatCubin = %p\n", fatCubin);
+ uint32_t magic = *(uint32_t*)fatCubin;
+ uint32_t version = *(uint32_t*)(fatCubin + 4);
+ char *nv_fatbin = *(char**)(fatCubin + 8);
+
+ CUDA_TRACE("magic = %x version = %x nv_fatbin = %p\n", magic, version, nv_fatbin);
+ global_fatbin.ptr = nv_fatbin;
+
+ // FIXME: What should we return here?
+ return (void**)&global_fatbin.ptr;
+}
+
+
/* Nothing is torn down yet — the global fatbin pointer stays registered;
 * this entry point only traces the call. */
void __cudaUnregisterFatBinary(void **fatCubinHandle) {
   CUDA_TRACE("fatCubinHandle = %p\n", fatCubinHandle);
}
+
+/******************************************************************************/
+/* Functions calls called by users */
+/******************************************************************************/
+cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
+cudaError_t cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem /*__dv(0)*/, cudaStream_t stream /*__dv(0)*/);
+cudaError_t cudaMalloc(void **devPtr, size_t size);
+cudaError_t cudaDeviceSynchronize(void);
+cudaError_t cudaDeviceReset(void);
+
+cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) {
+ CUDA_TRACE("dst = %p src = %p count = %zd kind = %d\n", dst, src, count, kind);
+ struct cuda_ctx *ctx = get_ctx();
+ struct pipe_transfer *transfer_src = NULL, *transfer_dst = NULL;
+ struct pipe_box box;
+
+ if (!ctx || !ctx->dev) {
+ return cudaErrorNoDevice;
+ }
+
+ memset(&box, 0, sizeof(box));
+ box.width = count;
+ box.height = 1;
+ box.depth = 1;
+
+ if (kind == cudaMemcpyHostToDevice || kind == cudaMemcpyDeviceToDevice) {
+ dst = ctx->pipe_ctx->transfer_map(ctx->pipe_ctx,
+ (struct pipe_resource *)dst,
+ 0, PIPE_TRANSFER_WRITE, &box,
+ &transfer_dst);
+ if (!dst) {
+ return cudaErrorInvalidDevicePointer;
+ }
+ }
+
+ if (kind == cudaMemcpyDeviceToHost || kind == cudaMemcpyDeviceToDevice) {
+ src = ctx->pipe_ctx->transfer_map(ctx->pipe_ctx,
+ (struct pipe_resource *)src,
+ 0, PIPE_TRANSFER_READ, &box,
+ &transfer_src);
+ if (!src) {
+ return cudaErrorInvalidDevicePointer;
+ }
+ }
+
+ memcpy(dst, src, count);
+
+ if (transfer_src) {
+ ctx->pipe_ctx->transfer_unmap(ctx->pipe_ctx, transfer_src);
+ }
+
+ if (transfer_dst) {
+ ctx->pipe_ctx->transfer_unmap(ctx->pipe_ctx, transfer_dst);
+ }
+
+ return cudaSuccess;
+}
+
+cudaError_t cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem , cudaStream_t stream) {
+ CUDA_TRACE("gridDim = <%d, %d, %d> blockDim = <%d, %d, %d> sharedMem = %zd stream = ?\n",
+ gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
+ sharedMem);
+ struct cuda_ctx *ctx = get_ctx();
+ struct cuda_exec_state *state;
+
+ if (!ctx) {
+ return cudaErrorNoDevice;
+ }
+
+ state = malloc(sizeof(struct cuda_exec_state));
+ state->grid_dim = gridDim;
+ state->block_dim = blockDim;
+ state->shared_mem = sharedMem;
+ state->stream = stream;
+ state->arg_size = 1024;
+ state->args = malloc(state->arg_size);
+ list_inithead(&state->list);
+
+ list_add(&state->list, &ctx->exec_state.list);
+ return 0;
+}
+
+cudaError_t cudaMalloc(void **devPtr, size_t size) {
+ CUDA_TRACE("devPtr = %p size = %ld\n", devPtr, size);
+ struct cuda_ctx *ctx = get_ctx();
+ struct pipe_resource info;
+ struct pipe_resource *res;
+ struct cuda_buffer *buf;
+
+ if (!ctx) {
+ return cudaErrorNoDevice;
+ }
+
+ info.width0 = size;
+ info.height0 = 1;
+ info.depth0 = 1;
+ info.array_size = 1;
+ info.target = PIPE_BUFFER;
+ info.bind = PIPE_BIND_SAMPLER_VIEW |
+ PIPE_BIND_COMPUTE_RESOURCE |
+ PIPE_BIND_GLOBAL;
+
+#if USE_FAKE_CONTEXT
+ (void)info;
+ res = malloc(sizeof(struct pipe_resource));
+#else
+ res = ctx->dev->screen->resource_create(ctx->dev->screen, &info);
+ if (!res) {
+ return cudaErrorMemoryAllocation;
+ }
+#endif
+
+ buf = malloc(sizeof(struct cuda_buffer));
+ buf->res = res;
+ buf->offset = UNUSED_BUFFER_OFFSET;
+ list_add(&buf->list, &ctx->buffers.list);
+
+ *devPtr = res;
+
+ return cudaSuccess;
+}
+
+cudaError_t cudaDeviceSynchronize(void) {
+ CUDA_TRACE_NOARG("\n");
+ return 0;
+}
+
+cudaError_t cudaDeviceReset(void) {
+ CUDA_TRACE_NOARG("\n");
+ return 0;
+}