diff options
Diffstat (limited to 'src/gallium/state_trackers/guda/guda.c')
-rw-r--r-- | src/gallium/state_trackers/guda/guda.c | 575 |
1 file changed, 575 insertions, 0 deletions
diff --git a/src/gallium/state_trackers/guda/guda.c b/src/gallium/state_trackers/guda/guda.c new file mode 100644 index 00000000000..92dc662bcb9 --- /dev/null +++ b/src/gallium/state_trackers/guda/guda.c @@ -0,0 +1,575 @@ + +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> + +#include <libelf.h> +#include <gelf.h> + +#include <crt/host_runtime.h> +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "pipe/p_state.h" +#include "pipe-loader/pipe_loader.h" +#include "util/list.h" + +/******************************************************************************/ +/* State tracker internal functions / global state */ +/******************************************************************************/ + +#define USE_FAKE_CONTEXT 0 + +#define UNUSED_BUFFER_OFFSET ~0 +struct cuda_ctx; + +struct cuda_dev { + struct pipe_loader_device *pipe_dev; + struct pipe_screen *screen; +}; + +struct cuda_buffer { + struct list_head list; + unsigned offset; + struct pipe_resource *res; +}; + +enum cuda_arg_type { + CUDA_ARG_SCALAR, + CUDA_ARG_PTR +}; + +struct cuda_arg_info { + struct list_head list; + enum cuda_arg_type type; + unsigned argno; + unsigned offset; + unsigned size; +}; + +struct cuda_exec_state { + struct list_head list; + dim3 grid_dim; + dim3 block_dim; + size_t shared_mem; + cudaStream_t stream; + char *args; + unsigned arg_size; + struct cuda_arg_info arg_info; + void *compute_state; +}; + +struct cuda_ctx { + struct pipe_context *pipe_ctx; + struct cuda_dev *dev; + struct cuda_buffer buffers; + struct cuda_exec_state exec_state; +}; + +// FIXME Support more than one device function; +struct cuda_fatbin { + char *ptr; + char *dev_func_name; +} global_fatbin; + +struct cuda_dev **cuda_dev_list = NULL; + +// FIXME primary_ctx should be shared across all CPU threads. +struct cuda_ctx **primary_ctx_list = NULL; +struct cuda_ctx *current_ctx = NULL; +size_t current_dev = 0; + +#define CUDA_TRACE(fmt, ...) 
printf("%s: "fmt, __PRETTY_FUNCTION__, __VA_ARGS__) +#define CUDA_TRACE_NOARG(fmt) printf("%s: "fmt, __PRETTY_FUNCTION__) + +static int init_cuda_dev_list() { + struct pipe_loader_device **pipe_dev_list; + int n = pipe_loader_probe(NULL, 0); + int i,j; + + if (n == 0) { + return 0; + } + + pipe_dev_list = malloc(n * sizeof(struct pipe_loader_device*)); + cuda_dev_list = malloc(n * sizeof(struct cuda_dev*)); + + n = pipe_loader_probe(pipe_dev_list, n); + + for (i = 0, j = 0; i < n; i++) { + struct pipe_loader_device *pipe_dev = pipe_dev_list[i]; + struct pipe_screen *screen = pipe_loader_create_screen(pipe_dev); + struct cuda_dev *dev; + if (!screen || !screen->get_param(screen, PIPE_CAP_COMPUTE)) { + if (screen) { + screen->destroy(screen); + } + pipe_loader_release(&pipe_dev, 1); + continue; + } + + dev = malloc(sizeof(struct cuda_dev)); + dev->pipe_dev = pipe_dev; + dev->screen = screen; + cuda_dev_list[j] = dev; + } + + if (j == 0) { + free(pipe_dev_list); + free(cuda_dev_list); + pipe_dev_list = NULL; + cuda_dev_list = NULL; + } + return j; +} + +static void init_primary_ctx_list(int n) { + primary_ctx_list = calloc(n, sizeof(struct cuda_ctx*)); +} + +static struct cuda_ctx *create_cuda_ctx(struct cuda_dev *dev) { + struct cuda_ctx *ctx = malloc(sizeof(struct cuda_ctx)); + if (dev) + ctx->pipe_ctx = dev->screen->context_create(dev->screen, NULL, + PIPE_CONTEXT_COMPUTE_ONLY); + ctx->dev = dev; + memset(&ctx->exec_state, 0, sizeof(ctx->exec_state)); + list_inithead(&ctx->exec_state.list); + memset(&ctx->buffers, 0, sizeof(ctx->buffers)); + list_inithead(&ctx->buffers.list); + return ctx; +} + +static struct cuda_ctx *get_ctx() { +#if USE_FAKE_CONTEXT + if (!current_ctx) { + current_ctx = create_cuda_ctx(NULL); + } +#endif + if (current_ctx) { + return current_ctx; + } + + // There is no current context, so we need to use the primary context. 
+ if (!cuda_dev_list) { + // Is this the best pla + int n = init_cuda_dev_list(); + if (n == 0) { + return NULL; + } + init_primary_ctx_list(n); + } + + if (!primary_ctx_list[current_dev]) { + primary_ctx_list[current_dev] = create_cuda_ctx(cuda_dev_list[current_dev]); + } + + return primary_ctx_list[current_dev]; +} + +static struct cuda_exec_state *get_exec_state() { + struct cuda_ctx *ctx = get_ctx(); + if (!ctx) { + return NULL; + } + return list_first_entry(&ctx->exec_state.list, struct cuda_exec_state, list); +} + + +/******************************************************************************/ +/* Functions calls generated by clang */ +/******************************************************************************/ + +cudaError_t cudaSetupArgument(const void* arg, size_t size, size_t offset); +cudaError_t cudaLaunch(const void* func); +void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, + char *deviceFun, const char *deviceName, + int thread_limit, void *tid, void *bid, + void *bDim, void *gDim, int *wSize); +void** __cudaRegisterFatBinary(void *fatCubin); +extern void __cudaUnregisterFatBinary(void **fatCubinHandle); + + +cudaError_t cudaSetupArgument(const void* arg, size_t size, size_t offset) { + CUDA_TRACE("arg = %p size = %ld offset = %ld)\n", arg, size, offset); + struct cuda_exec_state *exec_state = get_exec_state(); + struct cuda_ctx *ctx = get_ctx(); + if (!exec_state) { + return cudaErrorNoDevice; + } + + if (offset + size > exec_state->arg_size) { + exec_state->arg_size *= 2; + if (!realloc(exec_state->args, exec_state->arg_size)) + return cudaErrorUnknown; + } + + list_for_each_entry(struct cuda_buffer, buf, &ctx->buffers.list, list) { + if (buf->res == *(void**)arg) { + CUDA_TRACE("Arg is a buffer: %p\n", *(void**)arg); + buf->offset = offset; + } + } + memcpy(exec_state->args + offset, arg, size); + return cudaSuccess; +} + +#if 0 +// This function extrace what I think is NV ISA for the given function +static void 
extract_binary(unsigned char *elf_bin, unsigned elf_size, const char *func_name) {
   Elf *elf = elf_memory((char *)elf_bin, elf_size);
   size_t section_str_index;
   elf_getshdrstrndx(elf, &section_str_index);

   for (Elf_Scn *section = elf_nextscn(elf, NULL); section; section = elf_nextscn(elf, section)) {
      GElf_Shdr header;
      if (gelf_getshdr(section, &header) != &header) {
         //XXX: Error here
      }

      if (strcmp(elf_strptr(elf, section_str_index, header.sh_name), ".symtab"))
         continue;

      Elf_Data *const symtab_data = elf_getdata(section, NULL);
      GElf_Sym symbol;
      GElf_Sym *s;
      unsigned i = 0;

      while ((s = gelf_getsym(symtab_data, i++, &symbol))) {
         Elf_Scn *symbol_scn;
         GElf_Shdr symbol_shdr;
         const char *name = elf_strptr(elf, header.sh_link, s->st_name);
         size_t symbol_offset;
         printf("func_name = %s symbol name = %s idx = %d\n", func_name, name, s->st_shndx);
         // FIXME: Not safe!
         if (strcmp(name, func_name))
            continue;

         symbol_scn = elf_getscn(elf, s->st_shndx);
         gelf_getshdr(symbol_scn, &symbol_shdr);
         symbol_offset = symbol_shdr.sh_offset + s->st_value;
         printf("size = %d\n", s->st_size);
         printf("offset = %d\n", symbol_offset);
         for (int j = 0; j < s->st_size; j++) {
            printf("%x", *(elf_bin + symbol_offset + j));
         }
      }
   }

}
#endif


/* Parse the ".nv.info.<func_name>" section of the fatbin ELF and append
 * one cuda_arg_info entry per kernel argument to `arg_info`.  Returns the
 * total argument block size in bytes (used as req_input_mem).
 * NOTE(review): the 16-byte header and per-entry layout were reverse
 * engineered — see the FIXMEs below. */
static int read_arginfo(char *elf_bin, unsigned elf_size,
                        const char *func_name, struct cuda_arg_info *arg_info) {

   Elf *elf = elf_memory(elf_bin, elf_size);
   size_t section_str_index;
   elf_getshdrstrndx(elf, &section_str_index);
   int req_input_mem = 0;

   for (Elf_Scn *section = elf_nextscn(elf, NULL); section; section = elf_nextscn(elf, section)) {
      GElf_Shdr header;
      Elf_Data *section_data = NULL;
      unsigned int *data;
      if (gelf_getshdr(section, &header) != &header) {
         //XXX: Error here
      }
      const char *section_name =
         elf_strptr(elf, section_str_index, header.sh_name);

      if (strncmp(section_name, ".nv.info.", 9) ||
          strcmp(section_name + 9, func_name))
         continue;

      section_data = elf_getdata(section, section_data);
      data = section_data->d_buf;

      // FIXME: I have no idea what the first 16 bytes are.
      data += 4;

      /* '<' rather than '!=' so a malformed section size cannot step the
       * cursor past the end and loop forever. */
      while ((char *)data < (char *)section_data->d_buf + header.sh_size) {
         unsigned dword2, dword3;
         struct cuda_arg_info *arg = malloc(sizeof(struct cuda_arg_info));
         if (!arg)
            break;
         list_inithead(&arg->list);
         list_add(&arg->list, &arg_info->list);

         // FIXME: I have no idea what the first 8 bytes are.
         data += 2;
         dword2 = *data++;
         dword3 = *data++;
         // FIXME: arg->type is never filled in.
         arg->argno = dword2 & 0xffff;
         arg->offset = dword2 >> 16;
         // FIXME: Not sure about this one.
         arg->size = 0x2 << ((dword3 >> 20) & 0xf);
         req_input_mem += arg->size;
         CUDA_TRACE("argno = %d offset = %d size = %d\n", arg->argno, arg->offset, arg->size);
      }
   }

   /* The original leaked the libelf handle. */
   elf_end(elf);
   return req_input_mem;
}

/* Smallest work dimensionality that still covers the launch. */
static int get_work_dim(dim3 block_dim, dim3 grid_dim) {
   if (block_dim.z * grid_dim.z > 1)
      return 3;
   if (block_dim.y * grid_dim.y > 1)
      return 2;
   return 1;
}

cudaError_t cudaLaunch(const void* func) {
   CUDA_TRACE("func = %p\n", func);
   // FIXME: I think need to use func to look up which function to execute. We
   // currently just execute the most recently registered function.
+ struct cuda_exec_state *exec_state = get_exec_state(); + struct cuda_ctx *ctx = get_ctx(); + struct cuda_arg_info arg_info; + if (!exec_state) { + return cudaErrorNoDevice; + } + + char *nv_fatbin = global_fatbin.ptr; + //uint32_t nv_fatbin_magic = *(uint32_t*)(nv_fatbin); + //uint32_t nv_fatbin_version = *(uint32_t*)(nv_fatbin + 4); + //uint64_t nv_fatbin_size = *(uint64_t *)(nv_fatbin + 8); + char *fatbin = (nv_fatbin + 16); + //uint32_t fatbin_flags = *(uint32_t*)(fatbin); + uint32_t elf_offset = *(uint32_t*)(fatbin + 4); + char *elf = fatbin + elf_offset; + uint32_t elf_size = *(uint32_t*)(fatbin + 8); + char *ptx_bin = elf + elf_size; + //uint32_t unknown = *(uint32_t*)(ptx_bin); + uint32_t ptx_text_offset = *(uint32_t*)(ptx_bin + 4); + char *ptx_text = ptx_bin + ptx_text_offset; + uint32_t ptx_text_size = *(uint32_t*)(ptx_bin + 8); + struct pipe_compute_state compute_state; + struct pipe_grid_info info; + + memset(&arg_info, 0, sizeof(arg_info)); + list_inithead(&arg_info.list); + // FIXME: Should we be compiling ptx in __cudaRegisterFunction() ? 
+ + // TOOO: compile_ptx_to_isa(); + for (unsigned i = 0; i < ptx_text_size; i++) { + printf("%c", ptx_text[i]); + } + + // TODO: Create compute_state and bind + memset(&compute_state, 0, sizeof(compute_state)); + compute_state.ir_type = PIPE_SHADER_IR_PTX; + compute_state.req_input_mem = read_arginfo(elf, elf_size, global_fatbin.dev_func_name, &arg_info); + compute_state.prog = malloc(ptx_text_size + 4); + *(uint32_t *)compute_state.prog = ptx_text_size; + memcpy(((uint32_t*)compute_state.prog) + 1, ptx_text, ptx_text_size); + + info.work_dim = get_work_dim(exec_state->block_dim, exec_state->grid_dim); + memcpy(info.block, &exec_state->block_dim, sizeof(info.block)); + memcpy(info.grid, &exec_state->grid_dim, sizeof(info.grid)); + info.pc = 0; + info.input = exec_state->args; + +#if !USE_FAKE_CONTEXT + exec_state->compute_state = + ctx->pipe_ctx->create_compute_state(ctx->pipe_ctx, &compute_state); + ctx->pipe_ctx->bind_compute_state(ctx->pipe_ctx, exec_state->compute_state); + // TODO: bind_sampler_states + // TODO: set_sampler_views + // TODO: set_compute_resources + list_for_each_entry(struct cuda_buffer, buf, &ctx->buffers.list, list) { + char *arg_start = exec_state->args + buf->offset; + ctx->pipe_ctx->set_global_binding(ctx->pipe_ctx, 0, 1, &buf->res, + (uint32_t**)&arg_start); + } + + CUDA_TRACE("dim = %d block_size = <%d, %d, %d> grid_size = <%d, %d, %d>" + " pc = %d input_size = %d\n", info.work_dim, info.block[0], info.block[1], + info.block[2], info.grid[0], info.grid[1], info.grid[2], + info.pc, compute_state.req_input_mem); + + ctx->pipe_ctx->launch_grid(ctx->pipe_ctx, &info); + // TODO: UBNIND + ctx->pipe_ctx->memory_barrier(ctx->pipe_ctx, PIPE_BARRIER_GLOBAL_BUFFER); +#else + list_for_each_entry(struct cuda_buffer, buf, &ctx->buffers.list, list) { + char *arg_start = exec_state->args + buf->offset; + *(struct pipe_resource**)arg_start = buf->res; + } + + CUDA_TRACE("dim = %d block_size = <%d, %d, %d> grid_size = <%d, %d, %d>" + " pc = %d input_size 
= %d\n", info.work_dim, info.block[0], info.block[1], + info.block[2], info.grid[0], info.grid[1], info.grid[2], + info.pc, compute_state.req_input_mem); + + CUDA_TRACE_NOARG("Inputs: \n"); + for (unsigned i = 0; i < compute_state.req_input_mem; i+=4) { + CUDA_TRACE("Dword %d: %x\n", i / 4, *(int*)(exec_state->args + i)); + } +#endif + + return cudaSuccess; +} + +void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, + char *deviceFun, const char *deviceName, + int thread_limit, void *tid, void *bid, + void *bDim, void *gDim, int *wSize) { + CUDA_TRACE("fatCubinHandle = %p hostFun = %s " + "deviceFun = %s deviceName = %s thread_limit = %d " + "tid = %p bid = %p bDim = %p gDim = %p wSize = %p\n", + fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, + tid, bid, bDim, gDim, wSize); + struct cuda_exec_state *exec_state = get_exec_state(); + if (!exec_state) { + return; + } + global_fatbin.dev_func_name = deviceFun; +} + +void** __cudaRegisterFatBinary(void *fatCubin) { + CUDA_TRACE("fatCubin = %p\n", fatCubin); + uint32_t magic = *(uint32_t*)fatCubin; + uint32_t version = *(uint32_t*)(fatCubin + 4); + char *nv_fatbin = *(char**)(fatCubin + 8); + + CUDA_TRACE("magic = %x version = %x nv_fatbin = %p\n", magic, version, nv_fatbin); + global_fatbin.ptr = nv_fatbin; + + // FIXME: What should we return here? 
+ return (void**)&global_fatbin.ptr; +} + + +void __cudaUnregisterFatBinary(void **fatCubinHandle) { + CUDA_TRACE("fatCubinHandle = %p\n", fatCubinHandle); +} + +/******************************************************************************/ +/* Functions calls called by users */ +/******************************************************************************/ +cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); +cudaError_t cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem /*__dv(0)*/, cudaStream_t stream /*__dv(0)*/); +cudaError_t cudaMalloc(void **devPtr, size_t size); +cudaError_t cudaDeviceSynchronize(void); +cudaError_t cudaDeviceReset(void); + +cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { + CUDA_TRACE("dst = %p src = %p count = %zd kind = %d\n", dst, src, count, kind); + struct cuda_ctx *ctx = get_ctx(); + struct pipe_transfer *transfer_src = NULL, *transfer_dst = NULL; + struct pipe_box box; + + if (!ctx || !ctx->dev) { + return cudaErrorNoDevice; + } + + memset(&box, 0, sizeof(box)); + box.width = count; + box.height = 1; + box.depth = 1; + + if (kind == cudaMemcpyHostToDevice || kind == cudaMemcpyDeviceToDevice) { + dst = ctx->pipe_ctx->transfer_map(ctx->pipe_ctx, + (struct pipe_resource *)dst, + 0, PIPE_TRANSFER_WRITE, &box, + &transfer_dst); + if (!dst) { + return cudaErrorInvalidDevicePointer; + } + } + + if (kind == cudaMemcpyDeviceToHost || kind == cudaMemcpyDeviceToDevice) { + src = ctx->pipe_ctx->transfer_map(ctx->pipe_ctx, + (struct pipe_resource *)src, + 0, PIPE_TRANSFER_READ, &box, + &transfer_src); + if (!src) { + return cudaErrorInvalidDevicePointer; + } + } + + memcpy(dst, src, count); + + if (transfer_src) { + ctx->pipe_ctx->transfer_unmap(ctx->pipe_ctx, transfer_src); + } + + if (transfer_dst) { + ctx->pipe_ctx->transfer_unmap(ctx->pipe_ctx, transfer_dst); + } + + return cudaSuccess; +} + +cudaError_t cudaConfigureCall(dim3 gridDim, dim3 
blockDim, size_t sharedMem , cudaStream_t stream) { + CUDA_TRACE("gridDim = <%d, %d, %d> blockDim = <%d, %d, %d> sharedMem = %zd stream = ?\n", + gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, + sharedMem); + struct cuda_ctx *ctx = get_ctx(); + struct cuda_exec_state *state; + + if (!ctx) { + return cudaErrorNoDevice; + } + + state = malloc(sizeof(struct cuda_exec_state)); + state->grid_dim = gridDim; + state->block_dim = blockDim; + state->shared_mem = sharedMem; + state->stream = stream; + state->arg_size = 1024; + state->args = malloc(state->arg_size); + list_inithead(&state->list); + + list_add(&state->list, &ctx->exec_state.list); + return 0; +} + +cudaError_t cudaMalloc(void **devPtr, size_t size) { + CUDA_TRACE("devPtr = %p size = %ld\n", devPtr, size); + struct cuda_ctx *ctx = get_ctx(); + struct pipe_resource info; + struct pipe_resource *res; + struct cuda_buffer *buf; + + if (!ctx) { + return cudaErrorNoDevice; + } + + info.width0 = size; + info.height0 = 1; + info.depth0 = 1; + info.array_size = 1; + info.target = PIPE_BUFFER; + info.bind = PIPE_BIND_SAMPLER_VIEW | + PIPE_BIND_COMPUTE_RESOURCE | + PIPE_BIND_GLOBAL; + +#if USE_FAKE_CONTEXT + (void)info; + res = malloc(sizeof(struct pipe_resource)); +#else + res = ctx->dev->screen->resource_create(ctx->dev->screen, &info); + if (!res) { + return cudaErrorMemoryAllocation; + } +#endif + + buf = malloc(sizeof(struct cuda_buffer)); + buf->res = res; + buf->offset = UNUSED_BUFFER_OFFSET; + list_add(&buf->list, &ctx->buffers.list); + + *devPtr = res; + + return cudaSuccess; +} + +cudaError_t cudaDeviceSynchronize(void) { + CUDA_TRACE_NOARG("\n"); + return 0; +} + +cudaError_t cudaDeviceReset(void) { + CUDA_TRACE_NOARG("\n"); + return 0; +} |