author    Pan Xiuli <xiuli.pan@intel.com>    2016-03-16 07:16:50 +0800
committer Yang Rong <rong.r.yang@intel.com>  2016-11-08 20:38:22 +0800
commit    b8e07f6f6ff63e4d34e9d49f224ae123fbd043cb (patch)
tree      c8db4fcc8fa8a24eb2ccfef534f545c1d7e59654
parent    d182f461268b415b500278bb5d788ae906d8df93 (diff)
Runtime: Add support for non-uniform group size
Enqueue multiple times if the size is not uniform: at most 2 times for 1D, 4 times for 2D and 8 times for 3D. The work-dim offset of the walker in the batch buffer is used to keep the work groups in series.

TODO: handle events for the flush between multiple enqueues

Signed-off-by: Pan Xiuli <xiuli.pan@intel.com>
Reviewed-by: Yang Rong <rong.r.yang@intel.com>
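For reference, a minimal standalone sketch of the splitting scheme described above; split_ndrange, its printf output and the 100x35 sample sizes are illustrative and not part of this patch. Each dimension contributes a divisible part (global/local*local) and a remainder (global%local), giving at most 2^work_dim uniform sub-enqueues; corners whose size is zero in any dimension are skipped.

    /* A minimal standalone sketch (not the driver code) of the splitting scheme. */
    #include <stdio.h>
    #include <stddef.h>

    static void split_ndrange(int work_dim, const size_t global[3], const size_t local[3])
    {
      size_t div[3], rem[3];
      int i, j, k;
      for (i = 0; i < 3; i++) {
        div[i] = global[i] / local[i] * local[i]; /* uniform part   */
        rem[i] = global[i] % local[i];            /* remainder part */
      }
      for (i = 0; i < (work_dim > 2 ? 2 : 1); i++)
        for (j = 0; j < (work_dim > 1 ? 2 : 1); j++)
          for (k = 0; k < 2; k++) {
            const size_t sz[3] = { k ? rem[0] : div[0],
                                   j ? rem[1] : div[1],
                                   i ? rem[2] : div[2] };
            if (sz[0] == 0 || sz[1] == 0 || sz[2] == 0)
              continue; /* nothing left to enqueue in this corner */
            printf("enqueue sub-range %zu x %zu x %zu\n", sz[0], sz[1], sz[2]);
          }
    }

    int main(void)
    {
      const size_t global[3] = {100, 35, 1};
      const size_t local[3]  = {16, 8, 1};
      /* prints four sub-ranges: 96x32x1, 4x32x1, 96x3x1 and 4x3x1 */
      split_ndrange(2, global, local);
      return 0;
    }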
-rw-r--r--  src/cl_api_kernel.c          13
-rw-r--r--  src/cl_command_queue.c       64
-rw-r--r--  src/cl_command_queue_gen7.c  19
-rw-r--r--  src/cl_driver.h               1
-rw-r--r--  src/intel/intel_gpgpu.c      14
5 files changed, 81 insertions, 30 deletions
diff --git a/src/cl_api_kernel.c b/src/cl_api_kernel.c
index 1fd687bb..70140b23 100644
--- a/src/cl_api_kernel.c
+++ b/src/cl_api_kernel.c
@@ -89,19 +89,6 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
}
}
- /* Local sizes must be non-null and divide global sizes */
- if (local_work_size != NULL) {
- for (i = 0; i < work_dim; ++i) {
- if (UNLIKELY(local_work_size[i] == 0 || global_work_size[i] % local_work_size[i])) {
- err = CL_INVALID_WORK_GROUP_SIZE;
- break;
- }
- }
- if (err != CL_SUCCESS) {
- break;
- }
- }
-
/* Queue and kernel must share the same context */
assert(kernel->program);
if (command_queue->ctx != kernel->program->ctx) {
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 5d527152..06e40a62 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -209,7 +209,8 @@ cl_command_queue_bind_exec_info(cl_command_queue queue, cl_kernel k, cl_gpgpu gp
}
extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, cl_event,
- uint32_t, const size_t *, const size_t *, const size_t *);
+ uint32_t, const size_t *, const size_t *,const size_t *,
+ const size_t *, const size_t *, const size_t *);
static cl_int
cl_kernel_check_args(cl_kernel k)
@@ -222,6 +223,62 @@ cl_kernel_check_args(cl_kernel k)
}
LOCAL cl_int
+cl_command_queue_ND_range_wrap(cl_command_queue queue,
+ cl_kernel ker,
+ cl_event event,
+ const uint32_t work_dim,
+ const size_t *global_wk_off,
+ const size_t *global_wk_sz,
+ const size_t *local_wk_sz)
+{
+ /* Used for non-uniform work group sizes */
+ cl_int err = CL_SUCCESS;
+ int i,j,k;
+ const size_t global_wk_sz_div[3] = {
+ global_wk_sz[0]/local_wk_sz[0]*local_wk_sz[0],
+ global_wk_sz[1]/local_wk_sz[1]*local_wk_sz[1],
+ global_wk_sz[2]/local_wk_sz[2]*local_wk_sz[2]
+ };
+
+ const size_t global_wk_sz_rem[3] = {
+ global_wk_sz[0]%local_wk_sz[0],
+ global_wk_sz[1]%local_wk_sz[1],
+ global_wk_sz[2]%local_wk_sz[2]
+ };
+
+ const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem};
+ /* Go through the at most 8 cases and enqueue if there are work items left */
+ for(i = 0; i < 2;i++) {
+ for(j = 0; j < 2;j++) {
+ for(k = 0; k < 2; k++) {
+ size_t global_wk_sz_use[3] = {global_wk_all[k][0],global_wk_all[j][1],global_wk_all[i][2]};
+ size_t global_dim_off[3] = {
+ k * global_wk_sz_div[0] / local_wk_sz[0],
+ j * global_wk_sz_div[1] / local_wk_sz[1],
+ i * global_wk_sz_div[2] / local_wk_sz[2]
+ };
+ size_t local_wk_sz_use[3] = {
+ k ? global_wk_sz_rem[0] : local_wk_sz[0],
+ j ? global_wk_sz_rem[1] : local_wk_sz[1],
+ i ? global_wk_sz_rem[2] : local_wk_sz[2]
+ };
+ if(local_wk_sz_use[0] == 0 || local_wk_sz_use[1] == 0 || local_wk_sz_use[2] == 0) continue;
+ TRY (cl_command_queue_ND_range_gen7, queue, ker, event, work_dim, global_wk_off,global_dim_off, global_wk_sz,global_wk_sz_use,local_wk_sz, local_wk_sz_use);
+ /* TODO: need to handle events for multiple enqueues; for now this is a workaround for the uniform group size case */
+ if(!(global_wk_sz_rem[0] == 0 && global_wk_sz_rem[1] == 0 && global_wk_sz_rem[2] == 0))
+ err = cl_command_queue_wait_flush(queue);
+ }
+ if(work_dim < 2)
+ break;
+ }
+ if(work_dim < 3)
+ break;
+ }
+error:
+ return err;
+}
+
+LOCAL cl_int
cl_command_queue_ND_range(cl_command_queue queue,
cl_kernel k,
cl_event event,
@@ -240,8 +297,9 @@ cl_command_queue_ND_range(cl_command_queue queue,
if (ver == 7 || ver == 75 || ver == 8 || ver == 9)
- TRY (cl_command_queue_ND_range_gen7, queue, k, event,
- work_dim, global_wk_off, global_wk_sz, local_wk_sz);
+ //TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
+ TRY (cl_command_queue_ND_range_wrap, queue, k, event, work_dim,
+ global_wk_off, global_wk_sz, local_wk_sz);
else
FATAL ("Unknown Gen Device");
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 49c7ea77..44873603 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -240,9 +240,9 @@ cl_curbe_fill(cl_kernel ker,
UPLOAD(GBE_CURBE_GLOBAL_OFFSET_X, global_wk_off[0]);
UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Y, global_wk_off[1]);
UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Z, global_wk_off[2]);
- UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0]/local_wk_sz[0]);
- UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1]/local_wk_sz[1]);
- UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
+ UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0] / enqueued_local_wk_sz[0] + (global_wk_sz[0]%enqueued_local_wk_sz[0]?1:0));
+ UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1] / enqueued_local_wk_sz[1] + (global_wk_sz[1]%enqueued_local_wk_sz[1]?1:0));
+ UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2] / enqueued_local_wk_sz[2] + (global_wk_sz[2]%enqueued_local_wk_sz[2]?1:0));
UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
#undef UPLOAD
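The GROUP_NUM_* uploads above round the group count up so that the trailing partial group of a non-uniform enqueue is still counted. A small standalone illustration of that rounding; group_num is a hypothetical helper, not part of the patch:

    #include <assert.h>
    #include <stddef.h>

    /* Number of work groups needed to cover `global` items when the
     * enqueued local size is `local` items per group, rounding up. */
    static size_t group_num(size_t global, size_t local)
    {
      return global / local + (global % local ? 1 : 0);
    }

    int main(void)
    {
      assert(group_num(64, 16) == 4); /* uniform: exactly 4 full groups  */
      assert(group_num(70, 16) == 5); /* non-uniform: 4 full + 1 partial */
      return 0;
    }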
@@ -357,8 +357,11 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
cl_event event,
const uint32_t work_dim,
const size_t *global_wk_off,
+ const size_t *global_dim_off,
const size_t *global_wk_sz,
- const size_t *local_wk_sz)
+ const size_t *global_wk_sz_use,
+ const size_t *local_wk_sz,
+ const size_t *local_wk_sz_use)
{
cl_gpgpu gpgpu = cl_gpgpu_new(queue->ctx->drv);
cl_context ctx = queue->ctx;
@@ -384,7 +387,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
kernel.use_slm = interp_kernel_use_slm(ker->opaque);
/* Compute the number of HW threads we need */
- if(UNLIKELY(err = cl_kernel_work_group_sz(ker, local_wk_sz, 3, &local_sz) != CL_SUCCESS)) {
+ if(UNLIKELY(err = cl_kernel_work_group_sz(ker, local_wk_sz_use, 3, &local_sz) != CL_SUCCESS)) {
DEBUGP(DL_ERROR, "Work group size exceed Kernel's work group size.");
return err;
}
@@ -397,7 +400,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
}
/* Curbe step 1: fill the constant urb buffer data shared by all threads */
if (ker->curbe) {
- kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz,local_wk_sz ,local_wk_sz, thread_n);
+ kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz,local_wk_sz_use ,local_wk_sz, thread_n);
if (kernel.slm_sz > ker->program->ctx->device->local_mem_size) {
DEBUGP(DL_ERROR, "Out of shared local memory %d.", kernel.slm_sz);
return CL_OUT_OF_RESOURCES;
@@ -458,7 +461,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
for (i = 0; i < thread_n; ++i) {
memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
}
- TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
+ TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz_use, simd_sz, cst_sz, thread_n);
if (cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz) != 0)
goto error;
}
@@ -471,7 +474,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
cl_gpgpu_batch_start(gpgpu);
/* Issue the GPGPU_WALKER command */
- cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
+ cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off,global_dim_off, global_wk_sz_use, local_wk_sz_use);
/* Close the batch buffer and submit it */
cl_gpgpu_batch_end(gpgpu, 0);
diff --git a/src/cl_driver.h b/src/cl_driver.h
index a13ffd92..b45e2fbf 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -326,6 +326,7 @@ typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
uint32_t simd_sz,
uint32_t thread_n,
const size_t global_wk_off[3],
+ const size_t global_dim_off[3],
const size_t global_wk_sz[3],
const size_t local_wk_sz[3]);
extern cl_gpgpu_walker_cb *cl_gpgpu_walker;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index d9562717..d56d35d1 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -2074,6 +2074,7 @@ intel_gpgpu_walker_gen7(intel_gpgpu_t *gpgpu,
uint32_t simd_sz,
uint32_t thread_n,
const size_t global_wk_off[3],
+ const size_t global_dim_off[3],
const size_t global_wk_sz[3],
const size_t local_wk_sz[3])
{
@@ -2123,6 +2124,7 @@ intel_gpgpu_walker_gen8(intel_gpgpu_t *gpgpu,
uint32_t simd_sz,
uint32_t thread_n,
const size_t global_wk_off[3],
+ const size_t global_dim_off[3],
const size_t global_wk_sz[3],
const size_t local_wk_sz[3])
{
@@ -2150,14 +2152,14 @@ intel_gpgpu_walker_gen8(intel_gpgpu_t *gpgpu,
OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
else
OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8 | thread max */
+ OUT_BATCH(gpgpu->batch, global_dim_off[0]);
OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, global_wk_dim[0]+global_dim_off[0]);
+ OUT_BATCH(gpgpu->batch, global_dim_off[1]);
OUT_BATCH(gpgpu->batch, 0);
- OUT_BATCH(gpgpu->batch, global_wk_dim[0]);
- OUT_BATCH(gpgpu->batch, 0);
- OUT_BATCH(gpgpu->batch, 0);
- OUT_BATCH(gpgpu->batch, global_wk_dim[1]);
- OUT_BATCH(gpgpu->batch, 0);
- OUT_BATCH(gpgpu->batch, global_wk_dim[2]);
+ OUT_BATCH(gpgpu->batch, global_wk_dim[1]+global_dim_off[1]);
+ OUT_BATCH(gpgpu->batch, global_dim_off[2]);
+ OUT_BATCH(gpgpu->batch, global_wk_dim[2]+global_dim_off[2]);
OUT_BATCH(gpgpu->batch, right_mask);
OUT_BATCH(gpgpu->batch, ~0x0); /* we always set height as 1, so set bottom mask as all 1*/
ADVANCE_BATCH(gpgpu->batch);
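With this change the walker's starting group coordinates come from global_dim_off and its end coordinates from the sub-range's group count plus that offset, so consecutive sub-enqueues from the wrapper cover consecutive group ranges. A minimal sketch of that bookkeeping for one dimension of the 100-item / 16-wide example used earlier; all names are illustrative, not the driver's:

    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
      const size_t global = 100, local = 16;
      const size_t div = global / local * local; /* 96: uniform part                  */
      const size_t rem = global % local;         /*  4: remainder part                */
      const size_t off = div / local;            /*  6: group offset of the remainder */

      /* Uniform sub-enqueue: groups [0, 6) with local size 16. */
      printf("uniform:   start %u, end %zu\n", 0u, div / local);
      /* Remainder sub-enqueue: group [6, 7) with local size 4, kept in
       * series with the uniform part through the walker's start offset. */
      if (rem)
        printf("remainder: start %zu, end %zu\n", off, off + 1);
      return 0;
    }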