diff options
author | Ruiling Song <ruiling.song@intel.com> | 2016-01-19 11:29:07 +0800 |
---|---|---|
committer | Yang Rong <rong.r.yang@intel.com> | 2016-11-08 20:38:21 +0800 |
commit | 74f2113c479a3a85525e07ad5d410f83218dbadb (patch) | |
tree | 5cd9cfebe4b949a8ea48bdc0f72cebd6de8b7a7e | |
parent | 295b1cb6d1ac616dd4d9b4b2ef14934b86e3ffbf (diff) |
GBE: Implement the to_local(), to_private() and to_global() functions
To avoid a zero address in local memory, 4 bytes are reserved at the start of local memory. This is a temporary workaround and will be fixed later.
Signed-off-by: Ruiling Song <ruiling.song@intel.com>
Reviewed-by: Yang Rong <rong.r.yang@intel.com>
-rw-r--r-- | backend/src/backend/program.h | 1 | ||||
-rw-r--r-- | backend/src/ir/profile.cpp | 4 | ||||
-rw-r--r-- | backend/src/ir/profile.hpp | 3 | ||||
-rw-r--r-- | backend/src/libocl/include/ocl_misc.h | 9 | ||||
-rw-r--r-- | backend/src/libocl/src/ocl_misc.cl | 24 | ||||
-rw-r--r-- | backend/src/llvm/llvm_gen_backend.cpp | 23 | ||||
-rw-r--r-- | backend/src/llvm/llvm_gen_ocl_function.hxx | 1 | ||||
-rw-r--r-- | src/cl_command_queue_gen7.c | 11 |
8 files changed, 70 insertions, 6 deletions
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h index dadc1716..3ffd05d0 100644 --- a/backend/src/backend/program.h +++ b/backend/src/backend/program.h @@ -110,6 +110,7 @@ enum gbe_curbe_type { GBE_CURBE_PROFILING_TIMESTAMP4, GBE_CURBE_THREAD_ID, GBE_CURBE_CONSTANT_ADDRSPACE, + GBE_CURBE_STACK_SIZE, GBE_GEN_REG, }; diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp index b7898037..e62ea568 100644 --- a/backend/src/ir/profile.cpp +++ b/backend/src/ir/profile.cpp @@ -48,7 +48,8 @@ namespace ir { "profiling_timestamps2", "profiling_timestamps3", "profiling_timestamps4", "threadid", - "constant_addrspace_start" + "constant_addrspace_start", + "stack_size" }; #if GBE_DEBUG @@ -101,6 +102,7 @@ namespace ir { DECL_NEW_REG(FAMILY_DWORD, profilingts4, 0, GBE_CURBE_PROFILING_TIMESTAMP4); DECL_NEW_REG(FAMILY_DWORD, threadid, 1, GBE_CURBE_THREAD_ID); DECL_NEW_REG(FAMILY_QWORD, constant_addrspace, 1, GBE_CURBE_CONSTANT_ADDRSPACE); + DECL_NEW_REG(FAMILY_QWORD, stacksize, 1, GBE_CURBE_STACK_SIZE); } #undef DECL_NEW_REG diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp index ba3d4fce..7f1907d7 100644 --- a/backend/src/ir/profile.hpp +++ b/backend/src/ir/profile.hpp @@ -77,7 +77,8 @@ namespace ir { static const Register profilingts4 = Register(33); // timestamp for profiling. static const Register threadid = Register(34); // the thread id of this thread. static const Register constant_addrspace = Register(35); // starting address of program-scope constant - static const uint32_t regNum = 36; // number of special registers + static const Register stacksize = Register(36); // stack buffer total size + static const uint32_t regNum = 37; // number of special registers extern const char *specialRegMean[]; // special register name. 
} /* namespace ocl */ diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h index 7d4abab2..2c0d7003 100644 --- a/backend/src/libocl/include/ocl_misc.h +++ b/backend/src/libocl/include/ocl_misc.h @@ -152,4 +152,13 @@ uint8 __gen_ocl_vme(image2d_t, image2d_t, uint, uint, uint, uint, uint, uint, uint, uint, int, int, int); + +bool __gen_ocl_in_local(size_t p); +bool __gen_ocl_in_private(size_t p); + +#if (__OPENCL_C_VERSION__ >= 200) +local void *__to_local(generic void *p); +global void *__to_global(generic void *p); +private void *__to_private(generic void *p); +#endif #endif diff --git a/backend/src/libocl/src/ocl_misc.cl b/backend/src/libocl/src/ocl_misc.cl index 94bf1781..3b2eb929 100644 --- a/backend/src/libocl/src/ocl_misc.cl +++ b/backend/src/libocl/src/ocl_misc.cl @@ -229,3 +229,27 @@ struct time_stamp __gen_ocl_get_timestamp(void) { return val; }; + +bool __gen_ocl_in_local(size_t p) { + bool cond1 = p > 0; + bool cond2 = p < 64*1024; + return cond1 && cond2; +} + +#if (__OPENCL_C_VERSION__ >= 200) +local void *__to_local(generic void *p) { + bool cond = __gen_ocl_in_local((size_t)p); + return cond ? (local void*)p : NULL; +} +private void *__to_private(generic void *p) { + bool cond = __gen_ocl_in_private((size_t)p); + return cond ? (private void*)p : NULL; +} + +global void *__to_global(generic void *p) { + bool cond1 = __gen_ocl_in_local((size_t)p); + bool cond2 = __gen_ocl_in_private((size_t)p); + bool cond = cond1 || cond2; + return !cond ? 
(global void*)p : NULL; +} +#endif diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index 5b080ad2..cd0ba9fd 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -2840,6 +2840,8 @@ namespace gbe const Constant *c = v.getInitializer(); Type *ty = c->getType(); uint32_t oldSlm = f.getSLMSize(); + // FIXME temporary reserve 4 bytes to avoid 0 address + if (oldSlm == 0) oldSlm = 4; uint32_t align = 8 * getAlignmentByte(unit, ty); uint32_t padding = getPadding(oldSlm*8, align); @@ -3855,6 +3857,7 @@ namespace gbe case GEN_OCL_SIMD_SIZE: case GEN_OCL_READ_TM: case GEN_OCL_REGION: + case GEN_OCL_IN_PRIVATE: case GEN_OCL_SIMD_ID: case GEN_OCL_SIMD_SHUFFLE: case GEN_OCL_VME: @@ -4477,7 +4480,25 @@ namespace gbe ctx.VME(imageID, dstTuple, srcTuple, dst_length, src_length, msg_type, vme_search_path_lut_x.getIntegerValue(), lut_sub_x.getIntegerValue()); - + break; + } + case GEN_OCL_IN_PRIVATE: + { + const ir::Register dst = this->getRegister(&I); + uint32_t stackSize = ctx.getFunction().getStackSize(); + if (stackSize == 0) { + ctx.MOV(ir::TYPE_BOOL, dst, ir::ocl::zero); + } else { + ir::Register cmp0 = ctx.reg(ir::FAMILY_BOOL); + ir::Register cmp1 = ctx.reg(ir::FAMILY_BOOL); + const ir::Register src0 = this->getRegister(*AI); + ir::Register tmp = ctx.reg(ir::FAMILY_QWORD); + + ctx.GE(ir::TYPE_U64, cmp0, src0, ir::ocl::stackbuffer); + ctx.ADD(ir::TYPE_U64, tmp, ir::ocl::stackbuffer, ir::ocl::stacksize); + ctx.LT(ir::TYPE_U64, cmp1, src0, tmp); + ctx.AND(ir::TYPE_BOOL, dst, cmp0, cmp1); + } break; } case GEN_OCL_REGION: diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx index 8ab4373a..9444f0e8 100644 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx @@ -169,6 +169,7 @@ DECL_LLVM_GEN_FUNCTION(SIMD_SHUFFLE, intel_sub_group_shuffle) DECL_LLVM_GEN_FUNCTION(READ_TM, __gen_ocl_read_tm) 
DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region) +DECL_LLVM_GEN_FUNCTION(IN_PRIVATE, __gen_ocl_in_private) DECL_LLVM_GEN_FUNCTION(VME, __gen_ocl_vme) diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index fa28b74c..5033b0ca 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -273,7 +273,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker) const int32_t per_lane_stack_sz = ker->stack_size; const int32_t value = GBE_CURBE_EXTRA_ARGUMENT; const int32_t sub_value = GBE_STACK_BUFFER; - const int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value); + const int32_t offset_stack_buffer = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value); int32_t stack_sz = per_lane_stack_sz; /* No stack required for this kernel */ @@ -283,7 +283,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker) /* The stack size is given for *each* SIMD lane. So, we accordingly compute * the size we need for the complete machine */ - assert(offset >= 0); + assert(offset_stack_buffer >= 0); stack_sz *= interp_kernel_get_simd_width(ker->opaque); stack_sz *= device->max_compute_unit * ctx->device->max_thread_per_unit; @@ -293,7 +293,12 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker) */ cl_driver_enlarge_stack_size(ctx->drv, &stack_sz); - cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE); + const int32_t offset_stack_size = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_SIZE, 0); + if (offset_stack_size >= 0) { + *(uint64_t *)(ker->curbe + offset_stack_size) = stack_sz; + } + + cl_gpgpu_set_stack(gpgpu, offset_stack_buffer, stack_sz, BTI_PRIVATE); } static int |