summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRuiling Song <ruiling.song@intel.com>2016-01-19 11:29:07 +0800
committerYang Rong <rong.r.yang@intel.com>2016-11-08 20:38:21 +0800
commit74f2113c479a3a85525e07ad5d410f83218dbadb (patch)
tree5cd9cfebe4b949a8ea48bdc0f72cebd6de8b7a7e
parent295b1cb6d1ac616dd4d9b4b2ef14934b86e3ffbf (diff)
GBE: Implement to_local/private/global() function
To avoid a zero address in local memory, 4 bytes are reserved. This will be fixed later. Signed-off-by: Ruiling Song <ruiling.song@intel.com> Reviewed-by: Yang Rong <rong.r.yang@intel.com>
-rw-r--r--backend/src/backend/program.h1
-rw-r--r--backend/src/ir/profile.cpp4
-rw-r--r--backend/src/ir/profile.hpp3
-rw-r--r--backend/src/libocl/include/ocl_misc.h9
-rw-r--r--backend/src/libocl/src/ocl_misc.cl24
-rw-r--r--backend/src/llvm/llvm_gen_backend.cpp23
-rw-r--r--backend/src/llvm/llvm_gen_ocl_function.hxx1
-rw-r--r--src/cl_command_queue_gen7.c11
8 files changed, 70 insertions, 6 deletions
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index dadc1716..3ffd05d0 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -110,6 +110,7 @@ enum gbe_curbe_type {
GBE_CURBE_PROFILING_TIMESTAMP4,
GBE_CURBE_THREAD_ID,
GBE_CURBE_CONSTANT_ADDRSPACE,
+ GBE_CURBE_STACK_SIZE,
GBE_GEN_REG,
};
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index b7898037..e62ea568 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -48,7 +48,8 @@ namespace ir {
"profiling_timestamps2", "profiling_timestamps3",
"profiling_timestamps4",
"threadid",
- "constant_addrspace_start"
+ "constant_addrspace_start",
+ "stack_size"
};
#if GBE_DEBUG
@@ -101,6 +102,7 @@ namespace ir {
DECL_NEW_REG(FAMILY_DWORD, profilingts4, 0, GBE_CURBE_PROFILING_TIMESTAMP4);
DECL_NEW_REG(FAMILY_DWORD, threadid, 1, GBE_CURBE_THREAD_ID);
DECL_NEW_REG(FAMILY_QWORD, constant_addrspace, 1, GBE_CURBE_CONSTANT_ADDRSPACE);
+ DECL_NEW_REG(FAMILY_QWORD, stacksize, 1, GBE_CURBE_STACK_SIZE);
}
#undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index ba3d4fce..7f1907d7 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -77,7 +77,8 @@ namespace ir {
static const Register profilingts4 = Register(33); // timestamp for profiling.
static const Register threadid = Register(34); // the thread id of this thread.
static const Register constant_addrspace = Register(35); // starting address of program-scope constant
- static const uint32_t regNum = 36; // number of special registers
+ static const Register stacksize = Register(36); // stack buffer total size
+ static const uint32_t regNum = 37; // number of special registers
extern const char *specialRegMean[]; // special register name.
} /* namespace ocl */
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
index 7d4abab2..2c0d7003 100644
--- a/backend/src/libocl/include/ocl_misc.h
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -152,4 +152,13 @@ uint8 __gen_ocl_vme(image2d_t, image2d_t,
uint, uint, uint, uint,
uint, uint, uint, uint,
int, int, int);
+
+bool __gen_ocl_in_local(size_t p);
+bool __gen_ocl_in_private(size_t p);
+
+#if (__OPENCL_C_VERSION__ >= 200)
+local void *__to_local(generic void *p);
+global void *__to_global(generic void *p);
+private void *__to_private(generic void *p);
+#endif
#endif
diff --git a/backend/src/libocl/src/ocl_misc.cl b/backend/src/libocl/src/ocl_misc.cl
index 94bf1781..3b2eb929 100644
--- a/backend/src/libocl/src/ocl_misc.cl
+++ b/backend/src/libocl/src/ocl_misc.cl
@@ -229,3 +229,27 @@ struct time_stamp __gen_ocl_get_timestamp(void) {
return val;
};
+
+bool __gen_ocl_in_local(size_t p) { // true when address p lies strictly between 0 and 64*1024
+ bool cond1 = p > 0; // address 0 never counts as local; this commit reserves the first 4 SLM bytes to avoid a zero address
+ bool cond2 = p < 64*1024; // NOTE(review): 64*1024 is presumably the upper bound of the local-memory address range -- confirm
+ return cond1 && cond2;
+}
+
+#if (__OPENCL_C_VERSION__ >= 200)
+local void *__to_local(generic void *p) { // casts a generic pointer to local, or yields NULL when it is not a local address
+ bool cond = __gen_ocl_in_local((size_t)p); // range test on the raw address value
+ return cond ? (local void*)p : NULL; // NULL signals that p does not fall in the local range
+}
+private void *__to_private(generic void *p) { // casts a generic pointer to private, or yields NULL when it is not a private address
+ bool cond = __gen_ocl_in_private((size_t)p); // declared in ocl_misc.h; no .cl body is added here, the backend maps it to GEN_OCL_IN_PRIVATE
+ return cond ? (private void*)p : NULL; // NULL signals that p was not reported as private
+}
+
+global void *__to_global(generic void *p) { // casts a generic pointer to global, or yields NULL when it is local or private
+ bool cond1 = __gen_ocl_in_local((size_t)p); // is the address in the local range?
+ bool cond2 = __gen_ocl_in_private((size_t)p); // is the address reported as private?
+ bool cond = cond1 || cond2; // true when p belongs to one of the two other address spaces
+ return !cond ? (global void*)p : NULL; // global is decided by exclusion: neither local nor private
+}
+#endif
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 5b080ad2..cd0ba9fd 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2840,6 +2840,8 @@ namespace gbe
const Constant *c = v.getInitializer();
Type *ty = c->getType();
uint32_t oldSlm = f.getSLMSize();
+ // FIXME: temporarily reserve 4 bytes to avoid a zero address in local memory
+ if (oldSlm == 0) oldSlm = 4;
uint32_t align = 8 * getAlignmentByte(unit, ty);
uint32_t padding = getPadding(oldSlm*8, align);
@@ -3855,6 +3857,7 @@ namespace gbe
case GEN_OCL_SIMD_SIZE:
case GEN_OCL_READ_TM:
case GEN_OCL_REGION:
+ case GEN_OCL_IN_PRIVATE:
case GEN_OCL_SIMD_ID:
case GEN_OCL_SIMD_SHUFFLE:
case GEN_OCL_VME:
@@ -4477,7 +4480,25 @@ namespace gbe
ctx.VME(imageID, dstTuple, srcTuple, dst_length, src_length,
msg_type, vme_search_path_lut_x.getIntegerValue(),
lut_sub_x.getIntegerValue());
-
+ break;
+ }
+ case GEN_OCL_IN_PRIVATE:
+ {
+ const ir::Register dst = this->getRegister(&I);
+ uint32_t stackSize = ctx.getFunction().getStackSize();
+ if (stackSize == 0) {
+ ctx.MOV(ir::TYPE_BOOL, dst, ir::ocl::zero);
+ } else {
+ ir::Register cmp0 = ctx.reg(ir::FAMILY_BOOL);
+ ir::Register cmp1 = ctx.reg(ir::FAMILY_BOOL);
+ const ir::Register src0 = this->getRegister(*AI);
+ ir::Register tmp = ctx.reg(ir::FAMILY_QWORD);
+
+ ctx.GE(ir::TYPE_U64, cmp0, src0, ir::ocl::stackbuffer);
+ ctx.ADD(ir::TYPE_U64, tmp, ir::ocl::stackbuffer, ir::ocl::stacksize);
+ ctx.LT(ir::TYPE_U64, cmp1, src0, tmp);
+ ctx.AND(ir::TYPE_BOOL, dst, cmp0, cmp1);
+ }
break;
}
case GEN_OCL_REGION:
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 8ab4373a..9444f0e8 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -169,6 +169,7 @@ DECL_LLVM_GEN_FUNCTION(SIMD_SHUFFLE, intel_sub_group_shuffle)
DECL_LLVM_GEN_FUNCTION(READ_TM, __gen_ocl_read_tm)
DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
+DECL_LLVM_GEN_FUNCTION(IN_PRIVATE, __gen_ocl_in_private)
DECL_LLVM_GEN_FUNCTION(VME, __gen_ocl_vme)
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index fa28b74c..5033b0ca 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -273,7 +273,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
const int32_t per_lane_stack_sz = ker->stack_size;
const int32_t value = GBE_CURBE_EXTRA_ARGUMENT;
const int32_t sub_value = GBE_STACK_BUFFER;
- const int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value);
+ const int32_t offset_stack_buffer = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value);
int32_t stack_sz = per_lane_stack_sz;
/* No stack required for this kernel */
@@ -283,7 +283,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
/* The stack size is given for *each* SIMD lane. So, we accordingly compute
* the size we need for the complete machine
*/
- assert(offset >= 0);
+ assert(offset_stack_buffer >= 0);
stack_sz *= interp_kernel_get_simd_width(ker->opaque);
stack_sz *= device->max_compute_unit * ctx->device->max_thread_per_unit;
@@ -293,7 +293,12 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
*/
cl_driver_enlarge_stack_size(ctx->drv, &stack_sz);
- cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE);
+ const int32_t offset_stack_size = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_SIZE, 0);
+ if (offset_stack_size >= 0) {
+ *(uint64_t *)(ker->curbe + offset_stack_size) = stack_sz;
+ }
+
+ cl_gpgpu_set_stack(gpgpu, offset_stack_buffer, stack_sz, BTI_PRIVATE);
}
static int