summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRuiling Song <ruiling.song@intel.com>2016-04-05 15:20:19 +0800
committerYang Rong <rong.r.yang@intel.com>2016-11-08 20:38:22 +0800
commit9318ac9bdfd9b101fb762a04f4f9417c9b9a840f (patch)
treeecd791e3ae84687a5b16282d2ea45cb0357a8138
parent6b44815b3519e0b65240991e9f12879b2b63389a (diff)
GBE: add ocl 2.0 work_group_barrier support.
To do an image barrier, we need to: 1. flush the L3 RW cache; 2. do a barrier gateway; 3. flush the sampler cache. Note that the fence arguments may be ORed together. We need to support a non-immediate barrier() argument in the future. v2: change syncField to 6, and modify syncStr. Signed-off-by: Ruiling Song <ruiling.song@intel.com> Reviewed-by: Pan Xiuli <xiuli.pan@intel.com>
-rw-r--r--backend/src/backend/gen8_encoder.cpp24
-rw-r--r--backend/src/backend/gen8_encoder.hpp2
-rw-r--r--backend/src/backend/gen8_instruction.hpp6
-rw-r--r--backend/src/backend/gen9_context.cpp9
-rw-r--r--backend/src/backend/gen_context.cpp11
-rw-r--r--backend/src/backend/gen_defs.hpp1
-rw-r--r--backend/src/backend/gen_encoder.cpp8
-rw-r--r--backend/src/backend/gen_encoder.hpp3
-rw-r--r--backend/src/ir/instruction.cpp5
-rw-r--r--backend/src/ir/instruction.hpp6
-rw-r--r--backend/src/libocl/include/ocl_sync.h1
-rw-r--r--backend/src/libocl/src/ocl_barrier.ll27
-rw-r--r--backend/src/libocl/src/ocl_sync.cl1
-rw-r--r--backend/src/llvm/llvm_gen_backend.cpp28
-rw-r--r--backend/src/llvm/llvm_gen_ocl_function.hxx2
15 files changed, 92 insertions, 42 deletions
diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
index 7b0c1a74..90cae3dd 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -431,6 +431,30 @@ namespace gbe
this->setSrc1(&insn, GenRegister::immd(jip*8));
}
+ void Gen8Encoder::FENCE(GenRegister dst, bool flushRWCache) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+ this->setHeader(insn);
+ this->setDst(insn, dst);
+ this->setSrc0(insn, dst);
+ setMessageDescriptor(insn, GEN_SFID_DATAPORT_DATA, 1, 1, 1);
+ gen8_insn->bits3.gen7_memory_fence.msg_type = GEN_MEM_FENCE;
+ gen8_insn->bits3.gen7_memory_fence.commit_enable = 0x1;
+ gen8_insn->bits3.gen7_memory_fence.flush_rw = flushRWCache ? 1 : 0;
+ }
+
+ void Gen8Encoder::FLUSH_SAMPLERCACHE(GenRegister dst) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ this->setDst(insn, dst);
+ this->setSrc0(insn, GenRegister::ud8grf(0,0));
+ unsigned msg_type = GEN_SAMPLER_MESSAGE_CACHE_FLUSH;
+ unsigned simd_mode = GEN_SAMPLER_SIMD_MODE_SIMD32_64;
+ setSamplerMessage(insn, 0, 0, msg_type,
+ 1, 1,
+ true,
+ simd_mode, 0);
+ }
void Gen8Encoder::setDst(GenNativeInstruction *insn, GenRegister dest) {
Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
diff --git a/backend/src/backend/gen8_encoder.hpp b/backend/src/backend/gen8_encoder.hpp
index d67c0dcf..b73beb36 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -38,6 +38,7 @@ namespace gbe
/*! Jump indexed instruction */
virtual void JMPI(GenRegister src, bool longjmp = false);
+ virtual void FENCE(GenRegister dst, bool flushRWCache);
/*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
virtual void F16TO32(GenRegister dest, GenRegister src0);
@@ -57,6 +58,7 @@ namespace gbe
virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
unsigned char msg_type, uint32_t msg_length,
bool header_present);
+ virtual void FLUSH_SAMPLERCACHE(GenRegister dst);
virtual void setDst(GenNativeInstruction *insn, GenRegister dest);
virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg);
virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg);
diff --git a/backend/src/backend/gen8_instruction.hpp b/backend/src/backend/gen8_instruction.hpp
index b3cbcbc0..446e7f9c 100644
--- a/backend/src/backend/gen8_instruction.hpp
+++ b/backend/src/backend/gen8_instruction.hpp
@@ -540,7 +540,11 @@ union Gen8NativeInstruction
/*! Memory fence */
struct {
uint32_t bti:8;
- uint32_t pad:5;
+ uint32_t pad:1;
+ uint32_t flush_instruction:1;
+ uint32_t flush_texture:1;
+ uint32_t flush_constant:1;
+ uint32_t flush_rw:1;
uint32_t commit_enable:1;
uint32_t msg_type:4;
uint32_t pad2:1;
diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp
index dc05756a..8976edee 100644
--- a/backend/src/backend/gen9_context.cpp
+++ b/backend/src/backend/gen9_context.cpp
@@ -34,9 +34,10 @@ namespace gbe
const GenRegister fenceDst = ra->genReg(insn.dst(0));
uint32_t barrierType = insn.extra.barrierType;
const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+ bool imageFence = barrierType & ir::SYNC_IMAGE_FENCE;
- if (barrierType == ir::syncGlobalBarrier) {
- p->FENCE(fenceDst);
+ if (barrierType & ir::SYNC_GLOBAL_READ_FENCE) {
+ p->FENCE(fenceDst, imageFence);
p->MOV(fenceDst, fenceDst);
}
p->push();
@@ -54,6 +55,10 @@ namespace gbe
p->curr.predicate = GEN_PREDICATE_NONE;
p->WAIT();
p->pop();
+ if (imageFence) {
+ p->FLUSH_SAMPLERCACHE(fenceDst);
+ p->MOV(fenceDst, fenceDst);
+ }
}
void BxtContext::newSelection(void) {
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 1a7b24d2..1ec201bf 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1835,9 +1835,10 @@ namespace gbe
const GenRegister fenceDst = ra->genReg(insn.dst(0));
uint32_t barrierType = insn.extra.barrierType;
const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+ bool imageFence = barrierType & ir::SYNC_IMAGE_FENCE;
- if (barrierType == ir::syncGlobalBarrier) {
- p->FENCE(fenceDst);
+ if (barrierType & ir::SYNC_GLOBAL_READ_FENCE) {
+ p->FENCE(fenceDst, imageFence);
p->MOV(fenceDst, fenceDst);
}
p->push();
@@ -1855,11 +1856,15 @@ namespace gbe
p->curr.predicate = GEN_PREDICATE_NONE;
p->WAIT();
p->pop();
+ if (imageFence) {
+ p->FLUSH_SAMPLERCACHE(fenceDst);
+ p->MOV(fenceDst, fenceDst);
+ }
}
void GenContext::emitFenceInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
- p->FENCE(dst);
+ p->FENCE(dst, false);
p->MOV(dst, dst);
}
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index 9777bf6b..a6c6bb0f 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -426,6 +426,7 @@ enum GenMessageTarget {
#define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE 6
#define GEN5_SAMPLER_MESSAGE_SAMPLE_LD 7
#define GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO 10
+#define GEN_SAMPLER_MESSAGE_CACHE_FLUSH 0x1f
/* for GEN5 only */
#define GEN_SAMPLER_SIMD_MODE_SIMD4X2 0
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 290bbb4e..a69adc7c 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -203,7 +203,7 @@ namespace gbe
unsigned msg_length, unsigned response_length,
bool header_present, bool end_of_thread)
{
- setSrc1(inst, GenRegister::immd(0));
+ setSrc1(inst, GenRegister::immud(0));
inst->bits3.generic_gen5.header_present = header_present;
inst->bits3.generic_gen5.response_length = response_length;
inst->bits3.generic_gen5.msg_length = msg_length;
@@ -913,7 +913,7 @@ namespace gbe
insn->bits3.msg_gateway.notify = notifyN;
}
- void GenEncoder::FENCE(GenRegister dst) {
+ void GenEncoder::FENCE(GenRegister dst, bool flushRWCache) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
this->setHeader(insn);
this->setDst(insn, dst);
@@ -1183,6 +1183,10 @@ namespace gbe
header_present,
simd_mode, return_format);
}
+ void GenEncoder::FLUSH_SAMPLERCACHE(GenRegister dst) {
+ // only Gen8+ support flushing sampler cache
+ assert(0);
+ }
void GenEncoder::setVmeMessage(GenNativeInstruction *insn,
unsigned char bti,
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index eccdb197..00d3eaa3 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -145,7 +145,7 @@ namespace gbe
/*! Forward the gateway message. */
void FWD_GATEWAY_MSG(GenRegister src, uint32_t notifyN = 0);
/*! Memory fence message (to order loads and stores between threads) */
- void FENCE(GenRegister dst);
+ virtual void FENCE(GenRegister dst, bool flushRWCache);
/*! Jump indexed instruction */
virtual void JMPI(GenRegister src, bool longjmp = false);
/*! IF indexed instruction */
@@ -230,6 +230,7 @@ namespace gbe
uint32_t msg_type,
unsigned char vme_search_path_lut,
unsigned char lut_sub);
+ virtual void FLUSH_SAMPLERCACHE(GenRegister dst);
/*! TypedWrite instruction for texture */
virtual void TYPED_WRITE(GenRegister header,
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index b5396cac..0687dbf6 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1475,7 +1475,8 @@ namespace ir {
SYNC_LOCAL_READ_FENCE |
SYNC_LOCAL_WRITE_FENCE |
SYNC_GLOBAL_READ_FENCE |
- SYNC_GLOBAL_WRITE_FENCE;
+ SYNC_GLOBAL_WRITE_FENCE |
+ SYNC_IMAGE_FENCE;
if (UNLIKELY(this->parameters > maxParams)) {
whyNot = "Invalid parameters for sync instruction";
return false;
@@ -1857,7 +1858,7 @@ namespace ir {
}
static const char *syncStr[syncFieldNum] = {
- "workgroup", "local_read", "local_write", "global_read", "global_write"
+ "workgroup", "local_read", "local_write", "global_read", "global_write", "image"
};
INLINE void SyncInstruction::out(std::ostream &out, const Function &fn) const {
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index fe195f40..16c2045e 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -542,17 +542,19 @@ namespace ir {
SYNC_LOCAL_WRITE_FENCE = 1<<2,
SYNC_GLOBAL_READ_FENCE = 1<<3,
SYNC_GLOBAL_WRITE_FENCE = 1<<4,
- SYNC_INVALID = 1<<5
+ SYNC_IMAGE_FENCE = 1<<5,
+ SYNC_INVALID = 1<<6
};
/*! 5 bits to encode all possible synchronization capablities */
- static const uint32_t syncFieldNum = 5u;
+ static const uint32_t syncFieldNum = 6u;
/*! When barrier(CLK_LOCAL_MEM_FENCE) is issued */
static const uint32_t syncLocalBarrier = SYNC_WORKGROUP_EXEC |SYNC_LOCAL_WRITE_FENCE | SYNC_LOCAL_READ_FENCE;
/*! When barrier(CLK_GLOBAL_MEM_FENCE) is issued */
static const uint32_t syncGlobalBarrier = SYNC_WORKGROUP_EXEC | SYNC_GLOBAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE;
+ static const uint32_t syncImageBarrier = SYNC_WORKGROUP_EXEC | SYNC_GLOBAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE | SYNC_IMAGE_FENCE;
/*! Sync instructions are used to order loads and stores for a given memory
* space and/or to serialize threads at a given point in the program
diff --git a/backend/src/libocl/include/ocl_sync.h b/backend/src/libocl/include/ocl_sync.h
index dcd9bbaa..22ff89a3 100644
--- a/backend/src/libocl/include/ocl_sync.h
+++ b/backend/src/libocl/include/ocl_sync.h
@@ -28,5 +28,6 @@ OVERLOADABLE void debugwait(void);
OVERLOADABLE void mem_fence(cl_mem_fence_flags flags);
OVERLOADABLE void read_mem_fence(cl_mem_fence_flags flags);
OVERLOADABLE void write_mem_fence(cl_mem_fence_flags flags);
+#define work_group_barrier barrier
cl_mem_fence_flags get_fence(void *ptr);
#endif /* __OCL_SYNC_H__ */
diff --git a/backend/src/libocl/src/ocl_barrier.ll b/backend/src/libocl/src/ocl_barrier.ll
index 9416f801..502ee678 100644
--- a/backend/src/libocl/src/ocl_barrier.ll
+++ b/backend/src/libocl/src/ocl_barrier.ll
@@ -11,34 +11,11 @@ declare i32 @_get_local_mem_fence() nounwind alwaysinline
declare i32 @_get_global_mem_fence() nounwind alwaysinline
declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate
-declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline noduplicate
declare void @__gen_ocl_debugwait() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier(i32) nounwind alwaysinline noduplicate
define void @_Z7barrierj(i32 %flags) nounwind noduplicate alwaysinline {
- %1 = icmp eq i32 %flags, 3
- br i1 %1, label %barrier_local_global, label %barrier_local_check
-
-barrier_local_global:
- call void @__gen_ocl_barrier_local_and_global()
- br label %done
-
-barrier_local_check:
- %2 = icmp eq i32 %flags, 1
- br i1 %2, label %barrier_local, label %barrier_global_check
-
-barrier_local:
- call void @__gen_ocl_barrier_local()
- br label %done
-
-barrier_global_check:
- %3 = icmp eq i32 %flags, 2
- br i1 %3, label %barrier_global, label %done
-
-barrier_global:
- call void @__gen_ocl_barrier_global()
- br label %done
-
-done:
+ call void @__gen_ocl_barrier(i32 %flags)
ret void
}
diff --git a/backend/src/libocl/src/ocl_sync.cl b/backend/src/libocl/src/ocl_sync.cl
index d297a2ed..590596ae 100644
--- a/backend/src/libocl/src/ocl_sync.cl
+++ b/backend/src/libocl/src/ocl_sync.cl
@@ -20,7 +20,6 @@
void __gen_ocl_barrier_local(void);
void __gen_ocl_barrier_global(void);
-void __gen_ocl_barrier_local_and_global(void);
void __gen_ocl_debugwait(void);
OVERLOADABLE void mem_fence(cl_mem_fence_flags flags) {
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 5bcff0de..dda8c9f4 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -3824,7 +3824,7 @@ namespace gbe
case GEN_OCL_FORCE_SIMD16:
case GEN_OCL_LBARRIER:
case GEN_OCL_GBARRIER:
- case GEN_OCL_LGBARRIER:
+ case GEN_OCL_BARRIER:
ctx.getFunction().setUseSLM(true);
break;
case GEN_OCL_WRITE_IMAGE_I:
@@ -4745,7 +4745,31 @@ namespace gbe
case GEN_OCL_FORCE_SIMD16: ctx.setSimdWidth(16); break;
case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break;
case GEN_OCL_GBARRIER: ctx.SYNC(ir::syncGlobalBarrier); break;
- case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier | ir::syncGlobalBarrier); break;
+ case GEN_OCL_BARRIER:
+ {
+ Constant *CPV = dyn_cast<Constant>(*AI);
+ unsigned syncFlag = 0;
+ if (CPV) {
+ const ir::Immediate &x = processConstantImm(CPV);
+ unsigned barrierArg = x.getIntegerValue();
+ if (barrierArg & 0x1) {
+ syncFlag |= ir::syncLocalBarrier;
+ }
+ if (barrierArg & 0x2) {
+ syncFlag |= ir::syncGlobalBarrier;
+ }
+ if (barrierArg & 0x4) {
+ syncFlag |= ir::syncImageBarrier;
+ }
+ } else {
+ // FIXME we default it to do global fence and barrier.
+ // we need to do runtime check here.
+ syncFlag = ir::syncLocalBarrier | ir::syncGlobalBarrier;
+ }
+
+ ctx.SYNC(syncFlag);
+ break;
+ }
case GEN_OCL_ATOMIC_ADD0:
case GEN_OCL_ATOMIC_ADD1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_ADD); break;
case GEN_OCL_ATOMIC_SUB0:
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 4e7a748b..08fd1e21 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -30,7 +30,7 @@ DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin)
// Barrier function
DECL_LLVM_GEN_FUNCTION(LBARRIER, __gen_ocl_barrier_local)
DECL_LLVM_GEN_FUNCTION(GBARRIER, __gen_ocl_barrier_global)
-DECL_LLVM_GEN_FUNCTION(LGBARRIER, __gen_ocl_barrier_local_and_global)
+DECL_LLVM_GEN_FUNCTION(BARRIER, __gen_ocl_barrier)
// To force SIMD8/16 compilation
DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8, __gen_ocl_force_simd8)