diff options
author | Ruiling Song <ruiling.song@intel.com> | 2016-04-05 15:20:19 +0800 |
---|---|---|
committer | Yang Rong <rong.r.yang@intel.com> | 2016-11-08 20:38:22 +0800 |
commit | 9318ac9bdfd9b101fb762a04f4f9417c9b9a840f (patch) | |
tree | ecd791e3ae84687a5b16282d2ea45cb0357a8138 | |
parent | 6b44815b3519e0b65240991e9f12879b2b63389a (diff) |
GBE: add ocl 2.0 work_group_barrier support.
To do an image barrier, we need to:
1. flush L3 RW cache.
2. do a barrier gateway.
3. flush sampler cache.
Note that the fence arguments may be ORed together.
We need to support a non-immediate barrier() argument in the future.
v2:
Change syncFieldNum to 6, and modify syncStr.
Signed-off-by: Ruiling Song <ruiling.song@intel.com>
Reviewed-by: Pan Xiuli <xiuli.pan@intel.com>
-rw-r--r-- | backend/src/backend/gen8_encoder.cpp | 24 | ||||
-rw-r--r-- | backend/src/backend/gen8_encoder.hpp | 2 | ||||
-rw-r--r-- | backend/src/backend/gen8_instruction.hpp | 6 | ||||
-rw-r--r-- | backend/src/backend/gen9_context.cpp | 9 | ||||
-rw-r--r-- | backend/src/backend/gen_context.cpp | 11 | ||||
-rw-r--r-- | backend/src/backend/gen_defs.hpp | 1 | ||||
-rw-r--r-- | backend/src/backend/gen_encoder.cpp | 8 | ||||
-rw-r--r-- | backend/src/backend/gen_encoder.hpp | 3 | ||||
-rw-r--r-- | backend/src/ir/instruction.cpp | 5 | ||||
-rw-r--r-- | backend/src/ir/instruction.hpp | 6 | ||||
-rw-r--r-- | backend/src/libocl/include/ocl_sync.h | 1 | ||||
-rw-r--r-- | backend/src/libocl/src/ocl_barrier.ll | 27 | ||||
-rw-r--r-- | backend/src/libocl/src/ocl_sync.cl | 1 | ||||
-rw-r--r-- | backend/src/llvm/llvm_gen_backend.cpp | 28 | ||||
-rw-r--r-- | backend/src/llvm/llvm_gen_ocl_function.hxx | 2 |
15 files changed, 92 insertions, 42 deletions
diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp index 7b0c1a74..90cae3dd 100644 --- a/backend/src/backend/gen8_encoder.cpp +++ b/backend/src/backend/gen8_encoder.cpp @@ -431,6 +431,30 @@ namespace gbe this->setSrc1(&insn, GenRegister::immd(jip*8)); } + void Gen8Encoder::FENCE(GenRegister dst, bool flushRWCache) { + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); + Gen8NativeInstruction *gen8_insn = &insn->gen8_insn; + this->setHeader(insn); + this->setDst(insn, dst); + this->setSrc0(insn, dst); + setMessageDescriptor(insn, GEN_SFID_DATAPORT_DATA, 1, 1, 1); + gen8_insn->bits3.gen7_memory_fence.msg_type = GEN_MEM_FENCE; + gen8_insn->bits3.gen7_memory_fence.commit_enable = 0x1; + gen8_insn->bits3.gen7_memory_fence.flush_rw = flushRWCache ? 1 : 0; + } + + void Gen8Encoder::FLUSH_SAMPLERCACHE(GenRegister dst) { + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); + this->setHeader(insn); + this->setDst(insn, dst); + this->setSrc0(insn, GenRegister::ud8grf(0,0)); + unsigned msg_type = GEN_SAMPLER_MESSAGE_CACHE_FLUSH; + unsigned simd_mode = GEN_SAMPLER_SIMD_MODE_SIMD32_64; + setSamplerMessage(insn, 0, 0, msg_type, + 1, 1, + true, + simd_mode, 0); + } void Gen8Encoder::setDst(GenNativeInstruction *insn, GenRegister dest) { Gen8NativeInstruction *gen8_insn = &insn->gen8_insn; diff --git a/backend/src/backend/gen8_encoder.hpp b/backend/src/backend/gen8_encoder.hpp index d67c0dcf..b73beb36 100644 --- a/backend/src/backend/gen8_encoder.hpp +++ b/backend/src/backend/gen8_encoder.hpp @@ -38,6 +38,7 @@ namespace gbe /*! Jump indexed instruction */ virtual void JMPI(GenRegister src, bool longjmp = false); + virtual void FENCE(GenRegister dst, bool flushRWCache); /*! 
Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */ virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip); virtual void F16TO32(GenRegister dest, GenRegister src0); @@ -57,6 +58,7 @@ namespace gbe virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti, unsigned char msg_type, uint32_t msg_length, bool header_present); + virtual void FLUSH_SAMPLERCACHE(GenRegister dst); virtual void setDst(GenNativeInstruction *insn, GenRegister dest); virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg); virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg); diff --git a/backend/src/backend/gen8_instruction.hpp b/backend/src/backend/gen8_instruction.hpp index b3cbcbc0..446e7f9c 100644 --- a/backend/src/backend/gen8_instruction.hpp +++ b/backend/src/backend/gen8_instruction.hpp @@ -540,7 +540,11 @@ union Gen8NativeInstruction /*! Memory fence */ struct { uint32_t bti:8; - uint32_t pad:5; + uint32_t pad:1; + uint32_t flush_instruction:1; + uint32_t flush_texture:1; + uint32_t flush_constant:1; + uint32_t flush_rw:1; uint32_t commit_enable:1; uint32_t msg_type:4; uint32_t pad2:1; diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp index dc05756a..8976edee 100644 --- a/backend/src/backend/gen9_context.cpp +++ b/backend/src/backend/gen9_context.cpp @@ -34,9 +34,10 @@ namespace gbe const GenRegister fenceDst = ra->genReg(insn.dst(0)); uint32_t barrierType = insn.extra.barrierType; const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid)); + bool imageFence = barrierType & ir::SYNC_IMAGE_FENCE; - if (barrierType == ir::syncGlobalBarrier) { - p->FENCE(fenceDst); + if (barrierType & ir::SYNC_GLOBAL_READ_FENCE) { + p->FENCE(fenceDst, imageFence); p->MOV(fenceDst, fenceDst); } p->push(); @@ -54,6 +55,10 @@ namespace gbe p->curr.predicate = GEN_PREDICATE_NONE; p->WAIT(); p->pop(); + if (imageFence) { + p->FLUSH_SAMPLERCACHE(fenceDst); + 
p->MOV(fenceDst, fenceDst); + } } void BxtContext::newSelection(void) { diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 1a7b24d2..1ec201bf 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -1835,9 +1835,10 @@ namespace gbe const GenRegister fenceDst = ra->genReg(insn.dst(0)); uint32_t barrierType = insn.extra.barrierType; const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid)); + bool imageFence = barrierType & ir::SYNC_IMAGE_FENCE; - if (barrierType == ir::syncGlobalBarrier) { - p->FENCE(fenceDst); + if (barrierType & ir::SYNC_GLOBAL_READ_FENCE) { + p->FENCE(fenceDst, imageFence); p->MOV(fenceDst, fenceDst); } p->push(); @@ -1855,11 +1856,15 @@ namespace gbe p->curr.predicate = GEN_PREDICATE_NONE; p->WAIT(); p->pop(); + if (imageFence) { + p->FLUSH_SAMPLERCACHE(fenceDst); + p->MOV(fenceDst, fenceDst); + } } void GenContext::emitFenceInstruction(const SelectionInstruction &insn) { const GenRegister dst = ra->genReg(insn.dst(0)); - p->FENCE(dst); + p->FENCE(dst, false); p->MOV(dst, dst); } diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp index 9777bf6b..a6c6bb0f 100644 --- a/backend/src/backend/gen_defs.hpp +++ b/backend/src/backend/gen_defs.hpp @@ -426,6 +426,7 @@ enum GenMessageTarget { #define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE 6 #define GEN5_SAMPLER_MESSAGE_SAMPLE_LD 7 #define GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO 10 +#define GEN_SAMPLER_MESSAGE_CACHE_FLUSH 0x1f /* for GEN5 only */ #define GEN_SAMPLER_SIMD_MODE_SIMD4X2 0 diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index 290bbb4e..a69adc7c 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -203,7 +203,7 @@ namespace gbe unsigned msg_length, unsigned response_length, bool header_present, bool end_of_thread) { - setSrc1(inst, GenRegister::immd(0)); + setSrc1(inst, 
GenRegister::immud(0)); inst->bits3.generic_gen5.header_present = header_present; inst->bits3.generic_gen5.response_length = response_length; inst->bits3.generic_gen5.msg_length = msg_length; @@ -913,7 +913,7 @@ namespace gbe insn->bits3.msg_gateway.notify = notifyN; } - void GenEncoder::FENCE(GenRegister dst) { + void GenEncoder::FENCE(GenRegister dst, bool flushRWCache) { GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); this->setHeader(insn); this->setDst(insn, dst); @@ -1183,6 +1183,10 @@ namespace gbe header_present, simd_mode, return_format); } + void GenEncoder::FLUSH_SAMPLERCACHE(GenRegister dst) { + // only Gen8+ support flushing sampler cache + assert(0); + } void GenEncoder::setVmeMessage(GenNativeInstruction *insn, unsigned char bti, diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp index eccdb197..00d3eaa3 100644 --- a/backend/src/backend/gen_encoder.hpp +++ b/backend/src/backend/gen_encoder.hpp @@ -145,7 +145,7 @@ namespace gbe /*! Forward the gateway message. */ void FWD_GATEWAY_MSG(GenRegister src, uint32_t notifyN = 0); /*! Memory fence message (to order loads and stores between threads) */ - void FENCE(GenRegister dst); + virtual void FENCE(GenRegister dst, bool flushRWCache); /*! Jump indexed instruction */ virtual void JMPI(GenRegister src, bool longjmp = false); /*! IF indexed instruction */ @@ -230,6 +230,7 @@ namespace gbe uint32_t msg_type, unsigned char vme_search_path_lut, unsigned char lut_sub); + virtual void FLUSH_SAMPLERCACHE(GenRegister dst); /*! 
TypedWrite instruction for texture */ virtual void TYPED_WRITE(GenRegister header, diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp index b5396cac..0687dbf6 100644 --- a/backend/src/ir/instruction.cpp +++ b/backend/src/ir/instruction.cpp @@ -1475,7 +1475,8 @@ namespace ir { SYNC_LOCAL_READ_FENCE | SYNC_LOCAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE | - SYNC_GLOBAL_WRITE_FENCE; + SYNC_GLOBAL_WRITE_FENCE | + SYNC_IMAGE_FENCE; if (UNLIKELY(this->parameters > maxParams)) { whyNot = "Invalid parameters for sync instruction"; return false; @@ -1857,7 +1858,7 @@ namespace ir { } static const char *syncStr[syncFieldNum] = { - "workgroup", "local_read", "local_write", "global_read", "global_write" + "workgroup", "local_read", "local_write", "global_read", "global_write", "image" }; INLINE void SyncInstruction::out(std::ostream &out, const Function &fn) const { diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp index fe195f40..16c2045e 100644 --- a/backend/src/ir/instruction.hpp +++ b/backend/src/ir/instruction.hpp @@ -542,17 +542,19 @@ namespace ir { SYNC_LOCAL_WRITE_FENCE = 1<<2, SYNC_GLOBAL_READ_FENCE = 1<<3, SYNC_GLOBAL_WRITE_FENCE = 1<<4, - SYNC_INVALID = 1<<5 + SYNC_IMAGE_FENCE = 1<<5, + SYNC_INVALID = 1<<6 }; /*! 5 bits to encode all possible synchronization capablities */ - static const uint32_t syncFieldNum = 5u; + static const uint32_t syncFieldNum = 6u; /*! When barrier(CLK_LOCAL_MEM_FENCE) is issued */ static const uint32_t syncLocalBarrier = SYNC_WORKGROUP_EXEC |SYNC_LOCAL_WRITE_FENCE | SYNC_LOCAL_READ_FENCE; /*! When barrier(CLK_GLOBAL_MEM_FENCE) is issued */ static const uint32_t syncGlobalBarrier = SYNC_WORKGROUP_EXEC | SYNC_GLOBAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE; + static const uint32_t syncImageBarrier = SYNC_WORKGROUP_EXEC | SYNC_GLOBAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE | SYNC_IMAGE_FENCE; /*! 
Sync instructions are used to order loads and stores for a given memory * space and/or to serialize threads at a given point in the program diff --git a/backend/src/libocl/include/ocl_sync.h b/backend/src/libocl/include/ocl_sync.h index dcd9bbaa..22ff89a3 100644 --- a/backend/src/libocl/include/ocl_sync.h +++ b/backend/src/libocl/include/ocl_sync.h @@ -28,5 +28,6 @@ OVERLOADABLE void debugwait(void); OVERLOADABLE void mem_fence(cl_mem_fence_flags flags); OVERLOADABLE void read_mem_fence(cl_mem_fence_flags flags); OVERLOADABLE void write_mem_fence(cl_mem_fence_flags flags); +#define work_group_barrier barrier cl_mem_fence_flags get_fence(void *ptr); #endif /* __OCL_SYNC_H__ */ diff --git a/backend/src/libocl/src/ocl_barrier.ll b/backend/src/libocl/src/ocl_barrier.ll index 9416f801..502ee678 100644 --- a/backend/src/libocl/src/ocl_barrier.ll +++ b/backend/src/libocl/src/ocl_barrier.ll @@ -11,34 +11,11 @@ declare i32 @_get_local_mem_fence() nounwind alwaysinline declare i32 @_get_global_mem_fence() nounwind alwaysinline declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate -declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline noduplicate declare void @__gen_ocl_debugwait() nounwind alwaysinline noduplicate +declare void @__gen_ocl_barrier(i32) nounwind alwaysinline noduplicate define void @_Z7barrierj(i32 %flags) nounwind noduplicate alwaysinline { - %1 = icmp eq i32 %flags, 3 - br i1 %1, label %barrier_local_global, label %barrier_local_check - -barrier_local_global: - call void @__gen_ocl_barrier_local_and_global() - br label %done - -barrier_local_check: - %2 = icmp eq i32 %flags, 1 - br i1 %2, label %barrier_local, label %barrier_global_check - -barrier_local: - call void @__gen_ocl_barrier_local() - br label %done - -barrier_global_check: - %3 = icmp eq i32 %flags, 2 - br i1 %3, label %barrier_global, label %done - -barrier_global: - call void 
@__gen_ocl_barrier_global() - br label %done - -done: + call void @__gen_ocl_barrier(i32 %flags) ret void } diff --git a/backend/src/libocl/src/ocl_sync.cl b/backend/src/libocl/src/ocl_sync.cl index d297a2ed..590596ae 100644 --- a/backend/src/libocl/src/ocl_sync.cl +++ b/backend/src/libocl/src/ocl_sync.cl @@ -20,7 +20,6 @@ void __gen_ocl_barrier_local(void); void __gen_ocl_barrier_global(void); -void __gen_ocl_barrier_local_and_global(void); void __gen_ocl_debugwait(void); OVERLOADABLE void mem_fence(cl_mem_fence_flags flags) { diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index 5bcff0de..dda8c9f4 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -3824,7 +3824,7 @@ namespace gbe case GEN_OCL_FORCE_SIMD16: case GEN_OCL_LBARRIER: case GEN_OCL_GBARRIER: - case GEN_OCL_LGBARRIER: + case GEN_OCL_BARRIER: ctx.getFunction().setUseSLM(true); break; case GEN_OCL_WRITE_IMAGE_I: @@ -4745,7 +4745,31 @@ namespace gbe case GEN_OCL_FORCE_SIMD16: ctx.setSimdWidth(16); break; case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break; case GEN_OCL_GBARRIER: ctx.SYNC(ir::syncGlobalBarrier); break; - case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier | ir::syncGlobalBarrier); break; + case GEN_OCL_BARRIER: + { + Constant *CPV = dyn_cast<Constant>(*AI); + unsigned syncFlag = 0; + if (CPV) { + const ir::Immediate &x = processConstantImm(CPV); + unsigned barrierArg = x.getIntegerValue(); + if (barrierArg & 0x1) { + syncFlag |= ir::syncLocalBarrier; + } + if (barrierArg & 0x2) { + syncFlag |= ir::syncGlobalBarrier; + } + if (barrierArg & 0x4) { + syncFlag |= ir::syncImageBarrier; + } + } else { + // FIXME we default it to do global fence and barrier. + // we need to do runtime check here. 
+ syncFlag = ir::syncLocalBarrier | ir::syncGlobalBarrier; + } + + ctx.SYNC(syncFlag); + break; + } case GEN_OCL_ATOMIC_ADD0: case GEN_OCL_ATOMIC_ADD1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_ADD); break; case GEN_OCL_ATOMIC_SUB0: diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx index 4e7a748b..08fd1e21 100644 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx @@ -30,7 +30,7 @@ DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin) // Barrier function DECL_LLVM_GEN_FUNCTION(LBARRIER, __gen_ocl_barrier_local) DECL_LLVM_GEN_FUNCTION(GBARRIER, __gen_ocl_barrier_global) -DECL_LLVM_GEN_FUNCTION(LGBARRIER, __gen_ocl_barrier_local_and_global) +DECL_LLVM_GEN_FUNCTION(BARRIER, __gen_ocl_barrier) // To force SIMD8/16 compilation DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8, __gen_ocl_force_simd8) |