From b714f3cf56cde049c9687984d1a05c833d80a70e Mon Sep 17 00:00:00 2001 From: Grigore Lupescu Date: Mon, 11 Apr 2016 17:37:02 +0300 Subject: Backend: Full support for workgroup broadcast Masked elements by proper init value. V2: Fix barrier call in gen context V3: Fix workgroup float max init value Signed-off-by: Grigore Lupescu Reviewed-by: Pan Xiuli --- backend/src/backend/gen_context.cpp | 242 ++++++++++++++++------------- backend/src/backend/gen_insn_selection.cpp | 195 ++++++++++++++++------- 2 files changed, 269 insertions(+), 168 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 7073b807..1663b6f3 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -2846,52 +2846,15 @@ namespace gbe } p->pop(); } - static void workgroupOp(GenRegister dst, - GenRegister src1, - GenRegister src2, - uint32_t wg_op, - GenEncoder *p) - { - // REDUCE - if (wg_op == ir::WORKGROUP_OP_ANY) - p->OR(dst, src1, src2); - else if (wg_op == ir::WORKGROUP_OP_ALL) - p->AND(dst, src1, src2); - else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD) - p->ADD(dst, src1, src2); - else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN) - p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2); - else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX) - p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2); - - // INCLUSIVE - else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD) - p->ADD(dst, src1, src2); - else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN) - p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2); - else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX) - p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2); - - // EXCLUSIVE - else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD) - p->ADD(dst, src1, src2); - else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN) - p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2); - else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) - p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2); - - else - GBE_ASSERT(0); - } - - static void 
initValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op) + /* Init value according to WORKGROUP OP + * Emit assert is invalid combination operation - datatype */ + static void wgOpInitValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op) { if (wg_op == ir::WORKGROUP_OP_ALL) { if (dataReg.type == GEN_TYPE_D - || dataReg.type == GEN_TYPE_UD - || dataReg.type == GEN_TYPE_F) + || dataReg.type == GEN_TYPE_UD) p->MOV(dataReg, GenRegister::immd(0xFFFFFFFF)); else if(dataReg.type == GEN_TYPE_L || dataReg.type == GEN_TYPE_UL) @@ -2946,7 +2909,7 @@ namespace gbe else if (dataReg.type == GEN_TYPE_UD) p->MOV(dataReg, GenRegister::immud(0x0)); else if (dataReg.type == GEN_TYPE_F) - p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0x7F800000)); + p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0xFF800000)); else if (dataReg.type == GEN_TYPE_L) p->MOV(dataReg, GenRegister::immint64(0x8000000000000000L)); else if (dataReg.type == GEN_TYPE_UL) @@ -2960,8 +2923,53 @@ namespace gbe GBE_ASSERT(0); } - static void workgroupOpInThread(GenRegister threadDst, GenRegister inputVal, GenRegister threadExchangeData, - GenRegister resultVal, uint32_t simd, uint32_t wg_op, GenEncoder *p) { + /* Perform WORKGROUP OP on 2 input elements (registers) */ + static void wgOpPerform(GenRegister dst, + GenRegister src1, + GenRegister src2, + uint32_t wg_op, + GenEncoder *p) + { + /* perform OP REDUCE on 2 elements */ + if (wg_op == ir::WORKGROUP_OP_ANY) + p->OR(dst, src1, src2); + else if (wg_op == ir::WORKGROUP_OP_ALL) + p->AND(dst, src1, src2); + else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD) + p->ADD(dst, src1, src2); + else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN) + p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2); + else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX) + p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2); + + /* perform OP SCAN INCLUSIVE on 2 elements */ + else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD) + p->ADD(dst, src1, src2); + else 
if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN) + p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2); + else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX) + p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2); + + /* perform OP SCAN EXCLUSIVE on 2 elements */ + else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD) + p->ADD(dst, src1, src2); + else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN) + p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2); + else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) + p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2); + + else + GBE_ASSERT(0); + } + + static void wgOpPerformThread(GenRegister threadDst, + GenRegister inputVal, + GenRegister threadExchangeData, + GenRegister resultVal, + uint32_t simd, + uint32_t wg_op, + GenEncoder *p) + { p->push(); p->curr.predicate = GEN_PREDICATE_NONE; p->curr.noMask = 1; @@ -2981,7 +2989,7 @@ namespace gbe /* init thread data to min/max/null values */ p->push(); { p->curr.execWidth = simd; - initValue(p, threadExchangeData, wg_op); + wgOpInitValue(p, threadExchangeData, wg_op); p->MOV(resultVal, inputVal); } p->pop(); @@ -3046,17 +3054,17 @@ namespace gbe wg_op == ir::WORKGROUP_OP_REDUCE_ADD || wg_op == ir::WORKGROUP_OP_REDUCE_MIN || wg_op == ir::WORKGROUP_OP_REDUCE_MAX) - workgroupOp(result[0], result[0], input[i], wg_op, p); + wgOpPerform(result[0], result[0], input[i], wg_op, p); else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX) - workgroupOp(result[i], result[i - 1], input[i], wg_op, p); + wgOpPerform(result[i], result[i - 1], input[i], wg_op, p); else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) - workgroupOp(result[i], result[i - 1], input[i - 1], wg_op, p); + wgOpPerform(result[i], result[i - 1], input[i - 1], wg_op, p); else GBE_ASSERT(0); @@ -3090,11 +3098,11 @@ namespace gbe { p->curr.execWidth = 1; /* set result[0] to min/max/null */ - 
initValue(p, result[0], wg_op); + wgOpInitValue(p, result[0], wg_op); p->curr.execWidth = 16; /* value exchanged with other threads */ - workgroupOp(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p); + wgOpPerform(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p); /* partial result thread */ p->MOV(threadDst, resultVal); } @@ -3103,68 +3111,82 @@ namespace gbe } /** - * Basic idea: - * 1. All the threads firstly calculate the max/min/add value for the + * WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE + * + * Implementation: + * 1. All the threads first compute the workgroup op value for the * allocated work-items. SIMD16=> 16 work-items allocated for each thread - * 2. Each thread will write the computed reduce OP result in SLM memory - * based on the threadId - * 3. After a memory fence, each thread will read in chunks of 4 elements, - * the SLM region, using a loop based on the thread count value (threadN) - * 4. At the end each thread has the final value computed individually + * 2. Each thread writes the partial result in shared local memory using threadId + * 3. After a barrier, each thread will read in chunks of 1-4 elements, + * the shared local memory region, using a loop based on the thread num value (threadN) + * 4. Each thread computes the final value individually + * + * Optimizations: + * Performance is given by chunk read. If threads read in chunks of 4 elements + * the performance increases 2-3x compared to chunks of 1 element. 
*/ void GenContext::emitWorkGroupOpInstruction(const SelectionInstruction &insn){ const GenRegister dst = ra->genReg(insn.dst(0)); - const GenRegister tmp = ra->genReg(insn.dst(1)); - const GenRegister theVal = ra->genReg(insn.src(2)); + const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1)), dst.type); + const GenRegister theVal = GenRegister::retype(ra->genReg(insn.src(2)), dst.type); GenRegister threadData = ra->genReg(insn.src(3)); GenRegister partialData = GenRegister::toUniform(threadData, dst.type); GenRegister threadId = ra->genReg(insn.src(0)); GenRegister threadLoop = ra->genReg(insn.src(1)); GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid)); + GenRegister localBarrier = ra->genReg(insn.src(5)); uint32_t wg_op = insn.extra.workgroupOp; uint32_t simd = p->curr.execWidth; int32_t jip0, jip1; + /* masked elements should be properly set to init value */ + p->push(); { + p->curr.noMask = 1; + wgOpInitValue(p, tmp, wg_op); + p->curr.noMask = 0; + p->MOV(tmp, theVal); + p->curr.noMask = 1; + p->MOV(theVal, tmp); + } p->pop(); + threadId = GenRegister::toUniform(threadId, GEN_TYPE_UD); - /* Use of continuous GRF allocation from insn selection */ + /* use of continuous GRF allocation from insn selection */ GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type); GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD); GenRegister msgAddr = GenRegister::retype(GenRegister::offset(msg, 0), GEN_TYPE_UD); GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type); - /* Do some calculation within each thread */ - workgroupOpInThread(dst, theVal, threadData, tmp, simd, wg_op, p); + /* do some calculation within each thread */ + wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p); p->curr.execWidth = 16; p->MOV(theVal, dst); threadData = GenRegister::toUniform(threadData, dst.type); - /* Store thread count for future use on read/write to SLM */ + /* store 
thread count for future use on read/write to SLM */ if (wg_op == ir::WORKGROUP_OP_ANY || - wg_op == ir::WORKGROUP_OP_ALL || - wg_op == ir::WORKGROUP_OP_REDUCE_ADD || - wg_op == ir::WORKGROUP_OP_REDUCE_MIN || - wg_op == ir::WORKGROUP_OP_REDUCE_MAX) + wg_op == ir::WORKGROUP_OP_ALL || + wg_op == ir::WORKGROUP_OP_REDUCE_ADD || + wg_op == ir::WORKGROUP_OP_REDUCE_MIN || + wg_op == ir::WORKGROUP_OP_REDUCE_MAX) { - //p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn))); - threadLoop = GenRegister::retype(tmp, GEN_TYPE_D); - p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn))); + threadLoop = GenRegister::retype(tmp, GEN_TYPE_D); + p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn))); } else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD || - wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || - wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX || - wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD || - wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN || - wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) + wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || + wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) { - //p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid))); - threadLoop = GenRegister::retype(tmp, GEN_TYPE_D); - p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid))); + threadLoop = GenRegister::retype(tmp, GEN_TYPE_D); + p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid))); } - /* All threads write the partial results to SLM memory */ + /* all threads write the partial results to SLM memory */ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L) { GenRegister threadDataL = GenRegister::retype(threadData, GEN_TYPE_D); @@ -3186,25 +3208,25 @@ namespace gbe p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1); } - /* Init partialData register, it will hold the final result */ - initValue(p, partialData, 
wg_op); + /* init partialData register, it will hold the final result */ + wgOpInitValue(p, partialData, wg_op); - /* Add call to barrier */ + /* add call to barrier */ p->push(); p->curr.execWidth = 8; p->curr.physicalFlag = 0; p->curr.noMask = 1; - p->AND(msgData, barrierId, GenRegister::immud(0x0f000000)); - p->BARRIER(msgData); + p->AND(localBarrier, barrierId, GenRegister::immud(0x0f000000)); + p->BARRIER(localBarrier); p->curr.execWidth = 1; p->WAIT(); p->pop(); - /* Perform a loop, based on thread count (which is now multiple of 4) */ + /* perform a loop, based on thread count (which is now multiple of 4) */ p->push();{ jip0 = p->n_instruction(); - /* Read in chunks of 4 to optimize SLM reads and reduce SEND messages */ + /* read in chunks of 4 to optimize SLM reads and reduce SEND messages */ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L) { p->curr.execWidth = 8; @@ -3220,8 +3242,8 @@ namespace gbe msgDataH.hstride = 2; p->MOV(msgDataL, msgDataH); - /* Perform operation, partialData will hold result */ - workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p); + /* perform operation, partialData will hold result */ + wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p); } else { @@ -3232,11 +3254,11 @@ namespace gbe p->ADD(msgAddr, msgAddr, msgSlmOff); p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1); - /* Perform operation, partialData will hold result */ - workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p); + /* perform operation, partialData will hold result */ + wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p); } - /* While threadN is not 0, cycle read SLM / update value */ + /* while threadN is not 0, cycle read SLM / update value */ p->curr.noMask = 1; p->curr.flag = 0; p->curr.subFlag = 1; @@ -3248,22 +3270,25 @@ namespace gbe } p->pop(); if(wg_op == ir::WORKGROUP_OP_ANY || - wg_op == ir::WORKGROUP_OP_ALL || - wg_op == 
ir::WORKGROUP_OP_REDUCE_ADD || - wg_op == ir::WORKGROUP_OP_REDUCE_MIN || - wg_op == ir::WORKGROUP_OP_REDUCE_MAX) + wg_op == ir::WORKGROUP_OP_ALL || + wg_op == ir::WORKGROUP_OP_REDUCE_ADD || + wg_op == ir::WORKGROUP_OP_REDUCE_MIN || + wg_op == ir::WORKGROUP_OP_REDUCE_MAX) { - /* Save result to final register location dst */ - p->curr.execWidth = 16; - p->MOV(dst, partialData); + /* save result to final register location dst */ + p->curr.execWidth = 16; + p->MOV(dst, partialData); } - else { - /* Save result to final register location dst */ - p->curr.execWidth = 16; + else + { + /* save result to final register location dst */ + p->curr.execWidth = 16; + if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD) p->ADD(dst, dst, partialData); - else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN) + else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN + || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN) { p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData); /* workaround QW datatype on CMP */ @@ -3276,7 +3301,8 @@ namespace gbe dst.offset(dst, 3, 0), partialData); } } - else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) + else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX + || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) { p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData); /* workaround QW datatype on CMP */ @@ -3293,11 +3319,11 @@ namespace gbe /* corner cases for threads 0 */ if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD || - wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || - wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX || - wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD || - wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN || - wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) + wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || + wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) { p->push();{ 
p->curr.flag = 0; diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 7a8800b4..07901a6e 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -688,10 +688,12 @@ namespace gbe /*! double division */ void F64DIV(Reg dst, Reg src0, Reg src1, GenRegister* tmp, int tmpNum); /*! Work Group Operations */ - void WORKGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, GenRegister data, - GenRegister threadId, GenRegister threadN, - GenRegister tmp, GenRegister slmOff, vector msg, - uint32_t msgSizeReq); + void WORKGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, + GenRegister tmpData1, + GenRegister localThreadID, GenRegister localThreadNUM, + GenRegister tmpData2, GenRegister slmOff, + vector msg, uint32_t msgSizeReq, + GenRegister localBarrier); /* common functions for both binary instruction and sel_cmp and compare instruction. It will handle the IMM or normal register assignment, and will try to avoid LOADI as much as possible. 
*/ @@ -1960,15 +1962,16 @@ namespace gbe void Selection::Opaque::WORKGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, - GenRegister data, - GenRegister threadId, - GenRegister threadN, - GenRegister tmp, + GenRegister tmpData1, + GenRegister localThreadID, + GenRegister localThreadNUM, + GenRegister tmpData2, GenRegister slmOff, vector msg, - uint32_t msgSizeReq = 6) + uint32_t msgSizeReq, + GenRegister localBarrier) { - SelectionInstruction *insn = this->appendInsn(SEL_OP_WORKGROUP_OP, 2 + msg.size(), 5); + SelectionInstruction *insn = this->appendInsn(SEL_OP_WORKGROUP_OP, 2 + msg.size(), 6); SelectionVector *vector = this->appendVector(); /* allocate continuous GRF registers for READ/WRITE to SLM */ @@ -1980,15 +1983,16 @@ namespace gbe insn->extra.workgroupOp = wg_op; insn->dst(0) = dst; - insn->dst(1) = tmp; + insn->dst(1) = tmpData1; for(uint32_t i = 0; i < msg.size(); i++) insn->dst(2 + i) = msg[i]; - insn->src(0) = threadId; - insn->src(1) = threadN; + insn->src(0) = localThreadID; + insn->src(1) = localThreadNUM; insn->src(2) = src; - insn->src(3) = data; + insn->src(3) = tmpData2; insn->src(4) = slmOff; + insn->src(5) = localBarrier; } // Boiler plate to initialize the selection library at c++ pre-main @@ -6199,64 +6203,77 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp /*! 
WorkGroup instruction pattern */ DECL_PATTERN(WorkGroupInstruction) { - /* SLM bassed communication between threads, most of the logic bellow */ + /* WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE + * Shared local memory based communication between threads, + * prepare for the workgroup op in gen context + * Algorithm logic is in gen context. */ INLINE bool emitWGReduce(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const { using namespace ir; - const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode(); + GBE_ASSERT(insn.getSrcNum() == 3); + GBE_ASSERT(insn.getSrc(0) == ocl::threadn); + GBE_ASSERT(insn.getSrc(1) == ocl::threadid); + + const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode(); const Type type = insn.getType(); GenRegister dst = sel.selReg(insn.getDst(0), type); GenRegister src = sel.selReg(insn.getSrc(2), type); - GenRegister threadId = sel.selReg(ocl::threadid, ir::TYPE_U32); - GenRegister threadN = sel.selReg(ocl::threadn, ir::TYPE_U32); - const uint32_t srcNum = insn.getSrcNum(); - - GBE_ASSERT(srcNum == 3); - GBE_ASSERT(insn.getSrc(0) == ir::ocl::threadn); - GBE_ASSERT(insn.getSrc(1) == ir::ocl::threadid); - GenRegister tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type); - GenRegister data = sel.selReg(sel.reg(FAMILY_QWORD), type); - GenRegister slmOff = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U32); - + GenRegister tmpData1 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type); + GenRegister tmpData2 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type); + GenRegister slmOff = sel.selReg(sel.reg(FAMILY_QWORD), TYPE_U32); + GenRegister localThreadID = sel.selReg(ocl::threadid, TYPE_U32); + GenRegister localThreadNUM = sel.selReg(ocl::threadn, TYPE_U32); + GenRegister localBarrier = GenRegister::ud8grf(sel.reg(FAMILY_DWORD)); + + /* Allocate registers for message sending + * (read/write to shared local memory) */ vector msg; for(uint32_t i = 0; i < 6; i++) - 
msg.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type)); + msg.push_back(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32)); - /* insert a barrier to make sure all the var we are interested in + /* Insert a barrier to make sure all the var we are interested in have been assigned the final value. */ - sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier); + sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), + sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier); - /* compute individual slice of workitems, (e.g. 0->16 workitems) */ + /* Pass the shared local memory offset */ sel.MOV(slmOff, GenRegister::immud(insn.getSlmAddr())); - /* barrier for syn prior to workgroup */ - sel.WORKGROUP_OP(workGroupOp, dst, src, data, threadId, threadN, tmp, slmOff, msg); + /* Perform workgroup op */ + sel.WORKGROUP_OP(workGroupOp, dst, src, tmpData1, + localThreadID, localThreadNUM, tmpData2, slmOff, msg, 6, + localBarrier); return true; } - INLINE bool emitWGBroadcast(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const { - /* 1. BARRIER Ensure all the threads have set the correct value for the var which will be broadcasted. - 2. CMP IDs Compare the local IDs with the specified ones in the function call. - 3. STORE Use flag to control the store of the var. Only the specified item will execute the store. - 4. BARRIER Ensure the specified value has been stored. - 5. LOAD Load the stored value to all the dst value, the dst of all the items will have same value, - so broadcasted. */ + /* WORKGROUP OP: BROADCAST + * 1. BARRIER Ensure all the threads have set the correct value for the var which will be broadcasted. + 2. CMP IDs Compare the local IDs with the specified ones in the function call. + 3. STORE Use flag to control the store of the var. Only the specified item will execute the store. + 4. BARRIER Ensure the specified value has been stored. + 5. 
LOAD Load the stored value to all the dst value, the dst of all the items will have same value, + so broadcasted. */ + INLINE bool emitWGBroadcast(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const + { using namespace ir; + + const uint32_t srcNum = insn.getSrcNum(); + GBE_ASSERT(srcNum >= 2); + const Type type = insn.getType(); const GenRegister src = sel.selReg(insn.getSrc(0), type); const GenRegister dst = sel.selReg(insn.getDst(0), type); - const uint32_t srcNum = insn.getSrcNum(); const uint32_t slmAddr = insn.getSlmAddr(); - GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); + GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); vector fakeTemps; + fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type)); + fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type)); - GBE_ASSERT(srcNum >= 2); GenRegister coords[3]; - for (uint32_t i = 1; i < srcNum; i++) { - coords[i - 1] = sel.selReg(insn.getSrc(i), TYPE_U32); - } + for (uint32_t i = 1; i < srcNum; i++) + coords[i - 1] = GenRegister::toUniform(sel.selReg(insn.getSrc(i), TYPE_U32), GEN_TYPE_UD); sel.push(); { sel.curr.predicate = GEN_PREDICATE_NONE; @@ -6266,7 +6283,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp /* insert a barrier to make sure all the var we are interested in have been assigned the final value. 
*/ - sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier); + sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), + sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier); sel.push(); { sel.curr.flag = 0; @@ -6275,33 +6293,90 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp sel.curr.noMask = 1; GenRegister lid0, lid1, lid2; uint32_t dim = srcNum - 1; - lid0 = sel.selReg(ir::ocl::lid0); - lid1 = sel.selReg(ir::ocl::lid1); - lid2 = sel.selReg(ir::ocl::lid2); + lid0 = GenRegister::retype(sel.selReg(ocl::lid0, TYPE_U32), GEN_TYPE_UD); + lid1 = GenRegister::retype(sel.selReg(ocl::lid1, TYPE_U32), GEN_TYPE_UD); + lid2 = GenRegister::retype(sel.selReg(ocl::lid2, TYPE_U32), GEN_TYPE_UD); - sel.CMP(GEN_CONDITIONAL_EQ, coords[0], lid0, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD)); + sel.CMP(GEN_CONDITIONAL_EQ, coords[0], lid0, + GenRegister::retype(GenRegister::null(), GEN_TYPE_UD)); sel.curr.predicate = GEN_PREDICATE_NORMAL; if (dim >= 2) - sel.CMP(GEN_CONDITIONAL_EQ, coords[1], lid1, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD)); + sel.CMP(GEN_CONDITIONAL_EQ, coords[1], lid1, + GenRegister::retype(GenRegister::null(), GEN_TYPE_UD)); if (dim >= 3) - sel.CMP(GEN_CONDITIONAL_EQ, coords[2], lid2, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD)); + sel.CMP(GEN_CONDITIONAL_EQ, coords[2], lid2, + GenRegister::retype(GenRegister::null(), GEN_TYPE_UD)); - if (typeSize(src.type) == 4) { - GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_F); - GenRegister _src = GenRegister::retype(src, GEN_TYPE_F); + /* write to shared local memory for BYTE/WORD/DWORD types */ + if (typeSize(src.type) <= 4) { + GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD); + GenRegister _src = GenRegister::retype(src, GEN_TYPE_UD); sel.UNTYPED_WRITE(_addr, &_src, 1, GenRegister::immw(0xfe), fakeTemps); } + /* write to shared local memory for QWORD types */ + else if (typeSize(src.type) 
== 8) { + sel.push(); { + /* arrange data in QWORD */ + GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD); + GenRegister srcQW = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64); + GenRegister srcQW_p1 = src.retype(srcQW, GEN_TYPE_UD); + GenRegister srcQW_p2 = src.retype(src.offset(srcQW, 2, 0), GEN_TYPE_UD); + vector srcVec; + srcVec.push_back(srcQW_p1); + srcVec.push_back(srcQW_p2); + + /* unpack into 2 DWORD */ + sel.UNPACK_LONG(srcQW, src); + + /* emit write through SEND */ + sel.UNTYPED_WRITE(_addr, srcVec.data(), 2, + GenRegister::immw(0xfe), fakeTemps); + }sel.pop(); + } + else + GBE_ASSERT(0); + } sel.pop(); - /* Make sure the slm var have the valid value now */ - sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier); + /* make sure the slm var have the valid value now */ + sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), + sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier); + + /* read from shared local memory for BYTE/WORD/DWORD types */ + if (typeSize(src.type) <= 4) { + GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD); + GenRegister _dst = GenRegister::retype(dst, GEN_TYPE_UD); + sel.UNTYPED_READ(_addr, &_dst, 1, GenRegister::immw(0xfe), fakeTemps); + } + /* read from shared local memory for QWORD types */ + else if (typeSize(src.type) == 8) { + GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD); + vector _dst; + _dst.push_back(sel.selReg(sel.reg(FAMILY_WORD), ir::TYPE_U32)); + _dst.push_back(sel.selReg(sel.reg(FAMILY_WORD), ir::TYPE_U32)); + GenRegister _dstQ = dst.toUniform(_dst[0], GEN_TYPE_UL); + + sel.push(); { + /* emit read through SEND */ + sel.curr.execWidth = 8; + sel.UNTYPED_READ(_addr, _dst.data(), 2, GenRegister::immw(0xfe), fakeTemps); + + /* reconstruct QWORD type */ + _dst[0] = dst.toUniform(dst.offset(_dst[0], 0, 4), GEN_TYPE_UD); + _dst[1] = dst.toUniform(_dst[1], GEN_TYPE_UD); + sel.curr.execWidth = 1; + sel.MOV(_dst[0], _dst[1]); + } sel.pop(); 
- if (typeSize(src.type) == 4) { - sel.UNTYPED_READ(addr, &dst, 1, GenRegister::immw(0xfe), fakeTemps); + /* set all elements assigned to thread */ + sel.MOV(dst, _dstQ); } + else + GBE_ASSERT(0); return true; } + INLINE bool emitOne(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn, bool &markChildren) const { using namespace ir; -- cgit v1.2.3