summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Grigore Lupescu <grigore.lupescu at intel.com> 2016-04-11 17:37:02 +0300
committer: Yang Rong <rong.r.yang@intel.com> 2016-05-18 15:10:36 +0800
commit: b714f3cf56cde049c9687984d1a05c833d80a70e (patch)
tree: 226e0e4e8e2ef6fe066a77e189c959af7a795ff8
parent: b561e87a34f59093de7410f3bd3c0007610133a7 (diff)
Backend: Full support for workgroup broadcast
Masked elements are set to the proper init value. V2: Fix barrier call in gen context. V3: Fix workgroup float max init value. Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com> Reviewed-by: Pan Xiuli <xiuli.pan@intel.com>
-rw-r--r-- backend/src/backend/gen_context.cpp | 242
-rw-r--r-- backend/src/backend/gen_insn_selection.cpp | 195
2 files changed, 269 insertions, 168 deletions
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 7073b807..1663b6f3 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2846,52 +2846,15 @@ namespace gbe
} p->pop();
}
- static void workgroupOp(GenRegister dst,
- GenRegister src1,
- GenRegister src2,
- uint32_t wg_op,
- GenEncoder *p)
- {
- // REDUCE
- if (wg_op == ir::WORKGROUP_OP_ANY)
- p->OR(dst, src1, src2);
- else if (wg_op == ir::WORKGROUP_OP_ALL)
- p->AND(dst, src1, src2);
- else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
- p->ADD(dst, src1, src2);
- else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
- p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
- else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
- p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
-
- // INCLUSIVE
- else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
- p->ADD(dst, src1, src2);
- else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
- p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
- else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
- p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
-
- // EXCLUSIVE
- else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
- p->ADD(dst, src1, src2);
- else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
- p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
- else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
- p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
-
- else
- GBE_ASSERT(0);
- }
-
- static void initValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
+ /* Init value according to WORKGROUP OP
+ * Emit assert if the operation/datatype combination is invalid */
+ static void wgOpInitValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
{
if (wg_op == ir::WORKGROUP_OP_ALL)
{
if (dataReg.type == GEN_TYPE_D
- || dataReg.type == GEN_TYPE_UD
- || dataReg.type == GEN_TYPE_F)
+ || dataReg.type == GEN_TYPE_UD)
p->MOV(dataReg, GenRegister::immd(0xFFFFFFFF));
else if(dataReg.type == GEN_TYPE_L ||
dataReg.type == GEN_TYPE_UL)
@@ -2946,7 +2909,7 @@ namespace gbe
else if (dataReg.type == GEN_TYPE_UD)
p->MOV(dataReg, GenRegister::immud(0x0));
else if (dataReg.type == GEN_TYPE_F)
- p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0x7F800000));
+ p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0xFF800000));
else if (dataReg.type == GEN_TYPE_L)
p->MOV(dataReg, GenRegister::immint64(0x8000000000000000L));
else if (dataReg.type == GEN_TYPE_UL)
@@ -2960,8 +2923,53 @@ namespace gbe
GBE_ASSERT(0);
}
- static void workgroupOpInThread(GenRegister threadDst, GenRegister inputVal, GenRegister threadExchangeData,
- GenRegister resultVal, uint32_t simd, uint32_t wg_op, GenEncoder *p) {
+ /* Perform WORKGROUP OP on 2 input elements (registers) */
+ static void wgOpPerform(GenRegister dst,
+ GenRegister src1,
+ GenRegister src2,
+ uint32_t wg_op,
+ GenEncoder *p)
+ {
+ /* perform OP REDUCE on 2 elements */
+ if (wg_op == ir::WORKGROUP_OP_ANY)
+ p->OR(dst, src1, src2);
+ else if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->AND(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
+ p->ADD(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+ /* perform OP SCAN INCLUSIVE on 2 elements */
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
+ p->ADD(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+ /* perform OP SCAN EXCLUSIVE on 2 elements */
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ p->ADD(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+ else
+ GBE_ASSERT(0);
+ }
+
+ static void wgOpPerformThread(GenRegister threadDst,
+ GenRegister inputVal,
+ GenRegister threadExchangeData,
+ GenRegister resultVal,
+ uint32_t simd,
+ uint32_t wg_op,
+ GenEncoder *p)
+ {
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.noMask = 1;
@@ -2981,7 +2989,7 @@ namespace gbe
/* init thread data to min/max/null values */
p->push(); {
p->curr.execWidth = simd;
- initValue(p, threadExchangeData, wg_op);
+ wgOpInitValue(p, threadExchangeData, wg_op);
p->MOV(resultVal, inputVal);
} p->pop();
@@ -3046,17 +3054,17 @@ namespace gbe
wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
- workgroupOp(result[0], result[0], input[i], wg_op, p);
+ wgOpPerform(result[0], result[0], input[i], wg_op, p);
else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
- workgroupOp(result[i], result[i - 1], input[i], wg_op, p);
+ wgOpPerform(result[i], result[i - 1], input[i], wg_op, p);
else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
- workgroupOp(result[i], result[i - 1], input[i - 1], wg_op, p);
+ wgOpPerform(result[i], result[i - 1], input[i - 1], wg_op, p);
else
GBE_ASSERT(0);
@@ -3090,11 +3098,11 @@ namespace gbe
{
p->curr.execWidth = 1;
/* set result[0] to min/max/null */
- initValue(p, result[0], wg_op);
+ wgOpInitValue(p, result[0], wg_op);
p->curr.execWidth = 16;
/* value exchanged with other threads */
- workgroupOp(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
+ wgOpPerform(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
/* partial result thread */
p->MOV(threadDst, resultVal);
}
@@ -3103,68 +3111,82 @@ namespace gbe
}
/**
- * Basic idea:
- * 1. All the threads firstly calculate the max/min/add value for the
+ * WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
+ *
+ * Implementation:
+ * 1. All the threads first compute the workgroup op value for the
* allocated work-items. SIMD16=> 16 work-items allocated for each thread
- * 2. Each thread will write the computed reduce OP result in SLM memory
- * based on the threadId
- * 3. After a memory fence, each thread will read in chunks of 4 elements,
- * the SLM region, using a loop based on the thread count value (threadN)
- * 4. At the end each thread has the final value computed individually
+ * 2. Each thread writes the partial result in shared local memory using threadId
+ * 3. After a barrier, each thread will read in chunks of 1-4 elements,
+ * the shared local memory region, using a loop based on the thread num value (threadN)
+ * 4. Each thread computes the final value individually
+ *
+ * Optimizations:
+ * Performance is given by chunk read. If threads read in chunks of 4 elements
+ * the performance increases 2-3x compared to chunks of 1 element.
*/
void GenContext::emitWorkGroupOpInstruction(const SelectionInstruction &insn){
const GenRegister dst = ra->genReg(insn.dst(0));
- const GenRegister tmp = ra->genReg(insn.dst(1));
- const GenRegister theVal = ra->genReg(insn.src(2));
+ const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1)), dst.type);
+ const GenRegister theVal = GenRegister::retype(ra->genReg(insn.src(2)), dst.type);
GenRegister threadData = ra->genReg(insn.src(3));
GenRegister partialData = GenRegister::toUniform(threadData, dst.type);
GenRegister threadId = ra->genReg(insn.src(0));
GenRegister threadLoop = ra->genReg(insn.src(1));
GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+ GenRegister localBarrier = ra->genReg(insn.src(5));
uint32_t wg_op = insn.extra.workgroupOp;
uint32_t simd = p->curr.execWidth;
int32_t jip0, jip1;
+ /* masked elements should be properly set to init value */
+ p->push(); {
+ p->curr.noMask = 1;
+ wgOpInitValue(p, tmp, wg_op);
+ p->curr.noMask = 0;
+ p->MOV(tmp, theVal);
+ p->curr.noMask = 1;
+ p->MOV(theVal, tmp);
+ } p->pop();
+
threadId = GenRegister::toUniform(threadId, GEN_TYPE_UD);
- /* Use of continuous GRF allocation from insn selection */
+ /* use of continuous GRF allocation from insn selection */
GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type);
GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
GenRegister msgAddr = GenRegister::retype(GenRegister::offset(msg, 0), GEN_TYPE_UD);
GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type);
- /* Do some calculation within each thread */
- workgroupOpInThread(dst, theVal, threadData, tmp, simd, wg_op, p);
+ /* do some calculation within each thread */
+ wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
p->curr.execWidth = 16;
p->MOV(theVal, dst);
threadData = GenRegister::toUniform(threadData, dst.type);
- /* Store thread count for future use on read/write to SLM */
+ /* store thread count for future use on read/write to SLM */
if (wg_op == ir::WORKGROUP_OP_ANY ||
- wg_op == ir::WORKGROUP_OP_ALL ||
- wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
- wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
- wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
{
- //p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
- threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
- p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
+ threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+ p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
}
else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
- wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
- wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
- wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
- wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
- wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
{
- //p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
- threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
- p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
+ threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+ p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
}
- /* All threads write the partial results to SLM memory */
+ /* all threads write the partial results to SLM memory */
if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
{
GenRegister threadDataL = GenRegister::retype(threadData, GEN_TYPE_D);
@@ -3186,25 +3208,25 @@ namespace gbe
p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
}
- /* Init partialData register, it will hold the final result */
- initValue(p, partialData, wg_op);
+ /* init partialData register, it will hold the final result */
+ wgOpInitValue(p, partialData, wg_op);
- /* Add call to barrier */
+ /* add call to barrier */
p->push();
p->curr.execWidth = 8;
p->curr.physicalFlag = 0;
p->curr.noMask = 1;
- p->AND(msgData, barrierId, GenRegister::immud(0x0f000000));
- p->BARRIER(msgData);
+ p->AND(localBarrier, barrierId, GenRegister::immud(0x0f000000));
+ p->BARRIER(localBarrier);
p->curr.execWidth = 1;
p->WAIT();
p->pop();
- /* Perform a loop, based on thread count (which is now multiple of 4) */
+ /* perform a loop, based on thread count (which is now multiple of 4) */
p->push();{
jip0 = p->n_instruction();
- /* Read in chunks of 4 to optimize SLM reads and reduce SEND messages */
+ /* read in chunks of 4 to optimize SLM reads and reduce SEND messages */
if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
{
p->curr.execWidth = 8;
@@ -3220,8 +3242,8 @@ namespace gbe
msgDataH.hstride = 2;
p->MOV(msgDataL, msgDataH);
- /* Perform operation, partialData will hold result */
- workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+ /* perform operation, partialData will hold result */
+ wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
}
else
{
@@ -3232,11 +3254,11 @@ namespace gbe
p->ADD(msgAddr, msgAddr, msgSlmOff);
p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1);
- /* Perform operation, partialData will hold result */
- workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+ /* perform operation, partialData will hold result */
+ wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
}
- /* While threadN is not 0, cycle read SLM / update value */
+ /* while threadN is not 0, cycle read SLM / update value */
p->curr.noMask = 1;
p->curr.flag = 0;
p->curr.subFlag = 1;
@@ -3248,22 +3270,25 @@ namespace gbe
} p->pop();
if(wg_op == ir::WORKGROUP_OP_ANY ||
- wg_op == ir::WORKGROUP_OP_ALL ||
- wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
- wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
- wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
{
- /* Save result to final register location dst */
- p->curr.execWidth = 16;
- p->MOV(dst, partialData);
+ /* save result to final register location dst */
+ p->curr.execWidth = 16;
+ p->MOV(dst, partialData);
}
- else {
- /* Save result to final register location dst */
- p->curr.execWidth = 16;
+ else
+ {
+ /* save result to final register location dst */
+ p->curr.execWidth = 16;
+
if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
|| wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
p->ADD(dst, dst, partialData);
- else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
{
p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
/* workaround QW datatype on CMP */
@@ -3276,7 +3301,8 @@ namespace gbe
dst.offset(dst, 3, 0), partialData);
}
}
- else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
{
p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
/* workaround QW datatype on CMP */
@@ -3293,11 +3319,11 @@ namespace gbe
/* corner cases for threads 0 */
if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
- wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
- wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
- wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
- wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
- wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
{
p->push();{
p->curr.flag = 0;
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 7a8800b4..07901a6e 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -688,10 +688,12 @@ namespace gbe
/*! double division */
void F64DIV(Reg dst, Reg src0, Reg src1, GenRegister* tmp, int tmpNum);
/*! Work Group Operations */
- void WORKGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, GenRegister data,
- GenRegister threadId, GenRegister threadN,
- GenRegister tmp, GenRegister slmOff, vector<GenRegister> msg,
- uint32_t msgSizeReq);
+ void WORKGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
+ GenRegister tmpData1,
+ GenRegister localThreadID, GenRegister localThreadNUM,
+ GenRegister tmpData2, GenRegister slmOff,
+ vector<GenRegister> msg, uint32_t msgSizeReq,
+ GenRegister localBarrier);
/* common functions for both binary instruction and sel_cmp and compare instruction.
It will handle the IMM or normal register assignment, and will try to avoid LOADI
as much as possible. */
@@ -1960,15 +1962,16 @@ namespace gbe
void Selection::Opaque::WORKGROUP_OP(uint32_t wg_op,
Reg dst,
GenRegister src,
- GenRegister data,
- GenRegister threadId,
- GenRegister threadN,
- GenRegister tmp,
+ GenRegister tmpData1,
+ GenRegister localThreadID,
+ GenRegister localThreadNUM,
+ GenRegister tmpData2,
GenRegister slmOff,
vector<GenRegister> msg,
- uint32_t msgSizeReq = 6)
+ uint32_t msgSizeReq,
+ GenRegister localBarrier)
{
- SelectionInstruction *insn = this->appendInsn(SEL_OP_WORKGROUP_OP, 2 + msg.size(), 5);
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_WORKGROUP_OP, 2 + msg.size(), 6);
SelectionVector *vector = this->appendVector();
/* allocate continuous GRF registers for READ/WRITE to SLM */
@@ -1980,15 +1983,16 @@ namespace gbe
insn->extra.workgroupOp = wg_op;
insn->dst(0) = dst;
- insn->dst(1) = tmp;
+ insn->dst(1) = tmpData1;
for(uint32_t i = 0; i < msg.size(); i++)
insn->dst(2 + i) = msg[i];
- insn->src(0) = threadId;
- insn->src(1) = threadN;
+ insn->src(0) = localThreadID;
+ insn->src(1) = localThreadNUM;
insn->src(2) = src;
- insn->src(3) = data;
+ insn->src(3) = tmpData2;
insn->src(4) = slmOff;
+ insn->src(5) = localBarrier;
}
// Boiler plate to initialize the selection library at c++ pre-main
@@ -6199,64 +6203,77 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
/*! WorkGroup instruction pattern */
DECL_PATTERN(WorkGroupInstruction)
{
- /* SLM bassed communication between threads, most of the logic bellow */
+ /* WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
+ * Shared local memory based communication between threads,
+ * prepare for the workgroup op in gen context.
+ * Algorithm logic is in gen context. */
INLINE bool emitWGReduce(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const
{
using namespace ir;
- const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
+ GBE_ASSERT(insn.getSrcNum() == 3);
+ GBE_ASSERT(insn.getSrc(0) == ocl::threadn);
+ GBE_ASSERT(insn.getSrc(1) == ocl::threadid);
+
+ const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
const Type type = insn.getType();
GenRegister dst = sel.selReg(insn.getDst(0), type);
GenRegister src = sel.selReg(insn.getSrc(2), type);
- GenRegister threadId = sel.selReg(ocl::threadid, ir::TYPE_U32);
- GenRegister threadN = sel.selReg(ocl::threadn, ir::TYPE_U32);
- const uint32_t srcNum = insn.getSrcNum();
-
- GBE_ASSERT(srcNum == 3);
- GBE_ASSERT(insn.getSrc(0) == ir::ocl::threadn);
- GBE_ASSERT(insn.getSrc(1) == ir::ocl::threadid);
- GenRegister tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
- GenRegister data = sel.selReg(sel.reg(FAMILY_QWORD), type);
- GenRegister slmOff = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U32);
-
+ GenRegister tmpData1 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
+ GenRegister tmpData2 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
+ GenRegister slmOff = sel.selReg(sel.reg(FAMILY_QWORD), TYPE_U32);
+ GenRegister localThreadID = sel.selReg(ocl::threadid, TYPE_U32);
+ GenRegister localThreadNUM = sel.selReg(ocl::threadn, TYPE_U32);
+ GenRegister localBarrier = GenRegister::ud8grf(sel.reg(FAMILY_DWORD));
+
+ /* Allocate registers for message sending
+ * (read/write to shared local memory) */
vector<GenRegister> msg;
for(uint32_t i = 0; i < 6; i++)
- msg.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
+ msg.push_back(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32));
- /* insert a barrier to make sure all the var we are interested in
+ /* Insert a barrier to make sure all the var we are interested in
have been assigned the final value. */
- sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+ sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+ sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
- /* compute individual slice of workitems, (e.g. 0->16 workitems) */
+ /* Pass the shared local memory offset */
sel.MOV(slmOff, GenRegister::immud(insn.getSlmAddr()));
- /* barrier for syn prior to workgroup */
- sel.WORKGROUP_OP(workGroupOp, dst, src, data, threadId, threadN, tmp, slmOff, msg);
+ /* Perform workgroup op */
+ sel.WORKGROUP_OP(workGroupOp, dst, src, tmpData1,
+ localThreadID, localThreadNUM, tmpData2, slmOff, msg, 6,
+ localBarrier);
return true;
}
- INLINE bool emitWGBroadcast(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const {
- /* 1. BARRIER Ensure all the threads have set the correct value for the var which will be broadcasted.
- 2. CMP IDs Compare the local IDs with the specified ones in the function call.
- 3. STORE Use flag to control the store of the var. Only the specified item will execute the store.
- 4. BARRIER Ensure the specified value has been stored.
- 5. LOAD Load the stored value to all the dst value, the dst of all the items will have same value,
- so broadcasted. */
+ /* WORKGROUP OP: BROADCAST
+ * 1. BARRIER Ensure all the threads have set the correct value for the var which will be broadcasted.
+ 2. CMP IDs Compare the local IDs with the specified ones in the function call.
+ 3. STORE Use flag to control the store of the var. Only the specified item will execute the store.
+ 4. BARRIER Ensure the specified value has been stored.
+ 5. LOAD Load the stored value to all the dst value, the dst of all the items will have same value,
+ so broadcasted. */
+ INLINE bool emitWGBroadcast(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const
+ {
using namespace ir;
+
+ const uint32_t srcNum = insn.getSrcNum();
+ GBE_ASSERT(srcNum >= 2);
+
const Type type = insn.getType();
const GenRegister src = sel.selReg(insn.getSrc(0), type);
const GenRegister dst = sel.selReg(insn.getDst(0), type);
- const uint32_t srcNum = insn.getSrcNum();
const uint32_t slmAddr = insn.getSlmAddr();
- GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+ GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
vector<GenRegister> fakeTemps;
+ fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
+ fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
- GBE_ASSERT(srcNum >= 2);
GenRegister coords[3];
- for (uint32_t i = 1; i < srcNum; i++) {
- coords[i - 1] = sel.selReg(insn.getSrc(i), TYPE_U32);
- }
+ for (uint32_t i = 1; i < srcNum; i++)
+ coords[i - 1] = GenRegister::toUniform(sel.selReg(insn.getSrc(i), TYPE_U32), GEN_TYPE_UD);
sel.push(); {
sel.curr.predicate = GEN_PREDICATE_NONE;
@@ -6266,7 +6283,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
/* insert a barrier to make sure all the var we are interested in
have been assigned the final value. */
- sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+ sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+ sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
sel.push(); {
sel.curr.flag = 0;
@@ -6275,33 +6293,90 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
sel.curr.noMask = 1;
GenRegister lid0, lid1, lid2;
uint32_t dim = srcNum - 1;
- lid0 = sel.selReg(ir::ocl::lid0);
- lid1 = sel.selReg(ir::ocl::lid1);
- lid2 = sel.selReg(ir::ocl::lid2);
+ lid0 = GenRegister::retype(sel.selReg(ocl::lid0, TYPE_U32), GEN_TYPE_UD);
+ lid1 = GenRegister::retype(sel.selReg(ocl::lid1, TYPE_U32), GEN_TYPE_UD);
+ lid2 = GenRegister::retype(sel.selReg(ocl::lid2, TYPE_U32), GEN_TYPE_UD);
- sel.CMP(GEN_CONDITIONAL_EQ, coords[0], lid0, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ sel.CMP(GEN_CONDITIONAL_EQ, coords[0], lid0,
+ GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
sel.curr.predicate = GEN_PREDICATE_NORMAL;
if (dim >= 2)
- sel.CMP(GEN_CONDITIONAL_EQ, coords[1], lid1, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ sel.CMP(GEN_CONDITIONAL_EQ, coords[1], lid1,
+ GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
if (dim >= 3)
- sel.CMP(GEN_CONDITIONAL_EQ, coords[2], lid2, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ sel.CMP(GEN_CONDITIONAL_EQ, coords[2], lid2,
+ GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
- if (typeSize(src.type) == 4) {
- GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_F);
- GenRegister _src = GenRegister::retype(src, GEN_TYPE_F);
+ /* write to shared local memory for BYTE/WORD/DWORD types */
+ if (typeSize(src.type) <= 4) {
+ GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
+ GenRegister _src = GenRegister::retype(src, GEN_TYPE_UD);
sel.UNTYPED_WRITE(_addr, &_src, 1, GenRegister::immw(0xfe), fakeTemps);
}
+ /* write to shared local memory for QWORD types */
+ else if (typeSize(src.type) == 8) {
+ sel.push(); {
+ /* arrange data in QWORD */
+ GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
+ GenRegister srcQW = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+ GenRegister srcQW_p1 = src.retype(srcQW, GEN_TYPE_UD);
+ GenRegister srcQW_p2 = src.retype(src.offset(srcQW, 2, 0), GEN_TYPE_UD);
+ vector<GenRegister> srcVec;
+ srcVec.push_back(srcQW_p1);
+ srcVec.push_back(srcQW_p2);
+
+ /* unpack into 2 DWORD */
+ sel.UNPACK_LONG(srcQW, src);
+
+ /* emit write through SEND */
+ sel.UNTYPED_WRITE(_addr, srcVec.data(), 2,
+ GenRegister::immw(0xfe), fakeTemps);
+ }sel.pop();
+ }
+ else
+ GBE_ASSERT(0);
+
} sel.pop();
- /* Make sure the slm var have the valid value now */
- sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+ /* make sure the slm var has a valid value now */
+ sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+ sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+
+ /* read from shared local memory for BYTE/WORD/DWORD types */
+ if (typeSize(src.type) <= 4) {
+ GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
+ GenRegister _dst = GenRegister::retype(dst, GEN_TYPE_UD);
+ sel.UNTYPED_READ(_addr, &_dst, 1, GenRegister::immw(0xfe), fakeTemps);
+ }
+ /* read from shared local memory for QWORD types */
+ else if (typeSize(src.type) == 8) {
+ GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
+ vector<GenRegister> _dst;
+ _dst.push_back(sel.selReg(sel.reg(FAMILY_WORD), ir::TYPE_U32));
+ _dst.push_back(sel.selReg(sel.reg(FAMILY_WORD), ir::TYPE_U32));
+ GenRegister _dstQ = dst.toUniform(_dst[0], GEN_TYPE_UL);
+
+ sel.push(); {
+ /* emit read through SEND */
+ sel.curr.execWidth = 8;
+ sel.UNTYPED_READ(_addr, _dst.data(), 2, GenRegister::immw(0xfe), fakeTemps);
+
+ /* reconstruct QWORD type */
+ _dst[0] = dst.toUniform(dst.offset(_dst[0], 0, 4), GEN_TYPE_UD);
+ _dst[1] = dst.toUniform(_dst[1], GEN_TYPE_UD);
+ sel.curr.execWidth = 1;
+ sel.MOV(_dst[0], _dst[1]);
+ } sel.pop();
- if (typeSize(src.type) == 4) {
- sel.UNTYPED_READ(addr, &dst, 1, GenRegister::immw(0xfe), fakeTemps);
+ /* set all elements assigned to thread */
+ sel.MOV(dst, _dstQ);
}
+ else
+ GBE_ASSERT(0);
return true;
}
+
INLINE bool emitOne(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn, bool &markChildren) const
{
using namespace ir;