From b714f3cf56cde049c9687984d1a05c833d80a70e Mon Sep 17 00:00:00 2001
From: Grigore Lupescu <grigore.lupescu at intel.com>
Date: Mon, 11 Apr 2016 17:37:02 +0300
Subject: Backend: Full support for workgroup broadcast

Set masked elements to the proper init value.
V2: Fix barrier call in gen context
V3: Fix workgroup float max init value
Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
Reviewed-by: Pan Xiuli <xiuli.pan@intel.com>
---
 backend/src/backend/gen_context.cpp        | 242 ++++++++++++++++-------------
 backend/src/backend/gen_insn_selection.cpp | 195 ++++++++++++++++-------
 2 files changed, 269 insertions(+), 168 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 7073b807..1663b6f3 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2846,52 +2846,15 @@ namespace gbe
     } p->pop();
   }
 
-  static void workgroupOp(GenRegister dst,
-                         GenRegister src1,
-                         GenRegister src2,
-                         uint32_t wg_op,
-                         GenEncoder *p)
-  {
-    // REDUCE
-    if (wg_op == ir::WORKGROUP_OP_ANY)
-      p->OR(dst, src1, src2);
-    else if (wg_op == ir::WORKGROUP_OP_ALL)
-      p->AND(dst, src1, src2);
-    else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
-      p->ADD(dst, src1, src2);
-    else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
-      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
-    else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
-      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
-
-    // INCLUSIVE
-    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
-      p->ADD(dst, src1, src2);
-    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
-      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
-    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
-      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
-
-    // EXCLUSIVE
-    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
-      p->ADD(dst, src1, src2);
-    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
-      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
-    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
-      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
-
-    else
-      GBE_ASSERT(0);
-  }
-
-  static void initValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
+  /* Init value according to WORKGROUP OP
+   * Emit assert if invalid combination operation - datatype */
+  static void wgOpInitValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
   {
 
     if (wg_op == ir::WORKGROUP_OP_ALL)
     {
       if (dataReg.type == GEN_TYPE_D
-          || dataReg.type == GEN_TYPE_UD
-          || dataReg.type == GEN_TYPE_F)
+          || dataReg.type == GEN_TYPE_UD)
         p->MOV(dataReg, GenRegister::immd(0xFFFFFFFF));
       else if(dataReg.type == GEN_TYPE_L ||
           dataReg.type == GEN_TYPE_UL)
@@ -2946,7 +2909,7 @@ namespace gbe
       else if (dataReg.type == GEN_TYPE_UD)
         p->MOV(dataReg, GenRegister::immud(0x0));
       else if (dataReg.type == GEN_TYPE_F)
-        p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0x7F800000));
+        p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0xFF800000));
       else if (dataReg.type == GEN_TYPE_L)
         p->MOV(dataReg, GenRegister::immint64(0x8000000000000000L));
       else if (dataReg.type == GEN_TYPE_UL)
@@ -2960,8 +2923,53 @@ namespace gbe
       GBE_ASSERT(0);
   }
 
-  static void workgroupOpInThread(GenRegister threadDst, GenRegister inputVal, GenRegister threadExchangeData,
-                                   GenRegister resultVal, uint32_t simd, uint32_t wg_op, GenEncoder *p) {
+  /* Emit the one instruction computing dst = src1 <wg_op> src2; asserts on an unknown wg_op */
+  static void wgOpPerform(GenRegister dst,
+                         GenRegister src1,
+                         GenRegister src2,
+                         uint32_t wg_op,
+                         GenEncoder *p)
+  {
+    /* ANY -> OR, ALL -> AND, REDUCE ADD -> ADD, REDUCE MIN/MAX -> SEL_CMP with LE/GE */
+    if (wg_op == ir::WORKGROUP_OP_ANY)
+      p->OR(dst, src1, src2);
+    else if (wg_op == ir::WORKGROUP_OP_ALL)
+      p->AND(dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
+      p->ADD(dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
+      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+    /* SCAN INCLUSIVE ADD/MIN/MAX: same instructions as the matching REDUCE ops */
+    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
+      p->ADD(dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
+      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+    /* SCAN EXCLUSIVE ADD/MIN/MAX: same instructions as the matching REDUCE ops */
+    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+      p->ADD(dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+    else
+      GBE_ASSERT(0);
+  }
+
+  static void wgOpPerformThread(GenRegister threadDst,
+                                  GenRegister inputVal,
+                                  GenRegister threadExchangeData,
+                                   GenRegister resultVal,
+                                   uint32_t simd,
+                                   uint32_t wg_op,
+                                   GenEncoder *p)
+  {
    p->push();
    p->curr.predicate = GEN_PREDICATE_NONE;
    p->curr.noMask = 1;
@@ -2981,7 +2989,7 @@ namespace gbe
    /* init thread data to min/max/null values */
    p->push(); {
      p->curr.execWidth = simd;
-     initValue(p, threadExchangeData, wg_op);
+     wgOpInitValue(p, threadExchangeData, wg_op);
      p->MOV(resultVal, inputVal);
    } p->pop();
 
@@ -3046,17 +3054,17 @@ namespace gbe
          wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
          wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
          wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
-       workgroupOp(result[0], result[0], input[i], wg_op, p);
+       wgOpPerform(result[0], result[0], input[i], wg_op, p);
 
      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
          wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
          wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
-       workgroupOp(result[i], result[i - 1], input[i], wg_op, p);
+       wgOpPerform(result[i], result[i - 1], input[i], wg_op, p);
 
      else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
          wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
          wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
-       workgroupOp(result[i], result[i - 1], input[i - 1], wg_op, p);
+       wgOpPerform(result[i], result[i - 1], input[i - 1], wg_op, p);
 
      else
        GBE_ASSERT(0);
@@ -3090,11 +3098,11 @@ namespace gbe
    {
      p->curr.execWidth = 1;
      /* set result[0] to min/max/null */
-     initValue(p, result[0], wg_op);
+     wgOpInitValue(p, result[0], wg_op);
 
      p->curr.execWidth = 16;
      /* value exchanged with other threads */
-     workgroupOp(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
+     wgOpPerform(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
      /* partial result thread */
      p->MOV(threadDst, resultVal);
    }
@@ -3103,68 +3111,82 @@ namespace gbe
  }
 
 /**
- * Basic idea:
- * 1. All the threads firstly calculate the max/min/add value for the
+ * WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
+ *
+ * Implementation:
+ * 1. All the threads first compute the workgroup op value for the
  * allocated work-items. SIMD16=> 16 work-items allocated for each thread
- * 2. Each thread will write the computed reduce OP result in SLM memory
- * based on the threadId
- * 3. After a memory fence, each thread will read in chunks of 4 elements,
- * the SLM region, using a loop based on the thread count value (threadN)
- * 4. At the end each thread has the final value computed individually
+ * 2. Each thread writes the partial result in shared local memory using threadId
+ * 3. After a barrier, each thread will read in chunks of 1-4 elements,
+ * the shared local memory region, using a loop based on the thread num value (threadN)
+ * 4. Each thread computes the final value individually
+ *
+ * Optimizations:
+ * Performance is given by chunk read. If threads read in chunks of 4 elements
+ * the performance increases 2-3x compared to chunks of 1 element.
  */
   void GenContext::emitWorkGroupOpInstruction(const SelectionInstruction &insn){
     const GenRegister dst = ra->genReg(insn.dst(0));
-    const GenRegister tmp = ra->genReg(insn.dst(1));
-    const GenRegister theVal = ra->genReg(insn.src(2));
+    const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1)), dst.type);
+    const GenRegister theVal = GenRegister::retype(ra->genReg(insn.src(2)), dst.type);
     GenRegister threadData = ra->genReg(insn.src(3));
     GenRegister partialData = GenRegister::toUniform(threadData, dst.type);
     GenRegister threadId = ra->genReg(insn.src(0));
     GenRegister threadLoop = ra->genReg(insn.src(1));
     GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+    GenRegister localBarrier = ra->genReg(insn.src(5));
 
     uint32_t wg_op = insn.extra.workgroupOp;
     uint32_t simd = p->curr.execWidth;
     int32_t jip0, jip1;
 
+    /* masked elements should be properly set to init value */
+    p->push(); {
+      p->curr.noMask = 1;
+      wgOpInitValue(p, tmp, wg_op);
+      p->curr.noMask = 0;
+      p->MOV(tmp, theVal);
+      p->curr.noMask = 1;
+      p->MOV(theVal, tmp);
+    } p->pop();
+
     threadId = GenRegister::toUniform(threadId, GEN_TYPE_UD);
 
-    /* Use of continuous GRF allocation from insn selection */
+    /* use of continuous GRF allocation from insn selection */
     GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type);
     GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
     GenRegister msgAddr = GenRegister::retype(GenRegister::offset(msg, 0), GEN_TYPE_UD);
     GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type);
 
-    /* Do some calculation within each thread */
-    workgroupOpInThread(dst, theVal, threadData, tmp, simd, wg_op, p);
+    /* do some calculation within each thread */
+    wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
 
     p->curr.execWidth = 16;
     p->MOV(theVal, dst);
     threadData = GenRegister::toUniform(threadData, dst.type);
 
-    /* Store thread count for future use on read/write to SLM */
+    /* store thread count for future use on read/write to SLM */
     if (wg_op == ir::WORKGROUP_OP_ANY ||
-        wg_op == ir::WORKGROUP_OP_ALL ||
-        wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
-        wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
-        wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+      wg_op == ir::WORKGROUP_OP_ALL ||
+      wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+      wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+      wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
     {
-        //p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
-        threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
-        p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
+      threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+      p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
     }
     else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
-        wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
-        wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
-        wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
-        wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
-        wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+      wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+      wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+      wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+      wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+      wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
     {
-        //p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
-        threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
-        p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
+      threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+      p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
     }
 
-    /* All threads write the partial results to SLM memory */
+    /* all threads write the partial results to SLM memory */
     if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
     {
       GenRegister threadDataL = GenRegister::retype(threadData, GEN_TYPE_D);
@@ -3186,25 +3208,25 @@ namespace gbe
       p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
     }
 
-    /* Init partialData register, it will hold the final result */
-    initValue(p, partialData, wg_op);
+    /* init partialData register, it will hold the final result */
+    wgOpInitValue(p, partialData, wg_op);
 
-    /* Add call to barrier */
+    /* add call to barrier */
     p->push();
       p->curr.execWidth = 8;
       p->curr.physicalFlag = 0;
       p->curr.noMask = 1;
-      p->AND(msgData, barrierId, GenRegister::immud(0x0f000000));
-      p->BARRIER(msgData);
+      p->AND(localBarrier, barrierId, GenRegister::immud(0x0f000000));
+      p->BARRIER(localBarrier);
       p->curr.execWidth = 1;
       p->WAIT();
     p->pop();
 
-    /* Perform a loop, based on thread count (which is now multiple of 4) */
+    /* perform a loop, based on thread count (which is now multiple of 4) */
     p->push();{
       jip0 = p->n_instruction();
 
-      /* Read in chunks of 4 to optimize SLM reads and reduce SEND messages */
+      /* read in chunks of 4 to optimize SLM reads and reduce SEND messages */
       if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
       {
         p->curr.execWidth = 8;
@@ -3220,8 +3242,8 @@ namespace gbe
         msgDataH.hstride = 2;
         p->MOV(msgDataL, msgDataH);
 
-        /* Perform operation, partialData will hold result */
-        workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+        /* perform operation, partialData will hold result */
+        wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
       }
       else
       {
@@ -3232,11 +3254,11 @@ namespace gbe
         p->ADD(msgAddr, msgAddr, msgSlmOff);
         p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1);
 
-        /* Perform operation, partialData will hold result */
-        workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+        /* perform operation, partialData will hold result */
+        wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
       }
 
-      /* While threadN is not 0, cycle read SLM / update value */
+      /* while threadN is not 0, cycle read SLM / update value */
       p->curr.noMask = 1;
       p->curr.flag = 0;
       p->curr.subFlag = 1;
@@ -3248,22 +3270,25 @@ namespace gbe
     } p->pop();
 
     if(wg_op == ir::WORKGROUP_OP_ANY ||
-            wg_op == ir::WORKGROUP_OP_ALL ||
-            wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
-            wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
-            wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+      wg_op == ir::WORKGROUP_OP_ALL ||
+      wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+      wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+      wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
     {
-        /* Save result to final register location dst */
-        p->curr.execWidth = 16;
-        p->MOV(dst, partialData);
+      /* save result to final register location dst */
+      p->curr.execWidth = 16;
+      p->MOV(dst, partialData);
     }
-    else {
-        /* Save result to final register location dst */
-        p->curr.execWidth = 16;
+    else
+    {
+      /* save result to final register location dst */
+      p->curr.execWidth = 16;
+
       if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
           || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
         p->ADD(dst, dst, partialData);
-      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
+        || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
       {
         p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
         /* workaround QW datatype on CMP */
@@ -3276,7 +3301,8 @@ namespace gbe
                        dst.offset(dst, 3, 0), partialData);
         }
       }
-      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
+        || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
       {
         p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
         /* workaround QW datatype on CMP */
@@ -3293,11 +3319,11 @@ namespace gbe
 
     /* corner cases for threads 0 */
     if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
-            wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
-            wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
-            wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
-            wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
-            wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+      wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+      wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+      wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+      wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+      wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
     {
       p->push();{
         p->curr.flag = 0;
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 7a8800b4..07901a6e 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -688,10 +688,12 @@ namespace gbe
     /*! double division */
     void F64DIV(Reg dst, Reg src0, Reg src1, GenRegister* tmp, int tmpNum);
     /*! Work Group Operations */
-    void WORKGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, GenRegister data,
-                      GenRegister threadId, GenRegister threadN,
-                      GenRegister tmp, GenRegister slmOff, vector<GenRegister> msg,
-                      uint32_t msgSizeReq);
+    void WORKGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
+                      GenRegister tmpData1,
+                      GenRegister localThreadID, GenRegister localThreadNUM,
+                      GenRegister tmpData2, GenRegister slmOff,
+                      vector<GenRegister> msg, uint32_t msgSizeReq,
+                      GenRegister localBarrier);
     /* common functions for both binary instruction and sel_cmp and compare instruction.
        It will handle the IMM or normal register assignment, and will try to avoid LOADI
        as much as possible. */
@@ -1960,15 +1962,16 @@ namespace gbe
   void Selection::Opaque::WORKGROUP_OP(uint32_t wg_op,
                                        Reg dst,
                                        GenRegister src,
-                                       GenRegister data,
-                                       GenRegister threadId,
-                                       GenRegister threadN,
-                                       GenRegister tmp,
+                                       GenRegister tmpData1,
+                                       GenRegister localThreadID,
+                                       GenRegister localThreadNUM,
+                                       GenRegister tmpData2,
                                        GenRegister slmOff,
                                        vector<GenRegister> msg,
-                                       uint32_t msgSizeReq = 6)
+                                       uint32_t msgSizeReq,
+                                       GenRegister localBarrier)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_WORKGROUP_OP, 2 + msg.size(), 5);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_WORKGROUP_OP, 2 + msg.size(), 6);
     SelectionVector *vector = this->appendVector();
 
     /* allocate continuous GRF registers for READ/WRITE to SLM */
@@ -1980,15 +1983,16 @@ namespace gbe
     insn->extra.workgroupOp = wg_op;
 
     insn->dst(0) = dst;
-    insn->dst(1) = tmp;
+    insn->dst(1) = tmpData1;
     for(uint32_t i = 0; i < msg.size(); i++)
       insn->dst(2 + i) = msg[i];
 
-    insn->src(0) = threadId;
-    insn->src(1) = threadN;
+    insn->src(0) = localThreadID;
+    insn->src(1) = localThreadNUM;
     insn->src(2) = src;
-    insn->src(3) = data;
+    insn->src(3) = tmpData2;
     insn->src(4) = slmOff;
+    insn->src(5) = localBarrier;
   }
 
   // Boiler plate to initialize the selection library at c++ pre-main
@@ -6199,64 +6203,77 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
   /*! WorkGroup instruction pattern */
   DECL_PATTERN(WorkGroupInstruction)
   {
-    /* SLM bassed communication between threads, most of the logic bellow */
+    /* WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
+     * Shared local memory based communication between threads,
+     * prepare for the workgroup op in gen context
+     * Algorithm logic is in gen context. */
     INLINE bool emitWGReduce(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const
     {
       using namespace ir;
-      const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
 
+      GBE_ASSERT(insn.getSrcNum() == 3);
+      GBE_ASSERT(insn.getSrc(0) == ocl::threadn);
+      GBE_ASSERT(insn.getSrc(1) == ocl::threadid);
+
+      const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
       const Type type = insn.getType();
       GenRegister dst = sel.selReg(insn.getDst(0), type);
       GenRegister src = sel.selReg(insn.getSrc(2), type);
-      GenRegister threadId = sel.selReg(ocl::threadid, ir::TYPE_U32);
-      GenRegister threadN = sel.selReg(ocl::threadn, ir::TYPE_U32);
-      const uint32_t srcNum = insn.getSrcNum();
-
-      GBE_ASSERT(srcNum == 3);
-      GBE_ASSERT(insn.getSrc(0) == ir::ocl::threadn);
-      GBE_ASSERT(insn.getSrc(1) == ir::ocl::threadid);
-      GenRegister tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
-      GenRegister data = sel.selReg(sel.reg(FAMILY_QWORD), type);
-      GenRegister slmOff = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U32);
-
+      GenRegister tmpData1 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
+      GenRegister tmpData2 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
+      GenRegister slmOff = sel.selReg(sel.reg(FAMILY_QWORD), TYPE_U32);
+      GenRegister localThreadID = sel.selReg(ocl::threadid, TYPE_U32);
+      GenRegister localThreadNUM = sel.selReg(ocl::threadn, TYPE_U32);
+      GenRegister localBarrier = GenRegister::ud8grf(sel.reg(FAMILY_DWORD));
+
+      /* Allocate registers for message sending
+       * (read/write to shared local memory) */
       vector<GenRegister> msg;
       for(uint32_t i = 0; i < 6; i++)
-        msg.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
+        msg.push_back(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32));
 
-      /* insert a barrier to make sure all the var we are interested in
+      /* Insert a barrier to make sure all the var we are interested in
          have been assigned the final value. */
-      sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+      sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+                  sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
 
-      /* compute individual slice of workitems, (e.g. 0->16 workitems) */
+      /* Pass the shared local memory offset  */
       sel.MOV(slmOff, GenRegister::immud(insn.getSlmAddr()));
 
-      /* barrier for syn prior to workgroup */
-      sel.WORKGROUP_OP(workGroupOp, dst, src, data, threadId, threadN, tmp, slmOff, msg);
+      /* Perform workgroup op */
+      sel.WORKGROUP_OP(workGroupOp, dst, src, tmpData1,
+                       localThreadID, localThreadNUM, tmpData2, slmOff, msg, 6,
+                       localBarrier);
 
       return true;
     }
 
-    INLINE bool emitWGBroadcast(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const {
-      /*  1. BARRIER    Ensure all the threads have set the correct value for the var which will be broadcasted.
-          2. CMP IDs    Compare the local IDs with the specified ones in the function call.
-          3. STORE	   Use flag to control the store of the var. Only the specified item will execute the store.
-          4. BARRIER    Ensure the specified value has been stored.
-          5. LOAD	   Load the stored value to all the dst value, the dst of all the items will have same value,
-          so broadcasted.	*/
+    /* WORKGROUP OP: BROADCAST
+     * 1. BARRIER    Ensure all the threads have set the correct value for the var which will be broadcasted.
+       2. CMP IDs    Compare the local IDs with the specified ones in the function call.
+       3. STORE      Use flag to control the store of the var. Only the specified item will execute the store.
+       4. BARRIER    Ensure the specified value has been stored.
+       5. LOAD       Load the stored value into dst for all items; every item then holds the same value,
+       i.e. the value has been broadcast. */
+    INLINE bool emitWGBroadcast(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const
+    {
       using namespace ir;
+
+      const uint32_t srcNum = insn.getSrcNum();
+      GBE_ASSERT(srcNum >= 2);
+
       const Type type = insn.getType();
       const GenRegister src = sel.selReg(insn.getSrc(0), type);
       const GenRegister dst = sel.selReg(insn.getDst(0), type);
-      const uint32_t srcNum = insn.getSrcNum();
       const uint32_t slmAddr = insn.getSlmAddr();
-      GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+      GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
       vector<GenRegister> fakeTemps;
+      fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
+      fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
 
-      GBE_ASSERT(srcNum >= 2);
       GenRegister coords[3];
-      for (uint32_t i = 1; i < srcNum; i++) {
-        coords[i - 1] = sel.selReg(insn.getSrc(i), TYPE_U32);
-      }
+      for (uint32_t i = 1; i < srcNum; i++)
+        coords[i - 1] = GenRegister::toUniform(sel.selReg(insn.getSrc(i), TYPE_U32), GEN_TYPE_UD);
 
       sel.push(); {
         sel.curr.predicate = GEN_PREDICATE_NONE;
@@ -6266,7 +6283,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
 
       /* insert a barrier to make sure all the var we are interested in
          have been assigned the final value. */
-      sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+      sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+                  sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
 
       sel.push(); {
         sel.curr.flag = 0;
@@ -6275,33 +6293,90 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
         sel.curr.noMask = 1;
         GenRegister lid0, lid1, lid2;
         uint32_t dim = srcNum - 1;
-        lid0 = sel.selReg(ir::ocl::lid0);
-        lid1 = sel.selReg(ir::ocl::lid1);
-        lid2 = sel.selReg(ir::ocl::lid2);
+        lid0 = GenRegister::retype(sel.selReg(ocl::lid0, TYPE_U32), GEN_TYPE_UD);
+        lid1 = GenRegister::retype(sel.selReg(ocl::lid1, TYPE_U32), GEN_TYPE_UD);
+        lid2 = GenRegister::retype(sel.selReg(ocl::lid2, TYPE_U32), GEN_TYPE_UD);
 
-        sel.CMP(GEN_CONDITIONAL_EQ, coords[0], lid0, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+        sel.CMP(GEN_CONDITIONAL_EQ, coords[0], lid0,
+                GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
         sel.curr.predicate = GEN_PREDICATE_NORMAL;
         if (dim >= 2)
-          sel.CMP(GEN_CONDITIONAL_EQ, coords[1], lid1, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+          sel.CMP(GEN_CONDITIONAL_EQ, coords[1], lid1,
+                  GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
         if (dim >= 3)
-          sel.CMP(GEN_CONDITIONAL_EQ, coords[2], lid2, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+          sel.CMP(GEN_CONDITIONAL_EQ, coords[2], lid2,
+                  GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
 
-        if (typeSize(src.type) == 4) {
-          GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_F);
-          GenRegister _src = GenRegister::retype(src, GEN_TYPE_F);
+        /* write to shared local memory for BYTE/WORD/DWORD types */
+        if (typeSize(src.type) <= 4) {
+          GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
+          GenRegister _src = GenRegister::retype(src, GEN_TYPE_UD);
           sel.UNTYPED_WRITE(_addr, &_src, 1, GenRegister::immw(0xfe), fakeTemps);
         }
+        /* write to shared local memory for QWORD types */
+        else if (typeSize(src.type) == 8) {
+          sel.push(); {
+          /* arrange data in QWORD */
+          GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
+          GenRegister srcQW = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+          GenRegister srcQW_p1 = src.retype(srcQW, GEN_TYPE_UD);
+          GenRegister srcQW_p2 = src.retype(src.offset(srcQW, 2, 0), GEN_TYPE_UD);
+          vector<GenRegister> srcVec;
+          srcVec.push_back(srcQW_p1);
+          srcVec.push_back(srcQW_p2);
+
+          /* unpack into 2 DWORD */
+          sel.UNPACK_LONG(srcQW, src);
+
+          /* emit write through SEND */
+          sel.UNTYPED_WRITE(_addr, srcVec.data(), 2,
+                            GenRegister::immw(0xfe), fakeTemps);
+          }sel.pop();
+        }
+        else
+          GBE_ASSERT(0);
+
       } sel.pop();
-      /* Make sure the slm var have the valid value now */
-      sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+      /* make sure the slm var have the valid value now */
+      sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+                  sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+
+      /* read from shared local memory for BYTE/WORD/DWORD types */
+      if (typeSize(src.type) <= 4) {
+        GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
+        GenRegister _dst = GenRegister::retype(dst, GEN_TYPE_UD);
+        sel.UNTYPED_READ(_addr, &_dst, 1, GenRegister::immw(0xfe), fakeTemps);
+      }
+      /* read from shared local memory for QWORD types */
+      else if (typeSize(src.type) == 8) {
+        GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
+        vector<GenRegister> _dst;
+        _dst.push_back(sel.selReg(sel.reg(FAMILY_WORD), ir::TYPE_U32));
+        _dst.push_back(sel.selReg(sel.reg(FAMILY_WORD), ir::TYPE_U32));
+        GenRegister _dstQ = dst.toUniform(_dst[0], GEN_TYPE_UL);
+
+        sel.push(); {
+        /* emit read through SEND */
+        sel.curr.execWidth = 8;
+        sel.UNTYPED_READ(_addr, _dst.data(), 2, GenRegister::immw(0xfe), fakeTemps);
+
+        /* reconstruct QWORD type */
+        _dst[0] = dst.toUniform(dst.offset(_dst[0], 0, 4), GEN_TYPE_UD);
+        _dst[1] = dst.toUniform(_dst[1], GEN_TYPE_UD);
+        sel.curr.execWidth = 1;
+        sel.MOV(_dst[0], _dst[1]);
+        } sel.pop();
 
-      if (typeSize(src.type) == 4) {
-        sel.UNTYPED_READ(addr, &dst, 1, GenRegister::immw(0xfe), fakeTemps);
+        /* set all elements assigned to thread */
+        sel.MOV(dst, _dstQ);
       }
+      else
+        GBE_ASSERT(0);
 
       return true;
     }
 
+
     INLINE bool emitOne(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn, bool &markChildren) const
     {
       using namespace ir;
-- 
cgit v1.2.3