summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Pan Xiuli <xiuli.pan@intel.com> 2016-12-19 15:57:39 +0800
committer: Yang Rong <rong.r.yang@intel.com> 2016-12-26 13:01:12 +0800
commit: a99eca7249b3372a5a73388ec0408f0f0c586f19 (patch)
tree: 062ec95e57f7ebb24f92790b44bb269b264d343a
parent: a5e24e2fac09974138c715f685d5322b1f81758d (diff)
Backend: Refine block read/write instruction selection
Move the block pack/unpack code into instruction selection so that it can be optimized. Also rename some variables to avoid misleading names, and add some new helper functions to the GenEncoder class. V2: Use ud8grf instead of f8grf to save a retype. V3: Merge the variable-renaming patch and fix some comments. V4: Fix some SIMD8-related bugs and comment typos. Signed-off-by: Pan Xiuli <xiuli.pan@intel.com> Reviewed-by: Guo, Yejun <yejun.guo@intel.com>
-rw-r--r-- backend/src/backend/gen8_encoder.cpp | 40
-rw-r--r-- backend/src/backend/gen_context.cpp | 459
-rw-r--r-- backend/src/backend/gen_encoder.cpp | 105
-rw-r--r-- backend/src/backend/gen_encoder.hpp | 18
-rw-r--r-- backend/src/backend/gen_insn_selection.cpp | 448
5 files changed, 440 insertions, 630 deletions
diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
index 277acdab..b274a514 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -840,20 +840,15 @@ namespace gbe
gen8_insn->bits3.gen8_block_rw_a64.header_present = 1;
}
- void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) {
- GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
const uint32_t msg_length = 1;
- uint32_t rsize = size / 2;
- uint32_t msgsize = size;
- // When size is 1 OWord, which means half a reg, we need to know which half to use
- if (size == 1) {
- if (dst.subnr == 0)
- msgsize = 0;
- else
- msgsize = 1;
- }
- rsize = rsize == 0 ? 1 : rsize;
- const uint32_t response_length = rsize; // Size is in regs
+ uint32_t sizeinreg = ow_size / 2;
+ // half reg should also have size 1
+ sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+ const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0);
+ const uint32_t response_length = sizeinreg; // Size is in reg
+
this->setHeader(insn);
this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
@@ -861,21 +856,22 @@ namespace gbe
setOBlockRWA64(this,
insn,
bti,
- msgsize,
+ block_size,
GEN8_P1_BLOCK_READ_A64,
msg_length,
response_length);
}
- void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t size) {
- GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
- uint32_t rsize = size / 2;
- rsize = rsize == 0 ? 1 : rsize;
- const uint32_t msg_length = 1 + rsize; // Size is in owords
+ void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t ow_size) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ uint32_t sizeinreg = ow_size / 2;
+ // half reg should also have size 1
+ sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+ const uint32_t msg_length = 1 + sizeinreg; // Size is in reg and header
const uint32_t response_length = 0;
- uint32_t msgsize = size;
- msgsize = msgsize == 1 ? 0 : msgsize;
+ const uint32_t block_size = getOBlockSize(ow_size);
+
this->setHeader(insn);
this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
this->setSrc1(insn, GenRegister::immud(0));
@@ -883,7 +879,7 @@ namespace gbe
setOBlockRWA64(this,
insn,
bti,
- msgsize,
+ block_size,
GEN8_P1_BLOCK_WRITE_A64,
msg_length,
response_length);
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index c8019e39..10e2c9e9 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3556,458 +3556,39 @@ namespace gbe
}
void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) {
- const GenRegister dst= ra->genReg(insn.dst(1));
- const GenRegister addrreg = ra->genReg(insn.src(0));
- uint32_t type = dst.type;
- uint32_t typesize = typeSize(type);
- const uint32_t vec_size = insn.extra.elem;
- const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), type);
- const uint32_t simdWidth = p->curr.execWidth;
- const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
- const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
- GenRegister headeraddr;
- bool isA64 = insn.getbti() == 255;
+ const GenRegister header = ra->genReg(insn.src(0));
+ const GenRegister tmp = ra->genReg(insn.dst(0));
+ const uint32_t bti = insn.getbti();
+ const uint32_t ow_size = insn.extra.elem;
+ bool isA64 = bti == 255;
if (isA64)
- headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL);
+ p->OBREADA64(tmp, header, bti, ow_size);
else
- headeraddr = GenRegister::offset(header, 0, 2*4);
-
- // Make header
- p->push();
- {
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0, 0));
-
- // Update the header with the current address
- p->curr.execWidth = 1;
- p->MOV(headeraddr, addr);
-
- // Put zero in the general state base address
- if (!isA64)
- p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
-
- }
- p->pop();
- // Now read the data, oword block read can only work with simd16 and no mask
- if (vec_size == 1) {
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64) {
- //p->curr.execWidth = 8;
- p->OBREADA64(dst, header, insn.getbti(), simdWidth * typesize / 16);
- }
- else
- p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16);
- }
- p->pop();
- } else if (vec_size == 2) {
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64)
- p->OBREADA64(tmp, header, insn.getbti(), simdWidth * typesize / 8);
- else
- p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8);
- }
- p->pop();
- p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
- p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0, simdWidth * typesize ));
- } else if (vec_size == 4) {
- if (simdWidth == 8) {
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64)
- p->OBREADA64(tmp, header, insn.getbti(), 2 * typesize);
- else
- p->OBREAD(tmp, header, insn.getbti(), 2 * typesize);
- }
- p->pop();
- for (uint32_t j = 0; j < 4; j++)
- p->MOV(ra->genReg(insn.dst(1 + j)), GenRegister::offset(tmp, 0, j * simdWidth * typesize ));
- } else {
- for (uint32_t i = 0; i < typesize / 2; i++) {
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
- }
- p->pop();
- }
- if (isA64)
- p->OBREADA64(tmp, header, insn.getbti(), 8);
- else
- p->OBREAD(tmp, header, insn.getbti(), 8);
- for (uint32_t j = 0; j < 8 / typesize ; j++)
- p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
- }
- }
- } else if (vec_size == 8) {
- if (simdWidth == 8) {
- for (uint32_t i = 0; i < typesize / 2; i++) {
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
- }
- p->pop();
- }
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64)
- p->OBREADA64(tmp, header, insn.getbti(), 8);
- else
- p->OBREAD(tmp, header, insn.getbti(), 8);
- }
- p->pop();
- for (uint32_t j = 0; j < 16 / typesize; j++)
- p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp, 0, j * simdWidth * typesize ));
- }
- } else {
- for (uint32_t i = 0; i < typesize ; i++) {
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
- }
- p->pop();
- }
- if (isA64)
- p->OBREADA64(tmp, header, insn.getbti(), 8);
- else
- p->OBREAD(tmp, header, insn.getbti(), 8);
- for (uint32_t j = 0; j < 8 / typesize; j++)
- p->MOV(ra->genReg(insn.dst(1 + j + i * 8 / typesize)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
- }
- }
- } else NOT_SUPPORTED;
+ p->OBREAD(tmp, header, bti, ow_size);
}
void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) {
- const GenRegister addrreg = ra->genReg(insn.src(0));
- const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
- uint32_t type = ra->genReg(insn.src(1)).type;
- uint32_t typesize = typeSize(type);
- const uint32_t vec_size = insn.extra.elem;
- const GenRegister tmp = GenRegister::offset(header, 1);
- const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
- GenRegister headeraddr;
- bool isA64 = insn.getbti() == 255;
+ const GenRegister header = ra->genReg(insn.src(0));
+ const uint32_t bti = insn.getbti();
+ const uint32_t ow_size = insn.extra.elem;
+ bool isA64 = bti == 255;
if (isA64)
- headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL);
+ p->OBWRITEA64(header, bti, ow_size);
else
- headeraddr = GenRegister::offset(header, 0, 2*4);
- const uint32_t simdWidth = p->curr.execWidth;
- uint32_t tmp_size = simdWidth * vec_size / 8;
- tmp_size = tmp_size > 4 ? 4 : tmp_size;
- uint32_t offset_size = isA64 ? 128 : 8;
-
- p->push();
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0,0));
-
- // Update the header with the current address
- p->curr.execWidth = 1;
- if (isA64)
- p->MOV(headeraddr, addr);
- else
- p->SHR(headeraddr, addr, GenRegister::immud(4));
-
- // Put zero in the general state base address
- if (!isA64)
- p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
-
- p->pop();
- // Now write the data, oword block write can only work with simd16 and no mask
- if (vec_size == 1) {
- p->MOV(GenRegister::retype(tmp, type), ra->genReg(insn.src(1)));
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64)
- p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 16);
- else
- p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16);
- }
- p->pop();
- } else if (vec_size == 2) {
- p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, 0), type), ra->genReg(insn.src(1)));
- p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, simdWidth * typesize), type), ra->genReg(insn.src(2)));
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64)
- p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 8);
- else
- p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8);
- }
- p->pop();
- } else if (vec_size == 4) {
- if (simdWidth == 8) {
- for (uint32_t i = 0; i < 4; i++)
- p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * simdWidth * typesize), type), ra->genReg(insn.src(1 + i)));
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64)
- p->OBWRITEA64(header, insn.getbti(), 2 * typesize);
- else
- p->OBWRITE(header, insn.getbti(), 2 * typesize);
- }
- p->pop();
- } else {
- for (uint32_t i = 0; i < typesize / 2; i++) {
- for (uint32_t j = 0; j < 8 / typesize; j++)
- p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
- }
- p->pop();
- }
- if (isA64)
- p->OBWRITEA64(header, insn.getbti(), 8);
- else
- p->OBWRITE(header, insn.getbti(), 8);
- }
- }
- } else if (vec_size == 8) {
- if (simdWidth == 8) {
- for (uint32_t i = 0; i < typesize / 2; i++) {
- for (uint32_t j = 0; j < 16 / typesize; j++)
- p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 16 / typesize)));
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
- }
- p->pop();
- }
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64)
- p->OBWRITEA64(header, insn.getbti(), 8);
- else
- p->OBWRITE(header, insn.getbti(), 8);
- }
- p->pop();
- }
- } else {
- for (uint32_t i = 0; i < typesize; i++) {
- for (uint32_t j = 0; j < 8 / typesize; j++)
- p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
- }
- p->pop();
- }
- if (isA64)
- p->OBWRITEA64(header, insn.getbti(), 8);
- else
- p->OBWRITE(header, insn.getbti(), 8);
- }
- }
- } else NOT_SUPPORTED;
-
+ p->OBWRITE(header, bti, ow_size);
}
void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) {
- const GenRegister dst = ra->genReg(insn.dst(1));
- const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)),GEN_TYPE_D);
- const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)),GEN_TYPE_D);
- const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
- const GenRegister offsetx = GenRegister::offset(header, 0, 0*4);
- const GenRegister offsety = GenRegister::offset(header, 0, 1*4);
- const GenRegister blocksizereg = GenRegister::offset(header, 0, 2*4);
- size_t vec_size = insn.extra.elem;
- uint32_t type = dst.type;
- uint32_t typesize = typeSize(type);
- uint32_t block_width = typesize * simdWidth;
- uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
-
- if (simdWidth == 8)
- {
- p->push();
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0,0));
-
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->MOV(offsetx, coordx);
- p->MOV(offsety, coordy);
- // Update block width and height
- p->MOV(blocksizereg, GenRegister::immud(blocksize));
- p->curr.execWidth = 8;
- // ushort in simd8 will have half reg, but response lenght is still 1
- uint32_t rsize = vec_size * typesize / 4;
- rsize = rsize ? rsize : 1;
- // Now read the data
- p->MBREAD(dst, header, insn.getbti(), rsize);
- p->pop();
-
- }
- else if (simdWidth == 16)
- {
- const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(vec_size + 1)), GEN_TYPE_UD);
- p->push();
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0,0));
-
- // First half
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->MOV(offsetx, coordx);
- p->MOV(offsety, coordy);
- // Update block width and height
- p->MOV(blocksizereg, GenRegister::immud(blocksize));
- // Now read the data
- p->curr.execWidth = 8;
- p->MBREAD(tmp, header, insn.getbti(), vec_size);
- for (uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::retype(ra->genReg(insn.dst(i + 1)),GEN_TYPE_UD), GenRegister::offset(tmp, i));
-
- if (typesize == 4)
- {
- // Second half
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
- // Now read the data
- p->curr.execWidth = 8;
- p->MBREAD(tmp, header, insn.getbti(), vec_size);
-
- // Move the reg to fit vector rule.
- for (uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1),
- GenRegister::offset(tmp, i));
- }
- p->pop();
- } else NOT_IMPLEMENTED;
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister header = ra->genReg(insn.src(0));
+ const size_t response_size = insn.extra.elem;
+ p->MBREAD(dst, header, insn.getbti(), response_size);
}
void GenContext::emitMBWriteInstruction(const SelectionInstruction &insn) {
- const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_D);
- const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)), GEN_TYPE_D);
- const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
- const GenRegister tmp = GenRegister::offset(header, 1);
- GenRegister offsetx, offsety, blocksizereg;
- size_t vec_size = insn.extra.elem;
- uint32_t type = ra->genReg(insn.src(2)).type;
- uint32_t typesize = typeSize(type);
- uint32_t block_width = typesize * simdWidth;
- uint32_t blocksize = (block_width - 1) % 32 | (vec_size-1) << 16;
-
- offsetx = GenRegister::offset(header, 0, 0*4);
- offsety = GenRegister::offset(header, 0, 1*4);
- blocksizereg = GenRegister::offset(header, 0, 2*4);
-
- if (simdWidth == 8)
- {
- p->push();
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0,0));
-
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->MOV(offsetx, coordx);
- p->MOV(offsety, coordy);
- // Update block width and height
- p->MOV(blocksizereg, GenRegister::immud(blocksize));
- p->curr.execWidth = 8;
- // Mov what we need into msgs
- for(uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * block_width), type),
- ra->genReg(insn.src(2 + i)));
- // ushort in simd8 will have half reg, but reponse lenght is still 1
- uint32_t rsize = vec_size * typesize / 4;
- rsize = rsize ? rsize : 1;
- // Now read the data
- p->MBWRITE(header, insn.getbti(), rsize);
- p->pop();
-
- }
- else
- {
- p->push();
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0,0));
-
- // First half
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->MOV(offsetx, coordx);
- p->MOV(offsety, coordy);
- // Update block width and height
- p->MOV(blocksizereg, GenRegister::immud(blocksize));
- // Now read the data
- p->curr.execWidth = 8;
- // Mov what we need into msgs
- for(uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::offset(tmp, i), GenRegister::retype(ra->genReg(insn.src(2 + i)), GEN_TYPE_UD));
- p->MBWRITE(header, insn.getbti(), vec_size);
-
- if (typesize == 4)
- {
- // Second half
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
- p->curr.execWidth = 8;
- // Mov what we need into msgs
- for(uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1));
- // Now write the data
- p->MBWRITE(header, insn.getbti(), vec_size);
- }
-
- p->pop();
- }
+ const GenRegister header = ra->genReg(insn.dst(0));
+ const size_t data_size = insn.extra.elem;
+ p->MBWRITE(header, insn.getbti(), data_size);
}
BVAR(OCL_OUTPUT_REG_ALLOC, false);
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 5dea48a9..d95d03e9 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -257,32 +257,47 @@ namespace gbe
NOT_SUPPORTED;
}
- static void setOBlockRW(GenEncoder *p,
- GenNativeInstruction *insn,
- uint32_t bti,
- uint32_t size,
- uint32_t msg_type,
- uint32_t msg_length,
- uint32_t response_length)
+ void GenEncoder::setOBlockRW(GenNativeInstruction *insn,
+ uint32_t bti,
+ uint32_t block_size,
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
{
const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
- p->setMessageDescriptor(insn, sfid, msg_length, response_length);
- assert(size == 0 || size == 1 || size == 2 || size == 4 || size == 8);
+ setMessageDescriptor(insn, sfid, msg_length, response_length);
insn->bits3.gen7_oblock_rw.msg_type = msg_type;
insn->bits3.gen7_oblock_rw.bti = bti;
- insn->bits3.gen7_oblock_rw.block_size = size <= 2 ? size : (size == 4 ? 3 : 4);
+ insn->bits3.gen7_oblock_rw.block_size = block_size;
insn->bits3.gen7_oblock_rw.header_present = 1;
}
- static void setMBlockRW(GenEncoder *p,
- GenNativeInstruction *insn,
- uint32_t bti,
- uint32_t msg_type,
- uint32_t msg_length,
- uint32_t response_length)
+ uint32_t GenEncoder::getOBlockSize(uint32_t oword_size, bool low_half)
+ {
+ /* 000: 1 OWord, read into or written from the low 128 bits of the destination register.
+ * 001: 1 OWord, read into or written from the high 128 bits of the destination register.
+ * 010: 2 OWords
+ * 011: 4 OWords
+ * 100: 8 OWords */
+ switch(oword_size)
+ {
+ case 1: return low_half ? 0 : 1;
+ case 2: return 2;
+ case 4: return 3;
+ case 8: return 4;
+ default: NOT_SUPPORTED;
+ }
+ return 0;
+ }
+
+ void GenEncoder::setMBlockRW(GenNativeInstruction *insn,
+ uint32_t bti,
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
{
const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
- p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+ setMessageDescriptor(insn, sfid, msg_length, response_length);
insn->bits3.gen7_mblock_rw.msg_type = msg_type;
insn->bits3.gen7_mblock_rw.bti = bti;
insn->bits3.gen7_mblock_rw.header_present = 1;
@@ -1312,80 +1327,72 @@ namespace gbe
setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_READ, 1, dst_num);
}
- void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) {
+ void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
const uint32_t msg_length = 1;
- uint32_t rsize = size / 2;
- uint32_t msgsize = size;
- // When size is 1 OWord, which means half a reg, we need to know which half to use
- if (size == 1) {
- if (dst.subnr == 0)
- msgsize = 0;
- else
- msgsize = 1;
- }
- rsize = rsize == 0 ? 1 : rsize;
- const uint32_t response_length = rsize; // Size is in regs
+ uint32_t sizeinreg = ow_size / 2;
+ // half reg should also have size 1
+ sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+ const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0);
+ const uint32_t response_length = sizeinreg; // Size is in reg
+
this->setHeader(insn);
this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
this->setSrc1(insn, GenRegister::immud(0));
- setOBlockRW(this,
- insn,
+ setOBlockRW(insn,
bti,
- msgsize,
+ block_size,
GEN7_UNALIGNED_OBLOCK_READ,
msg_length,
response_length);
}
- void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t size) {
+ void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t ow_size) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
- uint32_t rsize = size / 2;
- rsize = rsize == 0 ? 1 : rsize;
- const uint32_t msg_length = 1 + rsize; // Size is in owords
+ uint32_t sizeinreg = ow_size / 2;
+ // half reg should also have size 1
+ sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+ const uint32_t msg_length = 1 + sizeinreg; // Size is in reg and header
const uint32_t response_length = 0;
- uint32_t msgsize = size;
- msgsize = msgsize == 1 ? 0 : msgsize;
+ const uint32_t block_size = getOBlockSize(ow_size);
+
this->setHeader(insn);
this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
this->setSrc1(insn, GenRegister::immud(0));
this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
- setOBlockRW(this,
- insn,
+ setOBlockRW(insn,
bti,
- msgsize,
+ block_size,
GEN7_OBLOCK_WRITE,
msg_length,
response_length);
}
- void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) {
+ void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t response_size) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
const uint32_t msg_length = 1;
- const uint32_t response_length = size; // Size of registers
+ const uint32_t response_length = response_size; // Size of registers
this->setHeader(insn);
this->setDst(insn, GenRegister::ud8grf(dst.nr, 0));
this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
this->setSrc1(insn, GenRegister::immud(0));
- setMBlockRW(this,
- insn,
+ setMBlockRW(insn,
bti,
GEN75_P1_MEDIA_BREAD,
msg_length,
response_length);
}
- void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t size) {
+ void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t data_size) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
- const uint32_t msg_length = 1 + size;
+ const uint32_t msg_length = 1 + data_size;
const uint32_t response_length = 0; // Size of registers
this->setHeader(insn);
this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
this->setSrc1(insn, GenRegister::immud(0));
- setMBlockRW(this,
- insn,
+ setMBlockRW(insn,
bti,
GEN75_P1_MEDIA_TYPED_BWRITE,
msg_length,
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 66aa9cb5..4e9fe58c 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -288,18 +288,24 @@ namespace gbe
virtual bool canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0,
GenRegister src1 = GenRegister::null());
virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1 = GenRegister::null());
+
+ /*! OBlock helper function */
+ uint32_t getOBlockSize(uint32_t oword_size, bool low_half = true);
+ void setMBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
+ void setOBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t block_size, uint32_t msg_type, uint32_t msg_length, uint32_t response_lengtha);
+
/*! OBlock read */
- void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+ void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size);
/*! OBlock write */
- void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
+ void OBWRITE(GenRegister header, uint32_t bti, uint32_t ow_size);
/*! MBlock read */
- virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+ virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t response_size);
/*! MBlock write */
- virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
+ virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t data_size);
/*! A64 OBlock read */
- virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+ virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size);
/*! A64 OBlock write */
- virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t elemSize);
+ virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t ow_size);
GBE_CLASS(GenEncoder); //!< Use custom allocators
virtual void alu3(uint32_t opcode, GenRegister dst,
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 128c2bc3..dd21960f 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -759,13 +759,13 @@ namespace gbe
void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
GenRegister tmpData1, GenRegister tmpData2);
/*! Oblock read */
- void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+ void OBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t ow_size);
/*! Oblock write */
- void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+ void OBWRITE(GenRegister header, GenRegister* values, uint32_t tmp_size, uint32_t bti, uint32_t ow_size);
/*! Media block read */
- void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+ void MBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t response_size);
/*! Media block write */
- void MBWRITE(GenRegister coordx, GenRegister coordy, GenRegister* values, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+ void MBWRITE(GenRegister header, GenRegister* values, uint32_t tmp_size, uint32_t bti, uint32_t data_size);
/* common functions for both binary instruction and sel_cmp and compare instruction.
It will handle the IMM or normal register assignment, and will try to avoid LOADI
@@ -2300,118 +2300,84 @@ namespace gbe
}
void Selection::Opaque::OBREAD(GenRegister* dsts,
uint32_t vec_size,
- GenRegister addr,
GenRegister header,
uint32_t bti,
- GenRegister* tmp,
- uint32_t tmp_size) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1 + vec_size + tmp_size, 1);
+ uint32_t ow_size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, vec_size, 1);
SelectionVector *vector = this->appendVector();
- insn->dst(0) = header;
+ insn->src(0) = header;
for (uint32_t i = 0; i < vec_size; ++i)
- insn->dst(1 + i) = dsts[i];
- for (uint32_t i = 0; i < tmp_size; ++i)
- insn->dst(1 + i + vec_size) = tmp[i];
- insn->src(0) = addr;
+ insn->dst(i) = dsts[i];
insn->setbti(bti);
- insn->extra.elem = vec_size; // number of vector size
+ insn->extra.elem = ow_size; // number of OWord size
// tmp regs for OWORD read dst
- vector->regNum = tmp_size;
- vector->reg = &insn->dst(1 + vec_size);
- vector->offsetID = 1 + vec_size;
+ vector->regNum = vec_size;
+ vector->reg = &insn->dst(0);
+ vector->offsetID = 0;
vector->isSrc = 0;
}
- void Selection::Opaque::OBWRITE(GenRegister addr,
+ void Selection::Opaque::OBWRITE(GenRegister header,
GenRegister* values,
uint32_t vec_size,
- GenRegister header,
uint32_t bti,
- GenRegister* tmp,
- uint32_t tmp_size) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, tmp_size + 1, vec_size + 1);
+ uint32_t ow_size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, vec_size + 1);
SelectionVector *vector = this->appendVector();
- insn->src(0) = addr;
+ insn->src(0) = header;
for (uint32_t i = 0; i < vec_size; ++i)
insn->src(i + 1) = values[i];
- insn->dst(0) = header;
- for (uint32_t i = 0; i < tmp_size; ++i)
- insn->dst(i + 1) = tmp[i];
insn->setbti(bti);
- insn->extra.elem = vec_size; // number of vector_size
+ insn->extra.elem = ow_size; // number of OWord_size
- // tmp regs for OWORD read dst
- vector->regNum = tmp_size + 1;
- vector->reg = &insn->dst(0);
+ // tmp regs for OWORD write header and values
+ vector->regNum = vec_size + 1;
+ vector->reg = &insn->src(0);
vector->offsetID = 0;
- vector->isSrc = 0;
+ vector->isSrc = 1;
+
}
void Selection::Opaque::MBREAD(GenRegister* dsts,
- GenRegister coordx,
- GenRegister coordy,
+ uint32_t tmp_size,
GenRegister header,
- GenRegister* tmp,
uint32_t bti,
- uint32_t vec_size) {
-
- uint32_t simdWidth = curr.execWidth;
- SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * simdWidth / 8 + 1, 2);
- insn->dst(0) = header;
- for (uint32_t i = 0; i < vec_size; ++i) {
- insn->dst(i + 1) = dsts[i];
- if(simdWidth == 16)
- insn->dst(i + vec_size + 1) = tmp[i];
- }
- insn->src(0) = coordx;
- insn->src(1) = coordy;
+ uint32_t response_size) {
+
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, tmp_size, 1);
+ insn->src(0) = header;
insn->setbti(bti);
- insn->extra.elem = vec_size; // vector size
+ insn->extra.elem = response_size; // send response length
- // Only in simd 8 the data is in vector form
- if(simdWidth == 8) {
- SelectionVector *vector = this->appendVector();
- vector->regNum = vec_size;
- vector->reg = &insn->dst(1);
- vector->offsetID = 1;
- vector->isSrc = 0;
- }
- if(simdWidth == 16)
- {
- SelectionVector *vectortmp = this->appendVector();
- vectortmp->regNum = vec_size;
- vectortmp->reg = &insn->dst(vec_size + 1);
- vectortmp->offsetID = vec_size + 1;
- vectortmp->isSrc = 0;
+ for (uint32_t i = 0; i < tmp_size; ++i) {
+ insn->dst(i) = dsts[i];
}
+ SelectionVector *vector = this->appendVector();
+ vector->regNum = tmp_size;
+ vector->reg = &insn->dst(0);
+ vector->offsetID = 0;
+ vector->isSrc = 0;
}
- void Selection::Opaque::MBWRITE(GenRegister coordx,
- GenRegister coordy,
+ void Selection::Opaque::MBWRITE(GenRegister header,
GenRegister* values,
- GenRegister header,
- GenRegister* tmp,
+ uint32_t tmp_size,
uint32_t bti,
- uint32_t vec_size) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 1 + vec_size, 2 + vec_size);
+ uint32_t data_size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 0, 1 + tmp_size);
SelectionVector *vector = this->appendVector();
- insn->src(0) = coordx;
- insn->src(1) = coordy;
- for (uint32_t i = 0; i < vec_size; ++i)
- insn->src(2 + i) = values[i];
- insn->dst(0) = header;
- for (uint32_t i = 0; i < vec_size; ++i)
- insn->dst(1 + i) = tmp[i];
- insn->state = this->curr;
+ insn->src(0) = header;
+ for (uint32_t i = 0; i < tmp_size; ++i)
+ insn->src(1 + i) = values[i];
insn->setbti(bti);
- insn->extra.elem = vec_size; // vector size
+ insn->extra.elem = data_size; // msg data part size
// We need to put the header and the data together
- vector->regNum = 1 + vec_size;
- vector->reg = &insn->dst(0);
+ vector->regNum = 1 + tmp_size;
+ vector->reg = &insn->src(0);
vector->offsetID = 0;
- vector->isSrc = 0;
+ vector->isSrc = 1;
}
// Boiler plate to initialize the selection library at c++ pre-main
@@ -4767,18 +4733,79 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
const uint32_t simdWidth = sel.ctx.getSimdWidth();
const Type type = insn.getValueType();
const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+ const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+ const RegisterFamily family = getFamily(type);
+ bool isA64 = SI == 255;
+
+ const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
vector<GenRegister> valuesVec;
+ vector<GenRegister> tmpVec;
for(uint32_t i = 0; i < vec_size; i++)
valuesVec.push_back(sel.selReg(insn.getValue(i), type));
- // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs
- uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
- tmp_size = tmp_size == 0 ? 1 : tmp_size;
- tmp_size = tmp_size > 4 ? 4 : tmp_size;
- vector<GenRegister> tmpVec;
+
+ GenRegister headeraddr;
+ if (isA64)
+ headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
+ else
+ headeraddr = sel.getOffsetReg(header, 0, 2 * 4);
+ // Make header
+ sel.push();
+ {
+ // Copy r0 into the header first
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+ // Update the header with the current address
+ sel.curr.execWidth = 1;
+
+ // Put zero in the general state base address
+ if (isA64)
+ sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
+ else {
+ sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD));
+ sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
+ }
+ }
+ sel.pop();
+
+ /* For block read we need to unpack the block data into values, and for different
+ * simdwidth and vector size with different type size, we may need to split the
+ * block read send message.
+ * We can only get a send message with 5 reg length
+ * so for different combination we have different message length and tmp vector size
+ * | simd8 | simd16 | simd8 | simd16
+ * r0 |header | | | |
+ * r1 |data | w0,w1 | w0 | dw0 | dw0
+ * r2 |data | w2,w3 | w1 | dw1 | dw0
+ * r3 |data | ...... | ...... | ...... | dw1
+ * r4 |data | ....... | ...... | ...... | dw1
+ */
+
+ uint32_t totalSize = simdWidth * typeSize * vec_size;
+ uint32_t valueSize = simdWidth * typeSize;
+ uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
+ uint32_t msg_num = vec_size / tmp_size;
+ uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
+
for(uint32_t i = 0; i < tmp_size; i++)
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
- sel.OBREAD(&valuesVec[0], vec_size, address, header, SI, &tmpVec[0], tmp_size);
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+ for (uint32_t i = 0; i < msg_num; i++) {
+ if (i > 0) {
+ sel.push();
+ {
+ // Update the address in header
+ sel.curr.execWidth = 1;
+ sel.ADD(headeraddr, headeraddr, GenRegister::immud(128));
+ }
+ sel.pop();
+ }
+ sel.OBREAD(&tmpVec[0], tmp_size, header, SI, ow_size);
+ for (uint32_t j = 0; j < tmp_size; j++)
+ sel.MOV(valuesVec[j + i * tmp_size], tmpVec[j]);
+ }
+
}
// check whether all binded table index point to constant memory
@@ -5213,18 +5240,87 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
const uint32_t simdWidth = sel.ctx.getSimdWidth();
const Type type = insn.getValueType();
const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+ const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+ const RegisterFamily family = getFamily(type);
+ bool isA64 = SI == 255;
+
+ const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
vector<GenRegister> valuesVec;
+ vector<GenRegister> tmpVec;
for(uint32_t i = 0; i < vec_size; i++)
valuesVec.push_back(sel.selReg(insn.getValue(i), type));
- // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs
- uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
- tmp_size = tmp_size == 0 ? 1 : tmp_size;
- tmp_size = tmp_size > 4 ? 4 : tmp_size;
- vector<GenRegister> tmpVec;
+
+ GenRegister headeraddr;
+ if (isA64)
+ headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
+ else
+ headeraddr = sel.getOffsetReg(header, 0, 2 * 4);
+ // Make header
+ sel.push();
+ {
+ // Copy r0 into the header first
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+ // Update the header with the current address
+ sel.curr.execWidth = 1;
+
+ // Put zero in the general state base address
+ if (isA64)
+ sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
+ else {
+ sel.SHR(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD), GenRegister::immud(4));
+ sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
+ }
+ }
+ sel.pop();
+
+ /* For block write we need to pack the block data into the tmp, and for different
+ * simdwidth and vector size with different type size, we may need to split the
+ * block write send message.
+ * We can only get a send message with 5 reg length
+ * so for different combination we have different message length and tmp vector size
+ * | simd8 | simd16 | simd8 | simd16
+ * r0 |header | | | |
+ * r1 |data | w0,w1 | w0 | dw0 | dw0
+ * r2 |data | w2,w3 | w1 | dw1 | dw0
+ * r3 |data | ...... | ...... | ...... | dw1
+ * r4 |data | ....... | ...... | ...... | dw1
+ */
+
+ uint32_t totalSize = simdWidth * typeSize * vec_size;
+ uint32_t valueSize = simdWidth * typeSize;
+ uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
+ uint32_t msg_num = vec_size / tmp_size;
+ uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
+
for(uint32_t i = 0; i < tmp_size; i++)
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
- sel.OBWRITE(address, &valuesVec[0], vec_size, header, SI, &tmpVec[0], tmp_size);
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+ for (uint32_t i = 0; i < msg_num; i++) {
+ for (uint32_t j = 0; j < tmp_size; j++)
+ sel.MOV(tmpVec[j], valuesVec[j + i * tmp_size]);
+ if (i > 0) {
+ sel.push();
+ {
+ // Update the address in header
+ sel.curr.execWidth = 1;
+ sel.ADD(headeraddr, headeraddr, GenRegister::immud(8));
+ }
+ sel.pop();
+ }
+ sel.push();
+ // In simd8 mode, when data reg has more than 1 reg, execWidth 8 will get wrong
+ // result, so set the execWidth to 16.
+ sel.curr.execWidth = 16;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.OBWRITE(header, &tmpVec[0], tmp_size, SI, ow_size);
+ sel.pop();
+ }
+
+
}
virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
@@ -7725,20 +7821,77 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
uint32_t vec_size = insn.getVectorSize();
uint32_t simdWidth = sel.curr.execWidth;
const Type type = insn.getType();
+ const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+ uint32_t response_size = simdWidth * vec_size * typeSize / 32;
+ // ushort in simd8 will have half reg thus 0.5 reg size, but response length is still 1
+ response_size = response_size ? response_size : 1;
+ uint32_t block_width = typeSize * simdWidth;
+ uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
+
+
vector<GenRegister> valuesVec;
vector<GenRegister> tmpVec;
for (uint32_t i = 0; i < vec_size; ++i) {
valuesVec.push_back(sel.selReg(insn.getDst(i), type));
- if(simdWidth == 16)
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
+ if(simdWidth == 16 && typeSize == 4)
+ tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
}
- const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
- const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
- GenRegister *tmp = NULL;
- if(simdWidth == 16)
- tmp = &tmpVec[0];
- sel.MBREAD(&valuesVec[0], coordx, coordy, header, tmp, insn.getImageIndex(), insn.getVectorSize());
+ const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
+ const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
+ const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
+ const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0 * 4), GEN_TYPE_UD);
+ const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1 * 4), GEN_TYPE_UD);
+ const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2 * 4);
+
+ // Make header
+ sel.push();
+ // Copy r0 into the header first
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+ // Update the header with the coord
+ sel.curr.execWidth = 1;
+ sel.MOV(offsetx, coordx);
+ sel.MOV(offsety, coordy);
+ // Update block width and height
+ sel.MOV(blocksizereg, GenRegister::immud(blocksize));
+ sel.pop();
+
+ if (simdWidth * typeSize < 64) {
+ sel.push();
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ // Now read the data
+ sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), response_size);
+ sel.pop();
+ } else if (simdWidth * typeSize == 64) {
+ sel.push();
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MBREAD(&tmpVec[0], vec_size ,header, insn.getImageIndex(), vec_size);
+ for (uint32_t i = 0; i < vec_size; i++)
+ sel.MOV(valuesVec[i], tmpVec[i]);
+
+ // Second half
+ // Update the header with the coord
+ sel.curr.execWidth = 1;
+ sel.ADD(offsetx, offsetx, GenRegister::immud(32));
+
+ // Now read the data
+ sel.curr.execWidth = 8;
+ sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(), vec_size);
+
+ // Move the reg to fit vector rule.
+ for (uint32_t i = 0; i < vec_size; i++)
+ sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 32) , tmpVec[i]);
+ sel.pop();
+ } else NOT_IMPLEMENTED;
+
+
return true;
}
DECL_CTOR(MediaBlockReadInstruction, 1, 1);
@@ -7752,17 +7905,84 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
using namespace ir;
uint32_t vec_size = insn.getVectorSize();
const Type type = insn.getType();
- const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
- const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
+ uint32_t simdWidth = sel.curr.execWidth;
+ const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+ const RegisterFamily family = getFamily(type);
+ const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+ // ushort in simd8 will have half reg, but data length is still 1
+ uint32_t data_size = simdWidth * vec_size * typeSize / 32;
+ data_size = data_size? data_size : 1;
+ uint32_t block_width = typeSize * simdWidth;
+ uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
+
+
vector<GenRegister> valuesVec;
vector<GenRegister> tmpVec;
- for(uint32_t i = 0; i < vec_size; i++)
- {
- valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
- }
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
- sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0], insn.getImageIndex(), vec_size);
+ for (uint32_t i = 0; i < vec_size; ++i) {
+ valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
+ if(simdWidth == 16 && typeSize == 4)
+ tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
+ else
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+ }
+ const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
+ const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
+ const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
+ const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0*4), GEN_TYPE_UD);
+ const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1*4), GEN_TYPE_UD);
+ const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2*4);
+
+ // Make header
+ sel.push();
+ // Copy r0 into the header first
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+ // Update the header with the coord
+ sel.curr.execWidth = 1;
+ sel.MOV(offsetx, coordx);
+ sel.MOV(offsety, coordy);
+ // Update block width and height
+ sel.MOV(blocksizereg, GenRegister::immud(blocksize));
+ sel.pop();
+
+ if (simdWidth * typeSize < 64) {
+ for (uint32_t i = 0; i < vec_size; ++i) {
+ sel.MOV(tmpVec[i], valuesVec[i]);
+ }
+ sel.push();
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ // Now write the data
+ sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), data_size);
+ sel.pop();
+ } else if (simdWidth * typeSize == 64) {
+ sel.push();
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ for (uint32_t i = 0; i < vec_size; i++)
+ sel.MOV(tmpVec[i], valuesVec[i]);
+ sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);
+
+ // Second half
+ // Update the header with the coord
+ sel.curr.execWidth = 1;
+ sel.ADD(offsetx, offsetx, GenRegister::immud(32));
+
+ sel.curr.execWidth = 8;
+ for (uint32_t i = 0; i < vec_size; i++)
+ sel.MOV(tmpVec[i], sel.getOffsetReg(valuesVec[i], 0, 32));
+ // Now write the data
+ sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);
+
+ // Move the reg to fit vector rule.
+ sel.pop();
+ } else NOT_IMPLEMENTED;
+
return true;
}
DECL_CTOR(MediaBlockWriteInstruction, 1, 1);