diff options
author | Ian Romanick <ian.d.romanick@intel.com> | 2022-07-12 15:32:01 -0700 |
---|---|---|
committer | Marge Bot <emma+marge@anholt.net> | 2022-07-26 17:25:18 +0000 |
commit | 349a040f684cc5c6b80d40a4edbefa410e91034d (patch) | |
tree | e9496685b6a20935bb3257785f5c04707ef09a49 | |
parent | 5dab077824665b42c48acbe0e193a0786812672d (diff) |
intel/fs: Make logical URB write instructions more like other logical instructions
The changes to fs_visitor::validate() helped track down a place where I
initially forgot to convert a message to the new sources layout. This
had caused a different validation failure in
dEQP-GLES31.functional.tessellation.tesscoord.triangles_equal_spacing,
but this was not detected until after SENDs were lowered.
Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 19951145 -> 19951133 (<.01%)
instructions in affected programs: 2429 -> 2417 (-0.49%)
helped: 8 / HURT: 0
total cycles in shared programs: 858904152 -> 858862331 (<.01%)
cycles in affected programs: 5702652 -> 5660831 (-0.73%)
helped: 2138 / HURT: 1255
Broadwell
total cycles in shared programs: 904869459 -> 904835501 (<.01%)
cycles in affected programs: 7686744 -> 7652786 (-0.44%)
helped: 2861 / HURT: 2050
Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
Instructions in all programs: 141442369 -> 141442032 (-0.0%)
Instructions helped: 337
Cycles in all programs: 9099270231 -> 9099036492 (-0.0%)
Cycles helped: 40661
Cycles hurt: 28606
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17605>
-rw-r--r-- | src/intel/compiler/brw_eu_defines.h | 11 | ||||
-rw-r--r-- | src/intel/compiler/brw_fs.cpp | 47 | ||||
-rw-r--r-- | src/intel/compiler/brw_fs_nir.cpp | 77 | ||||
-rw-r--r-- | src/intel/compiler/brw_fs_validate.cpp | 14 | ||||
-rw-r--r-- | src/intel/compiler/brw_fs_visitor.cpp | 53 | ||||
-rw-r--r-- | src/intel/compiler/brw_lower_logical_sends.cpp | 25 | ||||
-rw-r--r-- | src/intel/compiler/brw_mesh.cpp | 70 |
7 files changed, 176 insertions, 121 deletions
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index da7c09c96f2..fecb3273d86 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -950,6 +950,17 @@ enum rt_logical_srcs { RT_LOGICAL_NUM_SRCS }; +enum urb_logical_srcs { + URB_LOGICAL_SRC_HANDLE, + URB_LOGICAL_SRC_PER_SLOT_OFFSETS, + URB_LOGICAL_SRC_CHANNEL_MASK, + /** Data to be written. BAD_FILE for reads. */ + URB_LOGICAL_SRC_DATA, + + URB_LOGICAL_NUM_SRCS +}; + + #ifdef __cplusplus /** * Allow brw_urb_write_flags enums to be ORed together. diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 9e5ede1dc48..34a88ac89e2 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -863,6 +863,17 @@ fs_inst::components_read(unsigned i) const return 1; } + case SHADER_OPCODE_URB_WRITE_LOGICAL: + case SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL: + case SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL: + case SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL: + if (i == URB_LOGICAL_SRC_DATA) + return mlen - 1 - + unsigned(src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE) - + unsigned(src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE); + else + return 1; + default: return 1; } @@ -891,10 +902,6 @@ fs_inst::size_read(int arg) const break; case FS_OPCODE_FB_READ: - case SHADER_OPCODE_URB_WRITE_LOGICAL: - case SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL: - case SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL: - case SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL: case SHADER_OPCODE_URB_READ_LOGICAL: case SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL: case FS_OPCODE_INTERPOLATE_AT_SAMPLE: @@ -1546,17 +1553,17 @@ fs_visitor::emit_gs_thread_end() break; } } - fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); - abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD))); - inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, hdr); + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = 
fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); + inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, + srcs, ARRAY_SIZE(srcs)); inst->mlen = 1; } else { - fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2); - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2); - sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); - sources[1] = this->final_gs_vertex_count; - abld.LOAD_PAYLOAD(payload, sources, 2, 2); - inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, payload); + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); + srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count; + inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, + srcs, ARRAY_SIZE(srcs)); inst->mlen = 2; } inst->eot = true; @@ -6676,16 +6683,12 @@ fs_visitor::run_tcs() } /* Emit EOT write; set TR DS Cache bit */ - fs_reg srcs[3] = { - fs_reg(get_tcs_output_urb_handle()), - fs_reg(brw_imm_ud(WRITEMASK_X << 16)), - fs_reg(brw_imm_ud(0)), - }; - fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); - bld.LOAD_PAYLOAD(payload, srcs, 3, 2); - + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = get_tcs_output_urb_handle(); + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16); + srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0); fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL, - bld.null_reg_ud(), payload); + reg_undef, srcs, ARRAY_SIZE(srcs)); inst->mlen = 3; inst->eot = true; diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 87aff871e78..35a50e838a8 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -2341,27 +2341,27 @@ fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count) } /* Store the control data bits in the message payload and send it. 
*/ - unsigned mlen = 2; - if (channel_mask.file != BAD_FILE) - mlen += 4; /* channel masks, plus 3 extra copies of the data */ - if (per_slot_offset.file != BAD_FILE) - mlen++; - - fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen); - unsigned i = 0; - sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); - if (per_slot_offset.file != BAD_FILE) - sources[i++] = per_slot_offset; - if (channel_mask.file != BAD_FILE) - sources[i++] = channel_mask; - while (i < mlen) { - sources[i++] = this->control_data_bits; - } - - abld.LOAD_PAYLOAD(payload, sources, mlen, mlen); - fs_inst *inst = abld.emit(opcode, reg_undef, payload); - inst->mlen = mlen; + const unsigned header_size = 1 + unsigned(channel_mask.file != BAD_FILE) + + unsigned(per_slot_offset.file != BAD_FILE); + + /* If there are channel masks, add 3 extra copies of the data. */ + const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE); + + fs_reg sources[4]; + + for (unsigned i = 0; i < ARRAY_SIZE(sources); i++) + sources[i] = this->control_data_bits; + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask; + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length), + BRW_REGISTER_TYPE_F); + abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0); + + fs_inst *inst = abld.emit(opcode, reg_undef, srcs, ARRAY_SIZE(srcs)); + inst->mlen = header_size + length; /* We need to increment Global Offset by 256-bits to make room for * Broadwell's extra "Vertex Count" payload at the beginning of the * URB entry. 
Since this is an OWord message, Global Offset is counted @@ -3046,15 +3046,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, fs_reg indirect_offset = get_indirect_offset(instr); unsigned imm_offset = instr->const_index[0]; unsigned mask = instr->const_index[1]; - unsigned header_regs = 0; - struct brw_reg output_handles = get_tcs_output_urb_handle(); - - fs_reg srcs[7]; - srcs[header_regs++] = output_handles; - - if (indirect_offset.file != BAD_FILE) { - srcs[header_regs++] = indirect_offset; - } if (mask == 0) break; @@ -3068,8 +3059,9 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, unsigned first_component = nir_intrinsic_component(instr); mask = mask << first_component; + fs_reg mask_reg; if (mask != WRITEMASK_XYZW) { - srcs[header_regs++] = brw_imm_ud(mask << 16); + mask_reg = brw_imm_ud(mask << 16); opcode = indirect_offset.file != BAD_FILE ? SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL : SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL; @@ -3079,21 +3071,30 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, SHADER_OPCODE_URB_WRITE_LOGICAL; } + fs_reg sources[4]; + for (unsigned i = 0; i < num_components; i++) { if (!(mask & (1 << (i + first_component)))) continue; - srcs[header_regs + i + first_component] = offset(value, bld, i); + sources[i + first_component] = offset(value, bld, i); } - unsigned mlen = header_regs + num_components + first_component; - fs_reg payload = - bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); - bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs); + unsigned header_size = 1 + unsigned(indirect_offset.file != BAD_FILE) + + unsigned(mask != WRITEMASK_XYZW); + const unsigned length = num_components + first_component; + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = get_tcs_output_urb_handle(); + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg; + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length), + BRW_REGISTER_TYPE_F); + 
bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0); - fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload); + fs_inst *inst = bld.emit(opcode, reg_undef, srcs, ARRAY_SIZE(srcs)); inst->offset = imm_offset; - inst->mlen = mlen; + inst->mlen = header_size + length; break; } diff --git a/src/intel/compiler/brw_fs_validate.cpp b/src/intel/compiler/brw_fs_validate.cpp index 75a794fd794..3fb071086f6 100644 --- a/src/intel/compiler/brw_fs_validate.cpp +++ b/src/intel/compiler/brw_fs_validate.cpp @@ -43,6 +43,20 @@ fs_visitor::validate() { #ifndef NDEBUG foreach_block_and_inst (block, fs_inst, inst, cfg) { + if (inst->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) { + const unsigned header_size = 1 + + unsigned(inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE) + + unsigned(inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE); + + unsigned data_size = 0; + for (unsigned i = header_size, j = 0; i < inst->mlen; i++, j++) { + fsv_assert(type_sz(offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j).type) == 4); + data_size++; + } + + fsv_assert(header_size + data_size == inst->mlen); + } + if (inst->dst.file == VGRF) { fsv_assert(inst->dst.offset / REG_SIZE + regs_written(inst) <= alloc.sizes[inst->dst.nr]); diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp index 1a6c42f2715..3ced049d101 100644 --- a/src/intel/compiler/brw_fs_visitor.cpp +++ b/src/intel/compiler/brw_fs_visitor.cpp @@ -935,22 +935,15 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count) if (length == 8 || (length > 0 && slot == last_slot)) flush = true; if (flush) { - fs_reg *payload_sources = - ralloc_array(mem_ctx, fs_reg, length + header_size); - fs_reg payload = fs_reg(VGRF, alloc.allocate(length + header_size), - BRW_REGISTER_TYPE_F); - payload_sources[0] = urb_handle; + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; - if (opcode == SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL) - payload_sources[1] = per_slot_offsets; + 
srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets; + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length), + BRW_REGISTER_TYPE_F); + abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0); - memcpy(&payload_sources[header_size], sources, - length * sizeof sources[0]); - - abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size, - header_size); - - fs_inst *inst = abld.emit(opcode, reg_undef, payload); + fs_inst *inst = abld.emit(opcode, reg_undef, srcs, ARRAY_SIZE(srcs)); /* For ICL WA 1805992985 one needs additional write in the end. */ if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) @@ -985,10 +978,17 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count) if (stage == MESA_SHADER_GEOMETRY) return; - fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD); - bld.exec_all().MOV(payload, urb_handle); + fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + + bld.exec_all().MOV(uniform_urb_handle, urb_handle); - fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, payload); + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle; + srcs[URB_LOGICAL_SRC_DATA] = payload; + + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, + srcs, ARRAY_SIZE(srcs)); inst->eot = true; inst->mlen = 2; inst->offset = 1; @@ -1002,14 +1002,16 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count) * all 8 lanes must valid. 
*/ if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) { - fs_reg payload = fs_reg(VGRF, alloc.allocate(6), BRW_REGISTER_TYPE_UD); + fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + fs_reg uniform_mask = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + fs_reg payload = fs_reg(VGRF, alloc.allocate(4), BRW_REGISTER_TYPE_UD); /* Workaround requires all 8 channels (lanes) to be valid. This is * understood to mean they all need to be alive. First trick is to find * a live channel and copy its urb handle for all the other channels to * make sure all handles are valid. */ - bld.exec_all().MOV(payload, bld.emit_uniformize(urb_handle)); + bld.exec_all().MOV(uniform_urb_handle, bld.emit_uniformize(urb_handle)); /* Second trick is to use masked URB write where one can tell the HW to * actually write data only for selected channels even though all are @@ -1025,14 +1027,19 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count) * 4 slots data. All are explicitly zeros in order to to keep the MBZ * area written as zeros. 
*/ - bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0x10000u)); + bld.exec_all().MOV(uniform_mask, brw_imm_ud(0x10000u)); + bld.exec_all().MOV(offset(payload, bld, 0), brw_imm_ud(0u)); + bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0u)); bld.exec_all().MOV(offset(payload, bld, 2), brw_imm_ud(0u)); bld.exec_all().MOV(offset(payload, bld, 3), brw_imm_ud(0u)); - bld.exec_all().MOV(offset(payload, bld, 4), brw_imm_ud(0u)); - bld.exec_all().MOV(offset(payload, bld, 5), brw_imm_ud(0u)); + + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = uniform_mask; + srcs[URB_LOGICAL_SRC_DATA] = payload; fs_inst *inst = bld.exec_all().emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL, - reg_undef, payload); + reg_undef, srcs, ARRAY_SIZE(srcs)); inst->eot = true; inst->mlen = 6; inst->offset = 0; diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp index e1845a4fc34..0ebc9984b1e 100644 --- a/src/intel/compiler/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw_lower_logical_sends.cpp @@ -73,8 +73,27 @@ lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst, assert(inst->header_size == 0); + fs_reg *payload_sources = new fs_reg[inst->mlen]; + fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(inst->mlen), + BRW_REGISTER_TYPE_F); + + unsigned header_size = 0; + payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE]; + if (per_slot_present) + payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS]; + + if (channel_mask_present) + payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK]; + + for (unsigned i = header_size, j = 0; i < inst->mlen; i++, j++) + payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j); + + bld.LOAD_PAYLOAD(payload, payload_sources, inst->mlen, header_size); + + delete [] payload_sources; + inst->opcode = SHADER_OPCODE_SEND; - 
inst->header_size = 1; + inst->header_size = header_size; inst->dst = brw_null_reg(); inst->sfid = BRW_SFID_URB; @@ -88,13 +107,11 @@ lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst, inst->ex_mlen = 0; inst->send_has_side_effects = true; - fs_reg tmp = inst->src[0]; - inst->resize_sources(4); inst->src[0] = brw_imm_ud(0); /* desc */ inst->src[1] = brw_imm_ud(0); /* ex_desc */ - inst->src[2] = tmp; + inst->src[2] = payload; inst->src[3] = brw_null_reg(); } diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_mesh.cpp index d9828923c9e..6a8872cebe9 100644 --- a/src/intel/compiler/brw_mesh.cpp +++ b/src/intel/compiler/brw_mesh.cpp @@ -892,25 +892,25 @@ emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr, for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { fs_builder bld8 = bld.group(8, q); - fs_reg payload_srcs[6]; - unsigned p = 0; - - payload_srcs[p++] = urb_handle; - payload_srcs[p++] = brw_imm_ud(first_mask << 16); - const unsigned header_size = p; + fs_reg payload_srcs[4]; + unsigned length = 0; for (unsigned i = 0; i < comp_shift; i++) - payload_srcs[p++] = reg_undef; + payload_srcs[length++] = reg_undef; for (unsigned c = 0; c < first_comps; c++) - payload_srcs[p++] = quarter(offset(src, bld, c), q); + payload_srcs[length++] = quarter(offset(src, bld, c), q); - fs_reg payload = bld8.vgrf(BRW_REGISTER_TYPE_UD, p); - bld8.LOAD_PAYLOAD(payload, payload_srcs, p, header_size); + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(first_mask << 16); + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length), + BRW_REGISTER_TYPE_F); + bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0); fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL, - reg_undef, payload); - inst->mlen = p; + reg_undef, srcs, ARRAY_SIZE(srcs)); + inst->mlen = 2 + length; inst->offset = urb_global_offset; 
assert(inst->offset < 2048); } @@ -923,22 +923,22 @@ emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr, for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { fs_builder bld8 = bld.group(8, q); - fs_reg payload_srcs[6]; - unsigned p = 0; - - payload_srcs[p++] = urb_handle; - payload_srcs[p++] = brw_imm_ud(second_mask << 16); - const unsigned header_size = p; + fs_reg payload_srcs[4]; + unsigned length = 0; for (unsigned c = 0; c < second_comps; c++) - payload_srcs[p++] = quarter(offset(src, bld, c + first_comps), q); + payload_srcs[length++] = quarter(offset(src, bld, c + first_comps), q); - fs_reg payload = bld8.vgrf(BRW_REGISTER_TYPE_UD, p); - bld8.LOAD_PAYLOAD(payload, payload_srcs, p, header_size); + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(second_mask << 16); + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length), + BRW_REGISTER_TYPE_F); + bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0); fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL, - reg_undef, payload); - inst->mlen = p; + reg_undef, srcs, ARRAY_SIZE(srcs)); + inst->mlen = 2 + length; inst->offset = urb_global_offset; assert(inst->offset < 2048); } @@ -988,21 +988,23 @@ emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr, bld8.SHR(off, off, brw_imm_ud(2)); - fs_reg payload_srcs[7]; - int x = 0; - payload_srcs[x++] = urb_handle; - payload_srcs[x++] = off; - payload_srcs[x++] = mask; + fs_reg payload_srcs[4]; + unsigned length = 0; for (unsigned j = 0; j < 4; j++) - payload_srcs[x++] = quarter(src_comp, q); + payload_srcs[length++] = quarter(src_comp, q); - fs_reg payload = bld8.vgrf(BRW_REGISTER_TYPE_UD, x); - bld8.LOAD_PAYLOAD(payload, payload_srcs, x, 3); + fs_reg srcs[URB_LOGICAL_NUM_SRCS]; + srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; + srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off; + 
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask; + srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length), + BRW_REGISTER_TYPE_F); + bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0); - fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL, - reg_undef, payload); - inst->mlen = x; + fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL, + reg_undef, srcs, ARRAY_SIZE(srcs)); + inst->mlen = 3 + length; inst->offset = 0; } } |