summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Eric Anholt <eric@anholt.net> 2013-03-13 14:48:55 -0700
committer: Eric Anholt <eric@anholt.net> 2013-04-01 16:17:25 -0700
commit: dca5fc14358a8b267b3854c39c976a822885898f (patch)
tree: 3cc2d6bd82d31e722735d2cd592ac42909f899e7
parent: bc0e1591f64b8b3f2693fceaaa8bba9198e26171 (diff)
i965/fs: Improve performance of varying-index uniform loads on IVB.
Like we have done for the VS and for constant-index uniform loads, we use the sampler engine to get caching in front of the L3 to avoid tickling the IVB L3 bug. This is also a bit of a functional change, as we're now loading a vec4 instead of a single dword, though we're not taking advantage of the other 3 components of the vec4 (yet). With the driver hacked to always take the varying-index path for all uniforms, improves performance of my old GLSL demo by 315% +/- 2% (n=4). This is a major fix for some blur shaders in compositors from the varying-index uniforms support I introduced in 9.1. v2: Move old offset computation into the pre-gen7 path. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=61554 NOTE: This is a candidate for the 9.1 branch.
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs.cpp 29
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs_emit.cpp 27
2 files changed, 38 insertions, 18 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index c60d0418678..703c3c5d8b4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -235,14 +235,33 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
235 exec_list instructions; 235 exec_list instructions;
236 fs_inst *inst; 236 fs_inst *inst;
237 237
238 fs_reg offset = fs_reg(this, glsl_type::uint_type);
239 instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));
240
241 if (intel->gen >= 7) { 238 if (intel->gen >= 7) {
239 /* We have our constant surface use a pitch of 4 bytes, so our index can
240 * be any component of a vector, and then we load 4 contiguous
241 * components starting from that.
242 *
243 * We break down the const_offset to a portion added to the variable
244 * offset and a portion done using reg_offset, which means that if you
245 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
246 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
247 * CSE can later notice that those loads are all the same and eliminate
248 * the redundant ones.
249 */
250 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
251 instructions.push_tail(ADD(vec4_offset,
252 varying_offset, const_offset & ~3));
253
254 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4), dst.type);
242 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7, 255 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
243 dst, surf_index, offset); 256 vec4_result, surf_index, vec4_offset);
244 instructions.push_tail(inst); 257 instructions.push_tail(inst);
258
259 vec4_result.reg_offset += const_offset & 3;
260 instructions.push_tail(MOV(dst, vec4_result));
245 } else { 261 } else {
262 fs_reg offset = fs_reg(this, glsl_type::uint_type);
263 instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));
264
246 int base_mrf = 13; 265 int base_mrf = 13;
247 bool header_present = true; 266 bool header_present = true;
248 267
@@ -313,7 +332,7 @@ fs_inst::equals(fs_inst *inst)
313int 332int
314fs_inst::regs_written() 333fs_inst::regs_written()
315{ 334{
316 if (is_tex()) 335 if (is_tex() || opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7)
317 return 4; 336 return 4;
318 337
319 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2, 338 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index a729569c840..bc1fef16b01 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -734,28 +734,29 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
734 index.type == BRW_REGISTER_TYPE_UD); 734 index.type == BRW_REGISTER_TYPE_UD);
735 uint32_t surf_index = index.dw1.ud; 735 uint32_t surf_index = index.dw1.ud;
736 736
737 uint32_t msg_control, rlen, mlen; 737 uint32_t simd_mode, rlen, mlen;
738 if (dispatch_width == 16) { 738 if (dispatch_width == 16) {
739 msg_control = BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS; 739 mlen = 2;
740 mlen = rlen = 2; 740 rlen = 8;
741 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
741 } else { 742 } else {
742 msg_control = BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS; 743 mlen = 1;
743 mlen = rlen = 1; 744 rlen = 4;
745 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
744 } 746 }
745 747
746 struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND); 748 struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
747 brw_set_dest(p, send, dst); 749 brw_set_dest(p, send, dst);
748 brw_set_src0(p, send, offset); 750 brw_set_src0(p, send, offset);
749 if (intel->gen < 6) 751 brw_set_sampler_message(p, send,
750 send->header.destreg__conditionalmod = inst->base_mrf;
751 brw_set_dp_read_message(p, send,
752 surf_index, 752 surf_index,
753 msg_control, 753 0, /* LD message ignores sampler unit */
754 GEN7_DATAPORT_DC_DWORD_SCATTERED_READ, 754 GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
755 BRW_DATAPORT_READ_TARGET_DATA_CACHE, 755 rlen,
756 mlen, 756 mlen,
757 inst->header_present, 757 false, /* no header */
758 rlen); 758 simd_mode,
759 0);
759} 760}
760 761
761/** 762/**