diff options
author | Eric Anholt <eric@anholt.net> | 2013-03-13 14:48:55 -0700 |
---|---|---|
committer | Eric Anholt <eric@anholt.net> | 2013-04-01 16:17:25 -0700 |
commit | dca5fc14358a8b267b3854c39c976a822885898f (patch) | |
tree | 3cc2d6bd82d31e722735d2cd592ac42909f899e7 | |
parent | bc0e1591f64b8b3f2693fceaaa8bba9198e26171 (diff) |
i965/fs: Improve performance of varying-index uniform loads on IVB.
Like we have done for the VS and for constant-index uniform loads, we use
the sampler engine to get caching in front of the L3 to avoid tickling the
IVB L3 bug. This is also a bit of a functional change, as we're now
loading a vec4 instead of a single dword, though we're not taking
advantage of the other 3 components of the vec4 (yet).
With the driver hacked to always take the varying-index path for all
uniforms, this improves performance of my old GLSL demo by 315% +/- 2% (n=4).
This is a major fix for some blur shaders in compositors from the
varying-index uniforms support I introduced in 9.1.
v2: Move old offset computation into the pre-gen7 path.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=61554
NOTE: This is a candidate for the 9.1 branch.
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 29 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 27 |
2 files changed, 38 insertions, 18 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index c60d0418678..703c3c5d8b4 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp | |||
@@ -235,14 +235,33 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index, | |||
235 | exec_list instructions; | 235 | exec_list instructions; |
236 | fs_inst *inst; | 236 | fs_inst *inst; |
237 | 237 | ||
238 | fs_reg offset = fs_reg(this, glsl_type::uint_type); | ||
239 | instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset))); | ||
240 | |||
241 | if (intel->gen >= 7) { | 238 | if (intel->gen >= 7) { |
239 | /* We have our constant surface use a pitch of 4 bytes, so our index can | ||
240 | * be any component of a vector, and then we load 4 contiguous | ||
241 | * components starting from that. | ||
242 | * | ||
243 | * We break down the const_offset to a portion added to the variable | ||
244 | * offset and a portion done using reg_offset, which means that if you | ||
245 | * have GLSL using something like "uniform vec4 a[20]; gl_FragColor = | ||
246 | * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and | ||
247 | * CSE can later notice that those loads are all the same and eliminate | ||
248 | * the redundant ones. | ||
249 | */ | ||
250 | fs_reg vec4_offset = fs_reg(this, glsl_type::int_type); | ||
251 | instructions.push_tail(ADD(vec4_offset, | ||
252 | varying_offset, const_offset & ~3)); | ||
253 | |||
254 | fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4), dst.type); | ||
242 | inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7, | 255 | inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7, |
243 | dst, surf_index, offset); | 256 | vec4_result, surf_index, vec4_offset); |
244 | instructions.push_tail(inst); | 257 | instructions.push_tail(inst); |
258 | |||
259 | vec4_result.reg_offset += const_offset & 3; | ||
260 | instructions.push_tail(MOV(dst, vec4_result)); | ||
245 | } else { | 261 | } else { |
262 | fs_reg offset = fs_reg(this, glsl_type::uint_type); | ||
263 | instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset))); | ||
264 | |||
246 | int base_mrf = 13; | 265 | int base_mrf = 13; |
247 | bool header_present = true; | 266 | bool header_present = true; |
248 | 267 | ||
@@ -313,7 +332,7 @@ fs_inst::equals(fs_inst *inst) | |||
313 | int | 332 | int |
314 | fs_inst::regs_written() | 333 | fs_inst::regs_written() |
315 | { | 334 | { |
316 | if (is_tex()) | 335 | if (is_tex() || opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7) |
317 | return 4; | 336 | return 4; |
318 | 337 | ||
319 | /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2, | 338 | /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2, |
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp index a729569c840..bc1fef16b01 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp | |||
@@ -734,28 +734,29 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst, | |||
734 | index.type == BRW_REGISTER_TYPE_UD); | 734 | index.type == BRW_REGISTER_TYPE_UD); |
735 | uint32_t surf_index = index.dw1.ud; | 735 | uint32_t surf_index = index.dw1.ud; |
736 | 736 | ||
737 | uint32_t msg_control, rlen, mlen; | 737 | uint32_t simd_mode, rlen, mlen; |
738 | if (dispatch_width == 16) { | 738 | if (dispatch_width == 16) { |
739 | msg_control = BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS; | 739 | mlen = 2; |
740 | mlen = rlen = 2; | 740 | rlen = 8; |
741 | simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; | ||
741 | } else { | 742 | } else { |
742 | msg_control = BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS; | 743 | mlen = 1; |
743 | mlen = rlen = 1; | 744 | rlen = 4; |
745 | simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; | ||
744 | } | 746 | } |
745 | 747 | ||
746 | struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND); | 748 | struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND); |
747 | brw_set_dest(p, send, dst); | 749 | brw_set_dest(p, send, dst); |
748 | brw_set_src0(p, send, offset); | 750 | brw_set_src0(p, send, offset); |
749 | if (intel->gen < 6) | 751 | brw_set_sampler_message(p, send, |
750 | send->header.destreg__conditionalmod = inst->base_mrf; | ||
751 | brw_set_dp_read_message(p, send, | ||
752 | surf_index, | 752 | surf_index, |
753 | msg_control, | 753 | 0, /* LD message ignores sampler unit */ |
754 | GEN7_DATAPORT_DC_DWORD_SCATTERED_READ, | 754 | GEN5_SAMPLER_MESSAGE_SAMPLE_LD, |
755 | BRW_DATAPORT_READ_TARGET_DATA_CACHE, | 755 | rlen, |
756 | mlen, | 756 | mlen, |
757 | inst->header_present, | 757 | false, /* no header */ |
758 | rlen); | 758 | simd_mode, |
759 | 0); | ||
759 | } | 760 | } |
760 | 761 | ||
761 | /** | 762 | /** |