From dca5fc14358a8b267b3854c39c976a822885898f Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 13 Mar 2013 14:48:55 -0700 Subject: i965/fs: Improve performance of varying-index uniform loads on IVB. Like we have done for the VS and for constant-index uniform loads, we use the sampler engine to get caching in front of the L3 to avoid tickling the IVB L3 bug. This is also a bit of a functional change, as we're now loading a vec4 instead of a single dword, though we're not taking advantage of the other 3 components of the vec4 (yet). With the driver hacked to always take the varying-index path for all uniforms, improves performance of my old GLSL demo by 315% +/- 2% (n=4). This is a major fix for some blur shaders in compositors from the varying-index uniforms support I introduced in 9.1. v2: Move old offset computation into the pre-gen7 path. Reviewed-by: Kenneth Graunke Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=61554 NOTE: This is a candidate for the 9.1 branch. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 29 ++++++++++++++++++++++++----- src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 27 ++++++++++++++------------- 2 files changed, 38 insertions(+), 18 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index c60d0418678..703c3c5d8b4 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -235,14 +235,33 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index, exec_list instructions; fs_inst *inst; - fs_reg offset = fs_reg(this, glsl_type::uint_type); - instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset))); - if (intel->gen >= 7) { + /* We have our constant surface use a pitch of 4 bytes, so our index can + * be any component of a vector, and then we load 4 contiguous + * components starting from that. 
+ * + * We break down the const_offset to a portion added to the variable + * offset and a portion done using reg_offset, which means that if you + * have GLSL using something like "uniform vec4 a[20]; gl_FragColor = + * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and + * CSE can later notice that those loads are all the same and eliminate + * the redundant ones. + */ + fs_reg vec4_offset = fs_reg(this, glsl_type::int_type); + instructions.push_tail(ADD(vec4_offset, + varying_offset, const_offset & ~3)); + + fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4), dst.type); inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7, - dst, surf_index, offset); + vec4_result, surf_index, vec4_offset); instructions.push_tail(inst); + + vec4_result.reg_offset += const_offset & 3; + instructions.push_tail(MOV(dst, vec4_result)); } else { + fs_reg offset = fs_reg(this, glsl_type::uint_type); + instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset))); + int base_mrf = 13; bool header_present = true; @@ -313,7 +332,7 @@ fs_inst::equals(fs_inst *inst) int fs_inst::regs_written() { - if (is_tex()) + if (is_tex() || opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7) return 4; /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2, diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp index a729569c840..bc1fef16b01 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp @@ -734,28 +734,29 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst, index.type == BRW_REGISTER_TYPE_UD); uint32_t surf_index = index.dw1.ud; - uint32_t msg_control, rlen, mlen; + uint32_t simd_mode, rlen, mlen; if (dispatch_width == 16) { - msg_control = BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS; - mlen = rlen = 2; + mlen = 2; + rlen = 8; + simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; } else { - msg_control = 
BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS; - mlen = rlen = 1; + mlen = 1; + rlen = 4; + simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; } struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND); brw_set_dest(p, send, dst); brw_set_src0(p, send, offset); - if (intel->gen < 6) - send->header.destreg__conditionalmod = inst->base_mrf; - brw_set_dp_read_message(p, send, + brw_set_sampler_message(p, send, surf_index, - msg_control, - GEN7_DATAPORT_DC_DWORD_SCATTERED_READ, - BRW_DATAPORT_READ_TARGET_DATA_CACHE, + 0, /* LD message ignores sampler unit */ + GEN5_SAMPLER_MESSAGE_SAMPLE_LD, + rlen, mlen, - inst->header_present, - rlen); + false, /* no header */ + simd_mode, + 0); } /** -- cgit v1.2.3