summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEric Anholt <eric@anholt.net>2013-02-15 19:26:48 -0800
committerIan Romanick <ian.d.romanick@intel.com>2013-03-05 14:57:09 -0800
commit808e01236bd2e12a4ca4beb82a828e5776e38f90 (patch)
tree927f615180779d05e3a3e0fab510ebe84312c0ad
parentc5f63bedacc770e559468408ad6cede1a8772bf9 (diff)
i965/fs: Delay setup of uniform loads until after pre-regalloc scheduling.
This should fix the register allocation explosion on the GLES 3.0 test on gen6. It also gives us an instruction that will fit our CSE handling. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> NOTE: This is a candidate for the 9.1 branch. (cherry picked from commit aebd3f46e305829ebfcc817cafa8592edc2f80ab)
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.cpp64
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.h1
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_visitor.cpp28
3 files changed, 66 insertions, 27 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index c1ccd92c2da..200307e3d9b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1710,8 +1710,6 @@ fs_visitor::setup_pull_constants()
dst, index, offset);
pull->ir = inst->ir;
pull->annotation = inst->annotation;
- pull->base_mrf = 14;
- pull->mlen = 1;
inst->insert_before(pull);
@@ -2447,6 +2445,66 @@ fs_visitor::insert_gen4_send_dependency_workarounds()
}
}
+/**
+ * Turns the generic expression-style uniform pull constant load instruction
+ * into a hardware-specific series of instructions for loading a pull
+ * constant.
+ *
+ * The expression style allows the CSE pass before this to optimize out
+ * repeated loads from the same offset, and gives the pre-register-allocation
+ * scheduling full flexibility, while the conversion to native instructions
+ * allows the post-register-allocation scheduler the best information
+ * possible.
+ */
+void
+fs_visitor::lower_uniform_pull_constant_loads()
+{
+ foreach_list(node, &this->instructions) {
+ fs_inst *inst = (fs_inst *)node;
+
+ if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
+ continue;
+
+ if (intel->gen >= 7) {
+ fs_reg const_offset_reg = inst->src[1];
+ assert(const_offset_reg.file == IMM &&
+ const_offset_reg.type == BRW_REGISTER_TYPE_UD);
+ const_offset_reg.imm.u /= 16;
+ fs_reg payload = fs_reg(this, glsl_type::uint_type);
+ struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
+ BRW_REGISTER_TYPE_UD);
+
+ fs_inst *setup1 = MOV(payload, fs_reg(g0));
+ setup1->force_writemask_all = true;
+ /* We don't need the second half of this vgrf to be filled with g1
+ * in the 16-wide case, but if we use force_uncompressed then live
+ * variable analysis won't consider this a def!
+ */
+
+ fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
+ payload, payload,
+ const_offset_reg);
+
+ setup1->ir = inst->ir;
+ setup1->annotation = inst->annotation;
+ inst->insert_before(setup1);
+ setup2->ir = inst->ir;
+ setup2->annotation = inst->annotation;
+ inst->insert_before(setup2);
+ inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
+ inst->src[1] = payload;
+ } else {
+ /* Before register allocation, we didn't tell the scheduler about the
+ * MRF we use. We know it's safe to use this MRF because nothing
+ * else does except for register spill/unspill, which generates and
+ * uses its MRF within a single IR instruction.
+ */
+ inst->base_mrf = 14;
+ inst->mlen = 1;
+ }
+ }
+}
+
void
fs_visitor::dump_instruction(fs_inst *inst)
{
@@ -2719,6 +2777,8 @@ fs_visitor::run()
schedule_instructions(false);
+ lower_uniform_pull_constant_loads();
+
assign_curb_setup();
assign_urb_setup();
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index d5ebd515cbb..d1bb111bf5f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -334,6 +334,7 @@ public:
void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst);
void insert_gen4_post_send_dependency_workarounds(fs_inst *inst);
void fail(const char *msg, ...);
+ void lower_uniform_pull_constant_loads();
void push_force_uncompressed();
void pop_force_uncompressed();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index d4f6fc9ca7e..573921cf8cc 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -597,31 +597,9 @@ fs_visitor::visit(ir_expression *ir)
fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
packed_consts.type = result.type;
- if (intel->gen >= 7) {
- fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] / 16);
- fs_reg payload = fs_reg(this, glsl_type::uint_type);
- struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
- BRW_REGISTER_TYPE_UD);
- fs_inst *setup = emit(MOV(payload, fs_reg(g0)));
- setup->force_writemask_all = true;
- /* We don't need the second half of this vgrf to be filled with g1
- * in the 16-wide case, but if we use force_uncompressed then live
- * variable analysis won't consider this a def!
- */
-
- emit(FS_OPCODE_SET_GLOBAL_OFFSET, payload,
- payload, const_offset_reg);
- emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7, packed_consts,
- surf_index, payload);
- } else {
- fs_reg const_offset_reg = fs_reg(const_offset->value.u[0]);
- fs_inst *pull = emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
- packed_consts,
- surf_index,
- const_offset_reg));
- pull->base_mrf = 14;
- pull->mlen = 1;
- }
+ fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
+ emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+ packed_consts, surf_index, const_offset_reg));
packed_consts.smear = const_offset->value.u[0] % 16 / 4;
for (int i = 0; i < ir->type->vector_elements; i++) {