summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEric Anholt <eric@anholt.net>2013-02-05 15:46:22 -0800
committerIan Romanick <ian.d.romanick@intel.com>2013-03-05 14:57:08 -0800
commit597d98bb2cd221d68b17bf352bfc39eaa410aa0b (patch)
treefc0c6a3147289699a50c22372a2716713042dd66
parent783f76e3d5d2dfe04ad83c93c6882685b25ce404 (diff)
i965/fs: Do a general SEND dependency workaround for the original 965.
We'd been ad-hoc inserting instructions in some SEND messages with no knowledge of when it was required (so extra instructions), but not all SENDs (so not often enough). This should do much better than that, though it's still flow-control-ignorant. v2: Use BRW_MAX_MRF instead of magic numbers. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=58960 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> NOTE: Candidate for the stable branches. (cherry picked from commit c37992c54d753e732783f712dea2d483450371dd)
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.cpp225
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.h4
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_emit.cpp42
3 files changed, 229 insertions, 42 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 8dab4317c10..c1ccd92c2da 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -258,6 +258,26 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
return instructions;
}
+/**
+ * A helper for MOV generation for fixing up broken hardware SEND dependency
+ * handling.
+ */
+fs_inst *
+fs_visitor::DEP_RESOLVE_MOV(int grf)
+{
+ fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
+
+ inst->ir = NULL;
+ inst->annotation = "send dependency resolve";
+
+ /* The caller always wants uncompressed to emit the minimal extra
+ * dependencies, and to avoid having to deal with aligning its regs to 2.
+ */
+ inst->force_uncompressed = true;
+
+ return inst;
+}
+
bool
fs_inst::equals(fs_inst *inst)
{
@@ -2228,6 +2248,205 @@ fs_visitor::remove_duplicate_mrf_writes()
return progress;
}
+static void
+clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
+ int first_grf, int grf_len)
+{
+ bool inst_16wide = (dispatch_width > 8 &&
+ !inst->force_uncompressed &&
+ !inst->force_sechalf);
+
+ /* Clear the flag for registers that actually got read (as expected). */
+ for (int i = 0; i < 3; i++) {
+ int grf;
+ if (inst->src[i].file == GRF) {
+ grf = inst->src[i].reg;
+ } else if (inst->src[i].file == FIXED_HW_REG &&
+ inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+ grf = inst->src[i].fixed_hw_reg.nr;
+ } else {
+ continue;
+ }
+
+ if (grf >= first_grf &&
+ grf < first_grf + grf_len) {
+ deps[grf - first_grf] = false;
+ if (inst_16wide)
+ deps[grf - first_grf + 1] = false;
+ }
+ }
+}
+
+/**
+ * Implements this workaround for the original 965:
+ *
+ * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
+ * check for post destination dependencies on this instruction, software
+ * must ensure that there is no destination hazard for the case of ‘write
+ * followed by a posted write’ shown in the following example.
+ *
+ * 1. mov r3 0
+ * 2. send r3.xy <rest of send instruction>
+ * 3. mov r2 r3
+ *
+ * Due to no post-destination dependency check on the ‘send’, the above
+ * code sequence could have two instructions (1 and 2) in flight at the
+ * same time that both consider ‘r3’ as the target of their final writes.
+ */
+void
+fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
+{
+ int write_len = inst->regs_written() * dispatch_width / 8;
+ int first_write_grf = inst->dst.reg;
+ bool needs_dep[BRW_MAX_MRF];
+ assert(write_len < (int)sizeof(needs_dep) - 1);
+
+ memset(needs_dep, false, sizeof(needs_dep));
+ memset(needs_dep, true, write_len);
+
+ clear_deps_for_inst_src(inst, dispatch_width,
+ needs_dep, first_write_grf, write_len);
+
+ /* Walk backwards looking for writes to registers we're writing which
+ * aren't read since being written. If we hit the start of the program,
+ * we assume that there are no outstanding dependencies on entry to the
+ * program.
+ */
+ for (fs_inst *scan_inst = (fs_inst *)inst->prev;
+ scan_inst != NULL;
+ scan_inst = (fs_inst *)scan_inst->prev) {
+
+ /* If we hit control flow, assume that there *are* outstanding
+ * dependencies, and force their cleanup before our instruction.
+ */
+ if (scan_inst->is_control_flow()) {
+ for (int i = 0; i < write_len; i++) {
+ if (needs_dep[i]) {
+ inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
+ }
+ }
+ }
+
+ bool scan_inst_16wide = (dispatch_width > 8 &&
+ !scan_inst->force_uncompressed &&
+ !scan_inst->force_sechalf);
+
+ /* We insert our reads as late as possible on the assumption that any
+ * instruction but a MOV that might have left us an outstanding
+ * dependency has more latency than a MOV.
+ */
+ if (scan_inst->dst.file == GRF &&
+ scan_inst->dst.reg >= first_write_grf &&
+ scan_inst->dst.reg < first_write_grf + write_len &&
+ needs_dep[scan_inst->dst.reg - first_write_grf]) {
+ inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
+ needs_dep[scan_inst->dst.reg - first_write_grf] = false;
+ if (scan_inst_16wide)
+ needs_dep[scan_inst->dst.reg - first_write_grf + 1] = false;
+ }
+
+ /* Clear the flag for registers that actually got read (as expected). */
+ clear_deps_for_inst_src(scan_inst, dispatch_width,
+ needs_dep, first_write_grf, write_len);
+
+ /* Continue the loop only if we haven't resolved all the dependencies */
+ int i;
+ for (i = 0; i < write_len; i++) {
+ if (needs_dep[i])
+ break;
+ }
+ if (i == write_len)
+ return;
+ }
+}
+
+/**
+ * Implements this workaround for the original 965:
+ *
+ * "[DevBW, DevCL] Errata: A destination register from a send can not be
+ * used as a destination register until after it has been sourced by an
+ * instruction with a different destination register.
+ */
+void
+fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
+{
+ int write_len = inst->regs_written() * dispatch_width / 8;
+ int first_write_grf = inst->dst.reg;
+ bool needs_dep[BRW_MAX_MRF];
+ assert(write_len < (int)sizeof(needs_dep) - 1);
+
+ memset(needs_dep, false, sizeof(needs_dep));
+ memset(needs_dep, true, write_len);
+ /* Walk forwards looking for writes to registers we're writing which aren't
+ * read before being written.
+ */
+ for (fs_inst *scan_inst = (fs_inst *)inst->next;
+ !scan_inst->is_tail_sentinel();
+ scan_inst = (fs_inst *)scan_inst->next) {
+ /* If we hit control flow, force resolve all remaining dependencies. */
+ if (scan_inst->is_control_flow()) {
+ for (int i = 0; i < write_len; i++) {
+ if (needs_dep[i])
+ scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
+ }
+ }
+
+ /* Clear the flag for registers that actually got read (as expected). */
+ clear_deps_for_inst_src(scan_inst, dispatch_width,
+ needs_dep, first_write_grf, write_len);
+
+ /* We insert our reads as late as possible since they're reading the
+ * result of a SEND, which has massive latency.
+ */
+ if (scan_inst->dst.file == GRF &&
+ scan_inst->dst.reg >= first_write_grf &&
+ scan_inst->dst.reg < first_write_grf + write_len &&
+ needs_dep[scan_inst->dst.reg - first_write_grf]) {
+ scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
+ needs_dep[scan_inst->dst.reg - first_write_grf] = false;
+ }
+
+ /* Continue the loop only if we haven't resolved all the dependencies */
+ int i;
+ for (i = 0; i < write_len; i++) {
+ if (needs_dep[i])
+ break;
+ }
+ if (i == write_len)
+ return;
+ }
+
+ /* If we hit the end of the program, resolve all remaining dependencies out
+ * of paranoia.
+ */
+ fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
+ assert(last_inst->eot);
+ for (int i = 0; i < write_len; i++) {
+ if (needs_dep[i])
+ last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
+ }
+}
+
+void
+fs_visitor::insert_gen4_send_dependency_workarounds()
+{
+ if (intel->gen != 4 || intel->is_g4x)
+ return;
+
+ /* Note that we're done with register allocation, so GRF fs_regs always
+ * have a .reg_offset of 0.
+ */
+
+ foreach_list_safe(node, &this->instructions) {
+ fs_inst *inst = (fs_inst *)node;
+
+ if (inst->mlen != 0 && inst->dst.file == GRF) {
+ insert_gen4_pre_send_dependency_workarounds(inst);
+ insert_gen4_post_send_dependency_workarounds(inst);
+ }
+ }
+}
+
void
fs_visitor::dump_instruction(fs_inst *inst)
{
@@ -2522,6 +2741,12 @@ fs_visitor::run()
assert(force_uncompressed_stack == 0);
assert(force_sechalf_stack == 0);
+ /* This must come after all optimization and register allocation, since
+ * it inserts dead code that happens to have side effects, and it does
+ * so based on the actual physical registers in use.
+ */
+ insert_gen4_send_dependency_workarounds();
+
if (failed)
return false;
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 88fecb90494..d5ebd515cbb 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -285,6 +285,7 @@ public:
fs_inst *IF(fs_reg src0, fs_reg src1, uint32_t condition);
fs_inst *CMP(fs_reg dst, fs_reg src0, fs_reg src1,
uint32_t condition);
+ fs_inst *DEP_RESOLVE_MOV(int grf);
int type_size(const struct glsl_type *type);
fs_inst *get_instruction_generating_reg(fs_inst *start,
@@ -329,6 +330,9 @@ public:
bool remove_duplicate_mrf_writes();
bool virtual_grf_interferes(int a, int b);
void schedule_instructions(bool post_reg_alloc);
+ void insert_gen4_send_dependency_workarounds();
+ void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst);
+ void insert_gen4_post_send_dependency_workarounds(fs_inst *inst);
void fail(const char *msg, ...);
void push_force_uncompressed();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index 45072da6a67..365a2ec3546 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -604,29 +604,8 @@ fs_generator::generate_unspill(fs_inst *inst, struct brw_reg dst)
{
assert(inst->mlen != 0);
- /* Clear any post destination dependencies that would be ignored by
- * the block read. See the B-Spec for pre-gen5 send instruction.
- *
- * This could use a better solution, since texture sampling and
- * math reads could potentially run into it as well -- anywhere
- * that we have a SEND with a destination that is a register that
- * was written but not read within the last N instructions (what's
- * N? unsure). This is rare because of dead code elimination, but
- * not impossible.
- */
- if (intel->gen == 4 && !intel->is_g4x)
- brw_MOV(p, brw_null_reg(), dst);
-
brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
inst->offset);
-
- if (intel->gen == 4 && !intel->is_g4x) {
- /* gen4 errata: destination from a send can't be used as a
- * destination until it's been read. Just read it so we don't
- * have to worry.
- */
- brw_MOV(p, brw_null_reg(), dst);
- }
}
void
@@ -637,19 +616,6 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
{
assert(inst->mlen != 0);
- /* Clear any post destination dependencies that would be ignored by
- * the block read. See the B-Spec for pre-gen5 send instruction.
- *
- * This could use a better solution, since texture sampling and
- * math reads could potentially run into it as well -- anywhere
- * that we have a SEND with a destination that is a register that
- * was written but not read within the last N instructions (what's
- * N? unsure). This is rare because of dead code elimination, but
- * not impossible.
- */
- if (intel->gen == 4 && !intel->is_g4x)
- brw_MOV(p, brw_null_reg(), dst);
-
assert(index.file == BRW_IMMEDIATE_VALUE &&
index.type == BRW_REGISTER_TYPE_UD);
uint32_t surf_index = index.dw1.ud;
@@ -660,14 +626,6 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
read_offset, surf_index);
-
- if (intel->gen == 4 && !intel->is_g4x) {
- /* gen4 errata: destination from a send can't be used as a
- * destination until it's been read. Just read it so we don't
- * have to worry.
- */
- brw_MOV(p, brw_null_reg(), dst);
- }
}
void