broadcom/compiler: simplify ldvary pipelining

We get optimal ldvary pipelining by doing the following:

1) Carefully merge a paired ldvary into the previous instruction when possible.
2) When the above succeeds, flag the ldvary as scheduled immediately so we can merge one of its children into the current instruction.
3) When scheduling ldvary sequences, only pick up instructions that are part of the sequence to avoid picking up something that prevents successful pipelining.

This patch skips 3), accepting some hurt shaders in exchange for better scheduling flexibility during ldvary sequences.

Besides eliminating most of the code dedicated to special handling of ldvary sequences, this also usually allows us to produce better code by merging instructions that are unrelated to ldvary sequences into the ldvary sequences. This is particularly effective at filling the gaps produced when scheduling the first and last ldvary sequences, as well as the gaps produced by flat and noperspective varying sequences that don't have both mul and add instructions.

Notice that there are some hurt shaders, because sometimes the extra scheduler flexibility can lead to picking up instructions that will break a sequence without compensating for that, typically an ldunif that prevents us from doing the fixup for a follow-up ldvary. We will try to correct some of these cases with the next patch.

total instructions in shared programs: 13786037 -> 13760415 (-0.19%)
instructions in affected programs: 3201387 -> 3175765 (-0.80%)
helped: 16155
HURT: 4146
Instructions are helped.

total max-temps in shared programs: 2324834 -> 2322991 (-0.08%)
max-temps in affected programs: 22160 -> 20317 (-8.32%)
helped: 1340
HURT: 103
Max-temps are helped.

total sfu-stalls in shared programs: 30685 -> 31827 (3.72%)
sfu-stalls in affected programs: 782 -> 1924 (146.04%)
helped: 253
HURT: 1416
Inconclusive result.

total inst-and-stalls in shared programs: 13816722 -> 13792242 (-0.18%)
inst-and-stalls in affected programs: 3171642 -> 3147162 (-0.77%)
helped: 15331
HURT: 4179
Inst-and-stalls are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9471>
author: Iago Toral Quiroga <itoral@igalia.com> 2021-03-05 12:36:51 +0100
committer: Marge Bot <eric+marge@anholt.net> 2021-03-10 07:52:22 +0000
commit: 947e9e42cc27481adc9a8626bbc9d5f8c15ad4c3 (patch)
tree: eaf0240dbe5c99e6b5e2f799bfd835341fbf8dc4
parent: d37241bdc4fb23139793b135a00036de496cf9f2 (diff)
3 files changed, 8 insertions, 86 deletions
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 3e13835100d..96bfd86e475 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -922,29 +922,17 @@ emit_fragcoord_input(struct v3d_compile *c, int attr)
 }
 
 static struct qreg
-ldvary_sequence_inst(struct v3d_compile *c, struct qreg result)
-{
-        struct qinst *producer =
-                   (struct qinst *) c->cur_block->instructions.prev;
-        assert(producer);
-        producer->is_ldvary_sequence = true;
-        return result;
-}
-
-static struct qreg
 emit_smooth_varying(struct v3d_compile *c,
                     struct qreg vary, struct qreg w, struct qreg r5)
 {
-        return ldvary_sequence_inst(c, vir_FADD(c,
-               ldvary_sequence_inst(c, vir_FMUL(c, vary, w)), r5));
+        return vir_FADD(c, vir_FMUL(c, vary, w), r5);
 }
 
 static struct qreg
 emit_noperspective_varying(struct v3d_compile *c,
                            struct qreg vary, struct qreg r5)
 {
-        return ldvary_sequence_inst(c, vir_FADD(c,
-               ldvary_sequence_inst(c, vir_MOV(c, vary)), r5));
+        return vir_FADD(c, vir_MOV(c, vary), r5);
 }
 
 static struct qreg
@@ -952,7 +940,7 @@ emit_flat_varying(struct v3d_compile *c,
                   struct qreg vary, struct qreg r5)
 {
         vir_MOV_dest(c, c->undef, vary);
-        return ldvary_sequence_inst(c, vir_MOV(c, r5));
+        return vir_MOV(c, r5);
 }
 
 static struct qreg
@@ -968,7 +956,6 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
                 ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
                                       c->undef, c->undef);
                 ldvary->qpu.sig.ldvary = true;
-                ldvary->is_ldvary_sequence = true;
                 vary = vir_emit_def(c, ldvary);
         } else {
                 vir_NOP(c)->qpu.sig.ldvary = true;
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 7f6ac5af0b4..092b9252f83 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -459,9 +459,7 @@ struct choose_scoreboard {
         int last_uniforms_reset_tick;
         int last_thrsw_tick;
         bool tlb_locked;
-        bool ldvary_pipelining;
         bool fixup_ldvary;
-        int ldvary_count;
 };
 
 static bool
@@ -893,14 +891,6 @@ choose_instruction_to_schedule(struct v3d_compile *c,
 
         list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                             dag.link) {
-                /* If we are scheduling a pipelined varying sequence then
-                 * we want to pick up the next instruction in the sequence.
-                 */
-                if (scoreboard->ldvary_pipelining &&
-                    !n->inst->is_ldvary_sequence) {
-                        continue;
-                }
-
                 const struct v3d_qpu_instr *inst = &n->inst->qpu;
 
 
@@ -991,17 +981,6 @@ choose_instruction_to_schedule(struct v3d_compile *c,
                                             &prev_inst->inst->qpu, inst)) {
                                 continue;
                         }
-
-                        /* If we find an ldvary inside an ongoing pipelineable
-                         * ldvary sequence we want to pick that and start
-                         * pipelining the new sequence into the previous one.
-                         */
-                        if (scoreboard->ldvary_pipelining && inst->sig.ldvary) {
-                                assert(n->inst->is_ldvary_sequence);
-                                scoreboard->ldvary_count++;
-                                scoreboard->fixup_ldvary = true;
-                                return n;
-                        }
                 }
 
                 int prio = get_instruction_priority(c->devinfo, inst);
@@ -1042,51 +1021,11 @@ choose_instruction_to_schedule(struct v3d_compile *c,
                 }
         }
 
-        /* Update ldvary pipelining state */
-        if (chosen) {
-                if (chosen->inst->qpu.sig.ldvary &&
-                    chosen->inst->is_ldvary_sequence) {
-                        scoreboard->ldvary_pipelining =
-                            c->num_inputs > ++scoreboard->ldvary_count;
-                }
-        } else if (scoreboard->ldvary_pipelining) {
-                /* If we are in the middle of an ldvary sequence we only pick
-                 * up instructions that can continue the sequence so we can
-                 * pipeline them, however, if we failed to find anything to
-                 * schedule (!prev_inst) then we can't possibly continue the
-                 * sequence and we need to stop the pipelining process and try
-                 * again.
-                 *
-                 * There is one exception to the above: noperspective or flat
-                 * varyings can cause us to not be able to pick an instruction
-                 * because they need a nop between the ldvary and the next
-                 * instruction to account for the ldvary r5 write latency. We
-                 * can try to detect this by checking if we are also unable to
-                 * schedule an instruction after disabling pipelining.
-                 *
-                 * FIXME: dropping pipelining and picking up another instruction
-                 * could break the sequence for flat/noperspective varyings we
-                 * could've been able to continue if we returned NULL here and
-                 * scheduled a NOP as a result, but detecting this case would
-                 * require us to know in advance that emitting the next NOP will
-                 * guarantee that we will be able to continue the sequence.
-                 *
-                 * If we failed to pair up (prev_inst != NULL), then we disable
-                 * pipelining if we have already scheduled the last ldvary. This
-                 * may allow any other instruction that is not part of an ldvary
-                 * sequence to be merged into the last instruction of the last
-                 * ldvary sequence for optimal results.
-                 */
-                if (!prev_inst) {
-                        scoreboard->ldvary_pipelining = false;
-                        chosen = choose_instruction_to_schedule(c, scoreboard,
-                                                                prev_inst);
-                        scoreboard->ldvary_pipelining = !chosen;
-                } else {
-                        scoreboard->ldvary_pipelining =
-                                c->num_inputs > scoreboard->ldvary_count;
-                }
-        }
+        /* If we are pairing an ldvary, flag it so we can fix it up for optimal
+         * pipelining of ldvary sequences.
+         */
+        if (prev_inst && chosen && chosen->inst->qpu.sig.ldvary)
+                scoreboard->fixup_ldvary = true;
 
         return chosen;
 }
@@ -1741,7 +1680,6 @@ schedule_instructions(struct v3d_compile *c,
                                 }
 
                                 if (scoreboard->fixup_ldvary) {
-                                        assert(scoreboard->ldvary_pipelining);
                                         scoreboard->fixup_ldvary = false;
                                         if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
                                                 /* Flag the ldvary as scheduled
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index fafdf5a208f..12fbb64841f 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -162,9 +162,6 @@ struct qinst {
          * otherwise.
          */
         int uniform;
-
-        /* Set if this instruction participates in a varying setup. */
-        bool is_ldvary_sequence;
 };
 
 enum quniform_contents {
author	Iago Toral Quiroga <itoral@igalia.com>	2021-03-05 12:36:51 +0100
committer	Marge Bot <eric+marge@anholt.net>	2021-03-10 07:52:22 +0000
commit	947e9e42cc27481adc9a8626bbc9d5f8c15ad4c3 (patch)
tree	eaf0240dbe5c99e6b5e2f799bfd835341fbf8dc4
parent	d37241bdc4fb23139793b135a00036de496cf9f2 (diff)