author     Iago Toral Quiroga <itoral@igalia.com>    2021-02-26 12:31:52 +0100
committer  Iago Toral Quiroga <itoral@igalia.com>    2021-03-02 07:56:00 +0100
commit     1784dd22a32dccff0fee0428f7cf7fea8dccc574 (patch)
tree       9f3b9aef1cb3d58c8e154ad16d4cce804e4dcfb3
parent     1d021539a24736b4f3e9149dc6b6e4963105cfc1 (diff)
broadcom/compiler: pipeline smooth ldvary sequences
Typically, we would schedule smooth varyings like this:

nop                ; nop              ; ldvary.r4
nop                ; fmul r0, r4, rf0
fadd rf13, r0, r5  ; nop              ; ldvary.r1
nop                ; fmul r2, r1, rf0
fadd rf12, r2, r5  ; nop              ; ldvary.r3
nop                ; fmul r4, r3, rf0
fadd rf11, r4, r5  ; nop              ; ldvary.r0

where we pair up an ldvary with the fadd of the previous sequence instead
of the previous fmul. This is because ldvary has an implicit write to r5
which is read by the fadd of the previous sequence, so our dependency
tracking doesn't allow us to move the ldvary before that fadd. However,
the r5 write of an ldvary actually happens in the instruction after the
one in which the signal is emitted, so we can move the ldvary up to the
fmul and its r5 write will still land in the same instruction as the
fadd, which is fine.

This patch allows us to pipeline these sequences optimally. For that,
after merging an ldvary into a previous instruction in the middle of a
pipelineable ldvary sequence, we check if we can manually move it to the
last scheduled instruction instead (the one before the instruction we are
currently scheduling). If we are successful at moving the ldvary to the
previous instruction, we flag the ldvary as scheduled immediately, which
may promote its children (the follow-up fmul instruction for that ldvary)
to DAG heads, and continue the merge loop so that fmul can be picked and
merged into the final fadd of the previous sequence (where we had
originally merged the ldvary). This leads to a result that looks like
this:

nop                ; nop              ; ldvary.r4
nop                ; fmul r0, r4, rf0 ; ldvary.r1
fadd rf13, r0, r5  ; fmul r2, r1, rf0 ; ldvary.r3
fadd rf12, r2, r5  ; fmul r4, r3, rf0 ; ldvary.r0

Shader-db results:

total instructions in shared programs: 14071591 -> 13820690 (-1.78%)
instructions in affected programs: 7809692 -> 7558791 (-3.21%)
helped: 41209
HURT: 4528
Instructions are helped.

total max-temps in shared programs: 2335784 -> 2326435 (-0.40%)
max-temps in affected programs: 84302 -> 74953 (-11.09%)
helped: 4561
HURT: 293
Max-temps are helped.

total sfu-stalls in shared programs: 31537 -> 30683 (-2.71%)
sfu-stalls in affected programs: 3551 -> 2697 (-24.05%)
helped: 1713
HURT: 750
Sfu-stalls are helped.

total inst-and-stalls in shared programs: 14103128 -> 13851373 (-1.79%)
inst-and-stalls in affected programs: 7820726 -> 7568971 (-3.22%)
helped: 41411
HURT: 4535
Inst-and-stalls are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9304>
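[Editor's note] For readers unfamiliar with the scheduler, here is a minimal, self-contained sketch of the legality check behind this idea: the ldvary signal can only be hoisted into the previous instruction if neither instruction conflicts with the ldvary destination. All struct and function names below are invented for illustration and do not match Mesa's real v3d_qpu_instr API; the actual (and more thorough) implementation is fixup_pipelined_ldvary() in the diff below.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy instruction model: only the fields needed to illustrate the check.
 * These names are made up for this sketch and do not exist in Mesa.
 */
struct toy_instr {
        bool     has_ldvary;   /* an ldvary signal is merged into this instruction */
        uint32_t ldvary_dest;  /* register written by the ldvary signal */
        uint32_t src[2];       /* registers read by the ALU operation */
        uint32_t dst;          /* register written by the ALU operation */
        bool     uses_flags;   /* instruction reads or writes condition flags */
};

/* Returns true if 'inst' reads register 'reg' as an ALU source. */
static bool
toy_reads_reg(const struct toy_instr *inst, uint32_t reg)
{
        return inst->src[0] == reg || inst->src[1] == reg;
}

/* Try to move the ldvary signal from 'inst' up to 'prev'. The move is legal
 * only if 'inst' does not read the ldvary destination (otherwise the earlier
 * write would clobber a source), 'prev' does not write that same register,
 * and 'prev' does not use flags (ldvary reuses the cond field to encode its
 * destination).
 */
static bool
toy_move_ldvary_to_prev(struct toy_instr *prev, struct toy_instr *inst)
{
        if (!inst->has_ldvary)
                return false;

        if (toy_reads_reg(inst, inst->ldvary_dest))
                return false;

        if (prev->dst == inst->ldvary_dest)
                return false;

        if (prev->uses_flags)
                return false;

        prev->has_ldvary = true;
        prev->ldvary_dest = inst->ldvary_dest;
        inst->has_ldvary = false;
        return true;
}

int
main(void)
{
        /* prev: "fmul r0, r4, rf0", inst: "fadd rf13, r0, r5 ; ldvary.r1",
         * using an arbitrary flat register numbering for the sketch.
         */
        struct toy_instr prev = { .dst = 0, .src = { 4, 100 } };
        struct toy_instr inst = { .has_ldvary = true, .ldvary_dest = 1,
                                  .dst = 113, .src = { 0, 5 } };

        printf("ldvary moved to prev: %s\n",
               toy_move_ldvary_to_prev(&prev, &inst) ? "yes" : "no");
        return 0;
}

Running this prints "ldvary moved to prev: yes", matching the fadd/fmul/ldvary example from the commit message above.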
-rw-r--r--  src/broadcom/compiler/qpu_schedule.c  205
1 file changed, 199 insertions, 6 deletions
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 4d23a80c15a..571a89fb7be 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -459,6 +459,8 @@ struct choose_scoreboard {
int last_uniforms_reset_tick;
int last_thrsw_tick;
bool tlb_locked;
+ bool ldvary_pipelining;
+ bool fixup_ldvary;
};
static bool
@@ -890,6 +892,20 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
dag.link) {
+ /* If we are scheduling a pipelined smooth varying sequence then
+ * we want to pick up the next instruction in the sequence.
+ */
+ if (scoreboard->ldvary_pipelining &&
+ !n->inst->ldvary_pipelining) {
+ continue;
+ }
+
+ /* Sanity check: if we are scheduling a smooth ldvary sequence
+ * we cannot be starting another sequence in the middle of it.
+ */
+ assert(!scoreboard->ldvary_pipelining ||
+ !n->inst->ldvary_pipelining_start);
+
const struct v3d_qpu_instr *inst = &n->inst->qpu;
/* Simulator complains if we have two uniforms loaded in the
@@ -946,12 +962,6 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
* sooner. If the ldvary's r5 wasn't used, then ldunif might
* otherwise get scheduled so ldunif and ldvary try to update
* r5 in the same tick.
- *
- * XXX perf: To get good pipelining of a sequence of varying
- * loads, we need to figure out how to pair the ldvary signal
- * up to the instruction before the last r5 user in the
- * previous ldvary sequence. Currently, it usually pairs with
- * the last r5 user.
*/
if ((inst->sig.ldunif || inst->sig.ldunifa) &&
scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
@@ -985,6 +995,16 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
&prev_inst->inst->qpu, inst)) {
continue;
}
+
+ /* If we find an ldvary inside an ongoing pipelineable
+ * ldvary sequence we want to pick that and start
+ * pipelining the new sequence into the previous one.
+ */
+ if (scoreboard->ldvary_pipelining && inst->sig.ldvary) {
+ assert(n->inst->ldvary_pipelining);
+ scoreboard->fixup_ldvary = true;
+ return n;
+ }
}
int prio = get_instruction_priority(devinfo, inst);
@@ -1025,6 +1045,26 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
}
}
+ /* If we are in the middle of an ldvary sequence we only pick up
+ * instructions that can continue the sequence so we can pipeline
+ * them. However, if we fail to find anything to schedule, we can't
+ * possibly continue the sequence, so we stop the pipelining process
+ * and try again.
+ */
+ if (scoreboard->ldvary_pipelining && !prev_inst && !chosen) {
+ scoreboard->ldvary_pipelining = false;
+ chosen = choose_instruction_to_schedule(devinfo, scoreboard, prev_inst);
+ } else if (chosen) {
+ if (scoreboard->ldvary_pipelining) {
+ assert(chosen->inst->ldvary_pipelining);
+ if (chosen->inst->ldvary_pipelining_end)
+ scoreboard->ldvary_pipelining = false;
+ } else if (chosen->inst->ldvary_pipelining_start) {
+ assert(chosen->inst->qpu.sig.ldvary);
+ scoreboard->ldvary_pipelining = true;
+ }
+ }
+
return chosen;
}
@@ -1460,6 +1500,144 @@ emit_thrsw(struct v3d_compile *c,
return time;
}
+static bool
+alu_reads_register(struct v3d_qpu_instr *inst,
+ bool add, bool magic, uint32_t index)
+{
+ uint32_t num_src;
+ enum v3d_qpu_mux mux_a, mux_b;
+
+ if (add) {
+ num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
+ mux_a = inst->alu.add.a;
+ mux_b = inst->alu.add.b;
+ } else {
+ num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
+ mux_a = inst->alu.mul.a;
+ mux_b = inst->alu.mul.b;
+ }
+
+ for (int i = 0; i < num_src; i++) {
+ if (magic) {
+ if (i == 0 && mux_a == index)
+ return true;
+ if (i == 1 && mux_b == index)
+ return true;
+ } else {
+ if (i == 0 && mux_a == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 0 && mux_a == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/**
+ * This takes an ldvary signal merged into 'inst' and tries to move it up to
+ * the previous instruction to get good pipelining of ldvary sequences,
+ * transforming this:
+ *
+ * nop ; nop ; ldvary.r4
+ * nop ; fmul r0, r4, rf0 ;
+ * fadd rf13, r0, r5 ; nop ; ldvary.r1 <-- inst
+ *
+ * into:
+ *
+ * nop ; nop ; ldvary.r4
+ * nop ; fmul r0, r4, rf0 ; ldvary.r1
+ * fadd rf13, r0, r5 ; nop ; <-- inst
+ *
+ * If we manage to do this successfully (we return true here), then flagging
+ * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
+ * we will be able to pick up to merge into 'inst', leading to code like this:
+ *
+ * nop ; nop ; ldvary.r4
+ * nop ; fmul r0, r4, rf0 ; ldvary.r1
+ * fadd rf13, r0, r5 ; fmul r2, r1, rf0 ; <-- inst
+ */
+static bool
+fixup_pipelined_ldvary(struct v3d_compile *c,
+ struct choose_scoreboard *scoreboard,
+ struct qblock *block,
+ struct v3d_qpu_instr *inst)
+{
+ /* We only call this if we have successfully merged an ldvary into a
+ * previous instruction.
+ */
+ assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
+ assert(inst->sig.ldvary);
+ uint32_t ldvary_magic = inst->sig_magic;
+ uint32_t ldvary_index = inst->sig_addr;
+
+ /* The instruction in which we merged the ldvary must not read the
+ * ldvary destination: if it did, moving the ldvary one instruction
+ * earlier would overwrite that source before it is read.
+ */
+ if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
+ return false;
+ if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
+ return false;
+
+ /* The previous instruction can't write to the same destination as the
+ * ldvary.
+ */
+ struct qinst *prev = (struct qinst *) block->instructions.prev;
+ if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+ return false;
+
+ if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
+ if (prev->qpu.alu.add.magic_write == ldvary_magic &&
+ prev->qpu.alu.add.waddr == ldvary_index) {
+ return false;
+ }
+ }
+
+ if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
+ if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
+ prev->qpu.alu.mul.waddr == ldvary_index) {
+ return false;
+ }
+ }
+
+ /* The previous instruction cannot have a conflicting signal */
+ if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
+ return false;
+
+ /* The previous instruction cannot use flags since ldvary uses the
+ * 'cond' instruction field to store the destination.
+ */
+ if (v3d_qpu_writes_flags(&prev->qpu))
+ return false;
+
+ /* Move the ldvary to the previous instruction and remove it from the
+ * current one.
+ */
+ prev->qpu.sig.ldvary = true;
+ prev->qpu.sig_magic = ldvary_magic;
+ prev->qpu.sig_addr = ldvary_index;
+ scoreboard->last_ldvary_tick = scoreboard->tick - 1;
+
+ inst->sig.ldvary = false;
+ inst->sig_magic = false;
+ inst->sig_addr = 0;
+
+ return true;
+}
+
static uint32_t
schedule_instructions(struct v3d_compile *c,
struct choose_scoreboard *scoreboard,
@@ -1530,6 +1708,21 @@ schedule_instructions(struct v3d_compile *c,
v3d_qpu_dump(devinfo, inst);
fprintf(stderr, "\n");
}
+
+ if (scoreboard->fixup_ldvary) {
+ assert(scoreboard->ldvary_pipelining);
+ scoreboard->fixup_ldvary = false;
+ if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
+ /* Flag the ldvary as scheduled
+ * now so we can try to merge the
+ * follow-up fmul into the current
+ * instruction.
+ */
+ mark_instruction_scheduled(
+ devinfo, scoreboard->dag,
+ time, merge);
+ }
+ }
}
if (mux_read_stalls(scoreboard, inst))
c->qpu_inst_stalled_count++;
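[Editor's note] As a companion sketch (again with invented names, not Mesa's API), the following models how the scheduler toggles the pipelining state in the choose_instruction_to_schedule() hunks above: while a sequence is being pipelined, only nodes flagged as part of it are eligible, and the mode is dropped when the sequence ends or nothing eligible is found.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Invented, simplified model of the ldvary-pipelining bookkeeping. */
struct toy_node {
        const char *name;
        bool in_ldvary_sequence;  /* node continues the current sequence */
        bool starts_sequence;     /* node is the ldvary opening a new sequence */
        bool ends_sequence;       /* node is the last instruction of a sequence */
};

struct toy_scoreboard {
        bool ldvary_pipelining;   /* currently in the middle of a sequence */
};

/* While pipelining, only nodes that continue the current sequence are
 * eligible. If none is found, drop out of pipelining mode and pick again
 * without the restriction. Update the state based on whether the chosen
 * node starts or ends a sequence.
 */
static struct toy_node *
toy_choose(struct toy_scoreboard *sb, struct toy_node *heads, size_t count)
{
        struct toy_node *chosen = NULL;

        for (size_t i = 0; i < count; i++) {
                if (sb->ldvary_pipelining && !heads[i].in_ldvary_sequence)
                        continue;
                chosen = &heads[i];
                break;
        }

        if (sb->ldvary_pipelining && !chosen) {
                sb->ldvary_pipelining = false;
                return toy_choose(sb, heads, count);
        }

        if (chosen) {
                if (sb->ldvary_pipelining && chosen->ends_sequence)
                        sb->ldvary_pipelining = false;
                else if (!sb->ldvary_pipelining && chosen->starts_sequence)
                        sb->ldvary_pipelining = true;
        }

        return chosen;
}

int
main(void)
{
        struct toy_scoreboard sb = { .ldvary_pipelining = true };
        struct toy_node heads[] = {
                { .name = "unrelated add" },  /* skipped while pipelining */
                { .name = "fmul of sequence", .in_ldvary_sequence = true },
        };

        struct toy_node *n = toy_choose(&sb, heads, 2);
        printf("chosen: %s (pipelining now %s)\n",
               n ? n->name : "none", sb.ldvary_pipelining ? "on" : "off");
        return 0;
}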