diff options
Diffstat (limited to 'src/broadcom/compiler/qpu_schedule.c')
-rw-r--r-- | src/broadcom/compiler/qpu_schedule.c | 1158 |
1 files changed, 902 insertions, 256 deletions
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index c559814b9ea..ba76ac87e1e 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -85,6 +85,7 @@ struct schedule_state { struct schedule_node *last_unif; struct schedule_node *last_rtop; struct schedule_node *last_unifa; + struct schedule_node *last_setmsf; enum direction dir; /* Estimated cycle when the current instruction would start. */ uint32_t time; @@ -97,7 +98,7 @@ add_dep(struct schedule_state *state, bool write) { bool write_after_read = !write && state->dir == R; - void *edge_data = (void *)(uintptr_t)write_after_read; + uintptr_t edge_data = write_after_read; if (!before || !after) return; @@ -136,12 +137,14 @@ qpu_inst_is_tlb(const struct v3d_qpu_instr *inst) if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return false; - if (inst->alu.add.magic_write && + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && (inst->alu.add.waddr == V3D_QPU_WADDR_TLB || inst->alu.add.waddr == V3D_QPU_WADDR_TLBU)) return true; - if (inst->alu.mul.magic_write && + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB || inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU)) return true; @@ -153,12 +156,13 @@ static void process_mux_deps(struct schedule_state *state, struct schedule_node *n, enum v3d_qpu_mux mux) { + assert(state->devinfo->ver < 71); switch (mux) { case V3D_QPU_MUX_A: add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); break; case V3D_QPU_MUX_B: - if (!n->inst->qpu.sig.small_imm) { + if (!n->inst->qpu.sig.small_imm_b) { add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n); } @@ -169,6 +173,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n, } } + +static void +process_raddr_deps(struct schedule_state *state, struct schedule_node *n, + uint8_t raddr, bool is_small_imm) +{ + assert(state->devinfo->ver >= 71); + + if 
(!is_small_imm) + add_read_dep(state, state->last_rf[raddr], n); +} + static bool tmu_write_is_sequence_terminator(uint32_t waddr) { @@ -188,9 +203,6 @@ tmu_write_is_sequence_terminator(uint32_t waddr) static bool can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr) { - if (devinfo->ver < 40) - return false; - if (tmu_write_is_sequence_terminator(waddr)) return false; @@ -253,8 +265,7 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n, break; case V3D_QPU_WADDR_UNIFA: - if (state->devinfo->ver >= 40) - add_write_dep(state, &state->last_unifa, n); + add_write_dep(state, &state->last_unifa, n); break; case V3D_QPU_WADDR_NOP: @@ -283,6 +294,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) /* If the input and output segments are shared, then all VPM reads to * a location need to happen before all writes. We handle this by * serializing all VPM operations for now. + * + * FIXME: we are assuming that the segments are shared. That is + * correct right now as we are only using shared, but technically you + * can choose. 
*/ bool separate_vpm_segment = false; @@ -303,15 +318,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) /* XXX: LOAD_IMM */ - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) - process_mux_deps(state, n, inst->alu.add.a); - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) - process_mux_deps(state, n, inst->alu.add.b); + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.add.a.mux); + } else { + process_raddr_deps(state, n, inst->alu.add.a.raddr, + inst->sig.small_imm_a); + } + } + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.add.b.mux); + } else { + process_raddr_deps(state, n, inst->alu.add.b.raddr, + inst->sig.small_imm_b); + } + } - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) - process_mux_deps(state, n, inst->alu.mul.a); - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) - process_mux_deps(state, n, inst->alu.mul.b); + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.mul.a.mux); + } else { + process_raddr_deps(state, n, inst->alu.mul.a.raddr, + inst->sig.small_imm_c); + } + } + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.mul.b.mux); + } else { + process_raddr_deps(state, n, inst->alu.mul.b.raddr, + inst->sig.small_imm_d); + } + } switch (inst->alu.add.op) { case V3D_QPU_A_VPMSETUP: @@ -340,13 +379,24 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) case V3D_QPU_A_MSF: add_read_dep(state, state->last_tlb, n); + add_read_dep(state, state->last_setmsf, n); break; case V3D_QPU_A_SETMSF: + add_write_dep(state, &state->last_setmsf, n); + add_write_dep(state, &state->last_tmu_write, n); + FALLTHROUGH; case V3D_QPU_A_SETREVF: add_write_dep(state, &state->last_tlb, n); break; + case V3D_QPU_A_BALLOT: + case V3D_QPU_A_BCASTF: + case 
V3D_QPU_A_ALLEQ: + case V3D_QPU_A_ALLFEQ: + add_read_dep(state, state->last_setmsf, n); + break; + default: break; } @@ -384,6 +434,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) add_write_dep(state, &state->last_r[4], n); if (v3d_qpu_writes_r5(devinfo, inst)) add_write_dep(state, &state->last_r[5], n); + if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) + add_write_dep(state, &state->last_rf[0], n); /* If we add any more dependencies here we should consider whether we * also need to update qpu_inst_after_thrsw_valid_in_delay_slot. @@ -492,9 +544,16 @@ struct choose_scoreboard { int last_thrsw_tick; int last_branch_tick; int last_setmsf_tick; - bool tlb_locked; + bool first_thrsw_emitted; + bool last_thrsw_emitted; bool fixup_ldvary; int ldvary_count; + int pending_ldtmu_count; + bool first_ldtmu_after_thrsw; + + /* V3D 7.x */ + int last_implicit_rf0_write_tick; + bool has_rf0_flops_conflict; }; static bool @@ -519,7 +578,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard, } static bool -reads_too_soon_after_write(struct choose_scoreboard *scoreboard, +reads_too_soon(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, uint8_t raddr) +{ + switch (raddr) { + case 0: /* ldvary delayed write of C coefficient to rf0 */ + if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) + return true; + break; + default: + break; + } + + return false; +} + +static bool +reads_too_soon_after_write(const struct v3d_device_info *devinfo, + struct choose_scoreboard *scoreboard, struct qinst *qinst) { const struct v3d_qpu_instr *inst = &qinst->qpu; @@ -531,24 +607,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); if (inst->alu.add.op != V3D_QPU_A_NOP) { - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && - mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { - return true; + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { + if (devinfo->ver < 71) { 
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr)) + return true; + } } - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && - mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { - return true; + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr)) + return true; + } } } if (inst->alu.mul.op != V3D_QPU_M_NOP) { - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && - mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { - return true; + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr)) + return true; + } } - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && - mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { - return true; + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr)) + return true; + } } } @@ -572,45 +668,83 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo, v3d_qpu_writes_r4(devinfo, inst)) return true; + if (devinfo->ver == 42) + return false; + + /* Don't schedule anything that writes rf0 right after ldvary, since + * that would clash with the ldvary's delayed rf0 write (the exception + * is another ldvary, since its implicit rf0 write would also have + * one cycle of delay and would not clash). 
+ */ + if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick && + (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || + (v3d_qpu_writes_rf0_implicitly(devinfo, inst) && + !inst->sig.ldvary))) { + return true; + } + return false; } static bool -pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, +scoreboard_is_locked(struct choose_scoreboard *scoreboard, + bool lock_scoreboard_on_first_thrsw) +{ + if (lock_scoreboard_on_first_thrsw) { + return scoreboard->first_thrsw_emitted && + scoreboard->tick - scoreboard->last_thrsw_tick >= 3; + } + + return scoreboard->last_thrsw_emitted && + scoreboard->tick - scoreboard->last_thrsw_tick >= 3; +} + +static bool +pixel_scoreboard_too_soon(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst) { - return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst)); + return qpu_inst_is_tlb(inst) && + !scoreboard_is_locked(scoreboard, + c->lock_scoreboard_on_first_thrsw); } static bool -qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, +qpu_instruction_uses_rf(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst, uint32_t waddr) { if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return false; - if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && - inst->raddr_a == waddr) - return true; + if (devinfo->ver < 71) { + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && + inst->raddr_a == waddr) + return true; - if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && - !inst->sig.small_imm && (inst->raddr_b == waddr)) - return true; + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && + !inst->sig.small_imm_b && (inst->raddr_b == waddr)) + return true; + } else { + if (v3d71_qpu_reads_raddr(inst, waddr)) + return true; + } return false; } static bool -mux_read_stalls(struct choose_scoreboard *scoreboard, - const struct v3d_qpu_instr *inst) +read_stalls(const struct v3d_device_info *devinfo, + struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) { return scoreboard->tick == 
scoreboard->last_stallable_sfu_tick + 1 && - qpu_instruction_uses_rf(inst, + qpu_instruction_uses_rf(devinfo, inst, scoreboard->last_stallable_sfu_reg); } /* We define a max schedule priority to allow negative priorities as result of - * substracting this max when an instruction stalls. So instructions that + * subtracting this max when an instruction stalls. So instructions that * stall have lower priority than regular instructions. */ #define MAX_SCHEDULE_PRIORITY 16 @@ -628,19 +762,32 @@ get_instruction_priority(const struct v3d_device_info *devinfo, return next_score; next_score++; + /* Empirical testing shows that using priorities to hide latency of + * TMU operations when scheduling QPU leads to slightly worse + * performance, even at 2 threads. We think this is because the thread + * switching is already quite effective at hiding latency and NIR + * scheduling (and possibly TMU pipelining too) are sufficient to hide + * TMU latency, so piling up on that here doesn't provide any benefits + * and instead may cause us to postpone critical paths that depend on + * the TMU results. + */ +#if 0 /* Schedule texture read results collection late to hide latency. */ if (v3d_qpu_waits_on_tmu(inst)) return next_score; next_score++; +#endif /* Default score for things that aren't otherwise special. */ baseline_score = next_score; next_score++; +#if 0 /* Schedule texture read setup early to hide their latency better. 
*/ if (v3d_qpu_writes_tmu(devinfo, inst)) return next_score; next_score++; +#endif /* We should increase the maximum if we assert here */ assert(next_score < MAX_SCHEDULE_PRIORITY); @@ -648,48 +795,59 @@ get_instruction_priority(const struct v3d_device_info *devinfo, return baseline_score; } -static bool -qpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo, - enum v3d_qpu_waddr waddr) -{ - return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) || - v3d_qpu_magic_waddr_is_sfu(waddr) || - v3d_qpu_magic_waddr_is_tlb(waddr) || - v3d_qpu_magic_waddr_is_vpm(waddr) || - v3d_qpu_magic_waddr_is_tsy(waddr)); -} +enum { + V3D_PERIPHERAL_VPM_READ = (1 << 0), + V3D_PERIPHERAL_VPM_WRITE = (1 << 1), + V3D_PERIPHERAL_VPM_WAIT = (1 << 2), + V3D_PERIPHERAL_SFU = (1 << 3), + V3D_PERIPHERAL_TMU_WRITE = (1 << 4), + V3D_PERIPHERAL_TMU_READ = (1 << 5), + V3D_PERIPHERAL_TMU_WAIT = (1 << 6), + V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7), + V3D_PERIPHERAL_TSY = (1 << 8), + V3D_PERIPHERAL_TLB_READ = (1 << 9), + V3D_PERIPHERAL_TLB_WRITE = (1 << 10), +}; -static bool -qpu_accesses_peripheral(const struct v3d_device_info *devinfo, - const struct v3d_qpu_instr *inst) +static uint32_t +qpu_peripherals(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) { - if (v3d_qpu_uses_vpm(inst)) - return true; + uint32_t result = 0; + if (v3d_qpu_reads_vpm(inst)) + result |= V3D_PERIPHERAL_VPM_READ; + if (v3d_qpu_writes_vpm(inst)) + result |= V3D_PERIPHERAL_VPM_WRITE; + if (v3d_qpu_waits_vpm(inst)) + result |= V3D_PERIPHERAL_VPM_WAIT; + + if (v3d_qpu_writes_tmu(devinfo, inst)) + result |= V3D_PERIPHERAL_TMU_WRITE; + if (inst->sig.ldtmu) + result |= V3D_PERIPHERAL_TMU_READ; + if (inst->sig.wrtmuc) + result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG; + if (v3d_qpu_uses_sfu(inst)) - return true; + result |= V3D_PERIPHERAL_SFU; + + if (v3d_qpu_reads_tlb(inst)) + result |= V3D_PERIPHERAL_TLB_READ; + if (v3d_qpu_writes_tlb(inst)) + result |= V3D_PERIPHERAL_TLB_WRITE; if (inst->type == 
V3D_QPU_INSTR_TYPE_ALU) { if (inst->alu.add.op != V3D_QPU_A_NOP && inst->alu.add.magic_write && - qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) { - return true; + v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) { + result |= V3D_PERIPHERAL_TSY; } if (inst->alu.add.op == V3D_QPU_A_TMUWT) - return true; - - if (inst->alu.mul.op != V3D_QPU_M_NOP && - inst->alu.mul.magic_write && - qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) { - return true; - } + result |= V3D_PERIPHERAL_TMU_WAIT; } - return (inst->sig.ldvpm || - inst->sig.ldtmu || - inst->sig.ldtlb || - inst->sig.ldtlbu || - inst->sig.wrtmuc); + return result; } static bool @@ -697,30 +855,82 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *a, const struct v3d_qpu_instr *b) { - const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a); - const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b); + const uint32_t a_peripherals = qpu_peripherals(devinfo, a); + const uint32_t b_peripherals = qpu_peripherals(devinfo, b); /* We can always do one peripheral access per instruction. */ - if (!a_uses_peripheral || !b_uses_peripheral) + if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1) return true; - if (devinfo->ver < 41) + /* V3D 4.x can't do more than one peripheral access except in a + * few cases: + */ + if (devinfo->ver == 42) { + /* WRTMUC signal with TMU register write (other than tmuc). */ + if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { + return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); + } + if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + a_peripherals == V3D_PERIPHERAL_TMU_WRITE) { + return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); + } + + /* TMU read with VPM read/write. 
*/ + if (a_peripherals == V3D_PERIPHERAL_TMU_READ && + (b_peripherals == V3D_PERIPHERAL_VPM_READ || + b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { + return true; + } + if (b_peripherals == V3D_PERIPHERAL_TMU_READ && + (a_peripherals == V3D_PERIPHERAL_VPM_READ || + a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { + return true; + } + return false; + } - /* V3D 4.1 and later allow TMU read along with a VPM read or write, and - * WRTMUC with a TMU magic register write (other than tmuc). - */ - if ((a->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(b)) || - (b->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(a))) { - return true; + /* V3D 7.x can't have more than one of these restricted peripherals */ + const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE | + V3D_PERIPHERAL_TMU_WRTMUC_SIG | + V3D_PERIPHERAL_TSY | + V3D_PERIPHERAL_TLB_READ | + V3D_PERIPHERAL_SFU | + V3D_PERIPHERAL_VPM_READ | + V3D_PERIPHERAL_VPM_WRITE; + + const uint32_t a_restricted = a_peripherals & restricted; + const uint32_t b_restricted = b_peripherals & restricted; + if (a_restricted && b_restricted) { + /* WRTMUC signal with TMU register write (other than tmuc) is + * allowed though. 
+ */ + if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + b_restricted == V3D_PERIPHERAL_TMU_WRITE && + v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || + (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + a_restricted == V3D_PERIPHERAL_TMU_WRITE && + v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) { + return false; + } } - if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || - (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) { - return true; + /* Only one TMU read per instruction */ + if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) && + (b_peripherals & V3D_PERIPHERAL_TMU_READ)) { + return false; } - return false; + /* Only one TLB access per instruction */ + if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE | + V3D_PERIPHERAL_TLB_READ)) && + (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE | + V3D_PERIPHERAL_TLB_READ))) { + return false; + } + + return true; } /* Compute a bitmask of which rf registers are used between @@ -736,42 +946,67 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a, uint64_t raddrs_used = 0; if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A)) raddrs_used |= (1ll << a->raddr_a); - if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) + if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) raddrs_used |= (1ll << a->raddr_b); if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) raddrs_used |= (1ll << b->raddr_a); - if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) + if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) raddrs_used |= (1ll << b->raddr_b); return raddrs_used; } -/* Take two instructions and attempt to merge their raddr fields - * into one merged instruction. Returns false if the two instructions - * access more than two different rf registers between them, or more - * than one rf register and one small immediate. +/* Takes two instructions and attempts to merge their raddr fields (including + * small immediates) into one merged instruction. 
For V3D 4.x, returns false + * if the two instructions access more than two different rf registers between + * them, or more than one rf register and one small immediate. For 7.x returns + * false if both instructions use small immediates. */ static bool qpu_merge_raddrs(struct v3d_qpu_instr *result, const struct v3d_qpu_instr *add_instr, - const struct v3d_qpu_instr *mul_instr) + const struct v3d_qpu_instr *mul_instr, + const struct v3d_device_info *devinfo) { + if (devinfo->ver >= 71) { + assert(add_instr->sig.small_imm_a + + add_instr->sig.small_imm_b <= 1); + assert(add_instr->sig.small_imm_c + + add_instr->sig.small_imm_d == 0); + assert(mul_instr->sig.small_imm_a + + mul_instr->sig.small_imm_b == 0); + assert(mul_instr->sig.small_imm_c + + mul_instr->sig.small_imm_d <= 1); + + result->sig.small_imm_a = add_instr->sig.small_imm_a; + result->sig.small_imm_b = add_instr->sig.small_imm_b; + result->sig.small_imm_c = mul_instr->sig.small_imm_c; + result->sig.small_imm_d = mul_instr->sig.small_imm_d; + + return (result->sig.small_imm_a + + result->sig.small_imm_b + + result->sig.small_imm_c + + result->sig.small_imm_d) <= 1; + } + + assert(devinfo->ver == 42); + uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr); int naddrs = util_bitcount64(raddrs_used); if (naddrs > 2) return false; - if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) { + if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) { if (naddrs > 1) return false; - if (add_instr->sig.small_imm && mul_instr->sig.small_imm) + if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b) if (add_instr->raddr_b != mul_instr->raddr_b) return false; - result->sig.small_imm = true; - result->raddr_b = add_instr->sig.small_imm ? + result->sig.small_imm_b = true; + result->raddr_b = add_instr->sig.small_imm_b ? 
add_instr->raddr_b : mul_instr->raddr_b; } @@ -782,23 +1017,23 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, raddrs_used &= ~(1ll << raddr_a); result->raddr_a = raddr_a; - if (!result->sig.small_imm) { + if (!result->sig.small_imm_b) { if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) && raddr_a == add_instr->raddr_b) { - if (add_instr->alu.add.a == V3D_QPU_MUX_B) - result->alu.add.a = V3D_QPU_MUX_A; - if (add_instr->alu.add.b == V3D_QPU_MUX_B && + if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B) + result->alu.add.a.mux = V3D_QPU_MUX_A; + if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B && v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { - result->alu.add.b = V3D_QPU_MUX_A; + result->alu.add.b.mux = V3D_QPU_MUX_A; } } if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) && raddr_a == mul_instr->raddr_b) { - if (mul_instr->alu.mul.a == V3D_QPU_MUX_B) - result->alu.mul.a = V3D_QPU_MUX_A; - if (mul_instr->alu.mul.b == V3D_QPU_MUX_B && + if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B) + result->alu.mul.a.mux = V3D_QPU_MUX_A; + if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B && v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { - result->alu.mul.b = V3D_QPU_MUX_A; + result->alu.mul.b.mux = V3D_QPU_MUX_A; } } } @@ -809,20 +1044,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, result->raddr_b = raddr_b; if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) && raddr_b == add_instr->raddr_a) { - if (add_instr->alu.add.a == V3D_QPU_MUX_A) - result->alu.add.a = V3D_QPU_MUX_B; - if (add_instr->alu.add.b == V3D_QPU_MUX_A && + if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A) + result->alu.add.a.mux = V3D_QPU_MUX_B; + if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A && v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { - result->alu.add.b = V3D_QPU_MUX_B; + result->alu.add.b.mux = V3D_QPU_MUX_B; } } if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) && raddr_b == mul_instr->raddr_a) { - if (mul_instr->alu.mul.a == V3D_QPU_MUX_A) - result->alu.mul.a = V3D_QPU_MUX_B; - if (mul_instr->alu.mul.b == 
V3D_QPU_MUX_A && + if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A) + result->alu.mul.a.mux = V3D_QPU_MUX_B; + if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A && v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { - result->alu.mul.b = V3D_QPU_MUX_B; + result->alu.mul.b.mux = V3D_QPU_MUX_B; } } @@ -855,7 +1090,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op) } static void -qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) +qpu_convert_add_to_mul(const struct v3d_device_info *devinfo, + struct v3d_qpu_instr *inst) { STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add)); assert(inst->alu.add.op != V3D_QPU_A_NOP); @@ -871,6 +1107,87 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) inst->flags.ac = V3D_QPU_COND_NONE; inst->flags.apf = V3D_QPU_PF_NONE; inst->flags.auf = V3D_QPU_UF_NONE; + + inst->alu.mul.output_pack = inst->alu.add.output_pack; + + inst->alu.mul.a.unpack = inst->alu.add.a.unpack; + inst->alu.mul.b.unpack = inst->alu.add.b.unpack; + inst->alu.add.output_pack = V3D_QPU_PACK_NONE; + inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; + inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + + if (devinfo->ver >= 71) { + assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d); + assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1); + if (inst->sig.small_imm_a) { + inst->sig.small_imm_c = true; + inst->sig.small_imm_a = false; + } else if (inst->sig.small_imm_b) { + inst->sig.small_imm_d = true; + inst->sig.small_imm_b = false; + } + } +} + +static bool +can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op) +{ + switch (op) { + case V3D_QPU_M_MOV: + case V3D_QPU_M_FMOV: + return devinfo->ver >= 71; + default: + return false; + } +} + +static enum v3d_qpu_mul_op +mul_op_as_add_op(enum v3d_qpu_mul_op op) +{ + switch (op) { + case V3D_QPU_M_MOV: + return V3D_QPU_A_MOV; + case V3D_QPU_M_FMOV: + return V3D_QPU_A_FMOV; + default: + unreachable("unexpected mov opcode"); + } +} + +static void +qpu_convert_mul_to_add(struct v3d_qpu_instr 
*inst) +{ + STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul)); + assert(inst->alu.mul.op != V3D_QPU_M_NOP); + assert(inst->alu.add.op == V3D_QPU_A_NOP); + + memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add)); + inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op); + inst->alu.mul.op = V3D_QPU_M_NOP; + + inst->flags.ac = inst->flags.mc; + inst->flags.apf = inst->flags.mpf; + inst->flags.auf = inst->flags.muf; + inst->flags.mc = V3D_QPU_COND_NONE; + inst->flags.mpf = V3D_QPU_PF_NONE; + inst->flags.muf = V3D_QPU_UF_NONE; + + inst->alu.add.output_pack = inst->alu.mul.output_pack; + inst->alu.add.a.unpack = inst->alu.mul.a.unpack; + inst->alu.add.b.unpack = inst->alu.mul.b.unpack; + inst->alu.mul.output_pack = V3D_QPU_PACK_NONE; + inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; + inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + + assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b); + assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1); + if (inst->sig.small_imm_c) { + inst->sig.small_imm_a = true; + inst->sig.small_imm_c = false; + } else if (inst->sig.small_imm_d) { + inst->sig.small_imm_b = true; + inst->sig.small_imm_d = false; + } } static bool @@ -909,20 +1226,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, else if (a->alu.mul.op == V3D_QPU_M_NOP && can_do_add_as_mul(b->alu.add.op)) { mul_inst = *b; - qpu_convert_add_to_mul(&mul_inst); + qpu_convert_add_to_mul(devinfo, &mul_inst); merge.alu.mul = mul_inst.alu.mul; - merge.flags.mc = b->flags.ac; - merge.flags.mpf = b->flags.apf; - merge.flags.muf = b->flags.auf; + merge.flags.mc = mul_inst.flags.mc; + merge.flags.mpf = mul_inst.flags.mpf; + merge.flags.muf = mul_inst.flags.muf; add_instr = a; mul_instr = &mul_inst; } else if (a->alu.mul.op == V3D_QPU_M_NOP && can_do_add_as_mul(a->alu.add.op)) { mul_inst = *a; - qpu_convert_add_to_mul(&mul_inst); + qpu_convert_add_to_mul(devinfo, &mul_inst); merge = mul_inst; merge.alu.add = b->alu.add; @@ -938,22 +1255,62 @@ 
qpu_merge_inst(const struct v3d_device_info *devinfo, } } + struct v3d_qpu_instr add_inst; if (b->alu.mul.op != V3D_QPU_M_NOP) { - if (a->alu.mul.op != V3D_QPU_M_NOP) - return false; - merge.alu.mul = b->alu.mul; + if (a->alu.mul.op == V3D_QPU_M_NOP) { + merge.alu.mul = b->alu.mul; - merge.flags.mc = b->flags.mc; - merge.flags.mpf = b->flags.mpf; - merge.flags.muf = b->flags.muf; + merge.flags.mc = b->flags.mc; + merge.flags.mpf = b->flags.mpf; + merge.flags.muf = b->flags.muf; - mul_instr = b; - add_instr = a; + mul_instr = b; + add_instr = a; + } + /* If a's mul op is used but its add op is not, then see if we + * can convert either a's mul op or b's mul op to an add op + * so we can merge. + */ + else if (a->alu.add.op == V3D_QPU_A_NOP && + can_do_mul_as_add(devinfo, b->alu.mul.op)) { + add_inst = *b; + qpu_convert_mul_to_add(&add_inst); + + merge.alu.add = add_inst.alu.add; + + merge.flags.ac = add_inst.flags.ac; + merge.flags.apf = add_inst.flags.apf; + merge.flags.auf = add_inst.flags.auf; + + mul_instr = a; + add_instr = &add_inst; + } else if (a->alu.add.op == V3D_QPU_A_NOP && + can_do_mul_as_add(devinfo, a->alu.mul.op)) { + add_inst = *a; + qpu_convert_mul_to_add(&add_inst); + + merge = add_inst; + merge.alu.mul = b->alu.mul; + + merge.flags.mc = b->flags.mc; + merge.flags.mpf = b->flags.mpf; + merge.flags.muf = b->flags.muf; + + mul_instr = b; + add_instr = &add_inst; + } else { + return false; + } } + /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and + * they have restrictions on the number of raddrs that can be addressed + * in a single instruction. In V3D 7.x, we don't have that restriction, + * but we are still limited to a single small immediate per instruction.
+ */ if (add_instr && mul_instr && - !qpu_merge_raddrs(&merge, add_instr, mul_instr)) { - return false; + !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) { + return false; } merge.sig.thrsw |= b->sig.thrsw; @@ -964,7 +1321,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, merge.sig.ldtmu |= b->sig.ldtmu; merge.sig.ldvary |= b->sig.ldvary; merge.sig.ldvpm |= b->sig.ldvpm; - merge.sig.small_imm |= b->sig.small_imm; merge.sig.ldtlb |= b->sig.ldtlb; merge.sig.ldtlbu |= b->sig.ldtlbu; merge.sig.ucb |= b->sig.ucb; @@ -1047,24 +1403,25 @@ retry: * regfile A or B that was written to by the previous * instruction." */ - if (reads_too_soon_after_write(scoreboard, n->inst)) + if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst)) continue; if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst)) continue; - /* "A scoreboard wait must not occur in the first two - * instructions of a fragment shader. This is either the - * explicit Wait for Scoreboard signal or an implicit wait - * with the first tile-buffer read or write instruction." + /* "Before doing a TLB access a scoreboard wait must have been + * done. This happens either on the first or last thread + * switch, depending on a setting (scb_wait_on_first_thrsw) in + * the shader state." */ - if (pixel_scoreboard_too_soon(scoreboard, inst)) + if (pixel_scoreboard_too_soon(c, scoreboard, inst)) continue; - /* ldunif and ldvary both write r5, but ldunif does so a tick - * sooner. If the ldvary's r5 wasn't used, then ldunif might + /* ldunif and ldvary both write the same register (r5 for v42 + * and below, rf0 for v71), but ldunif does so a tick sooner. + * If the ldvary's register wasn't used, then ldunif might * otherwise get scheduled so ldunif and ldvary try to update - * r5 in the same tick. + * the register in the same tick. 
*/ if ((inst->sig.ldunif || inst->sig.ldunifa) && scoreboard->tick == scoreboard->last_ldvary_tick + 1) { @@ -1131,24 +1488,54 @@ retry: continue; } - /* Don't merge in something that will lock the TLB. - * Hopwefully what we have in inst will release some - * other instructions, allowing us to delay the - * TLB-locking instruction until later. + /* Don't merge TLB instructions before we have acquired + * the scoreboard lock. */ - if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst)) + if (pixel_scoreboard_too_soon(c, scoreboard, inst)) continue; - /* When we succesfully pair up an ldvary we then try + /* When we successfully pair up an ldvary we then try * to merge it into the previous instruction if * possible to improve pipelining. Don't pick up the * ldvary now if the follow-up fixup would place * it in the delay slots of a thrsw, which is not * allowed and would prevent the fixup from being - * successul. + * successful. In V3D 7.x we can allow this to happen + * as long as it is not the last delay slot. */ - if (inst->sig.ldvary && - scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) { + if (inst->sig.ldvary) { + if (c->devinfo->ver == 42 && + scoreboard->last_thrsw_tick + 2 >= + scoreboard->tick - 1) { + continue; + } + if (c->devinfo->ver >= 71 && + scoreboard->last_thrsw_tick + 2 == + scoreboard->tick - 1) { + continue; + } + } + + /* We can emit a new tmu lookup with a previous ldtmu + * if doing this would free just enough space in the + * TMU output fifo so we don't overflow, however, this + * is only safe if the ldtmu cannot stall. + * + * A ldtmu can stall if it is not the first following a + * thread switch and corresponds to the first word of a + * read request. 
+ * + * FIXME: For now we forbid pairing up a new lookup + * with a previous ldtmu that is not the first after a + * thrsw if that could overflow the TMU output fifo + * regardless of whether the ldtmu is reading the first + * word of a TMU result or not, since we don't track + * this aspect in the compiler yet. + */ + if (prev_inst->inst->qpu.sig.ldtmu && + !scoreboard->first_ldtmu_after_thrsw && + (scoreboard->pending_ldtmu_count + + n->inst->ldtmu_count > 16 / c->threads)) { continue; } @@ -1161,7 +1548,7 @@ retry: int prio = get_instruction_priority(c->devinfo, inst); - if (mux_read_stalls(scoreboard, inst)) { + if (read_stalls(c->devinfo, scoreboard, inst)) { /* Don't merge an instruction that stalls */ if (prev_inst) continue; @@ -1225,7 +1612,7 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard, { if (v3d_qpu_magic_waddr_is_sfu(waddr)) scoreboard->last_magic_sfu_write_tick = scoreboard->tick; - else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA) + else if (waddr == V3D_QPU_WADDR_UNIFA) scoreboard->last_unifa_write_tick = scoreboard->tick; } @@ -1240,10 +1627,87 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard, } static void +update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard, + const struct qinst *inst) +{ + /* Track if the have seen any ldtmu after the last thread switch */ + if (scoreboard->tick == scoreboard->last_thrsw_tick + 2) + scoreboard->first_ldtmu_after_thrsw = true; + + /* Track the number of pending ldtmu instructions for outstanding + * TMU lookups. 
+ */ + scoreboard->pending_ldtmu_count += inst->ldtmu_count; + if (inst->qpu.sig.ldtmu) { + assert(scoreboard->pending_ldtmu_count > 0); + scoreboard->pending_ldtmu_count--; + scoreboard->first_ldtmu_after_thrsw = false; + } +} + +static void +set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, + const struct v3d_device_info *devinfo) +{ + if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick && + v3d_qpu_sig_writes_address(devinfo, &inst->sig) && + !inst->sig_magic) { + scoreboard->has_rf0_flops_conflict = true; + } +} + +static void +update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, + const struct v3d_device_info *devinfo) +{ + if (devinfo->ver < 71) + return; + + /* Thread switch restrictions: + * + * At the point of a thread switch or thread end (when the actual + * thread switch or thread end happens, not when the signalling + * instruction is processed): + * + * - If the most recent write to rf0 was from a ldunif, ldunifa, or + * ldvary instruction in which another signal also wrote to the + * register file, and the final instruction of the thread section + * contained a signal which wrote to the register file, then the + * value of rf0 is undefined at the start of the new section + * + * Here we use the scoreboard to track if our last rf0 implicit write + * happens at the same time that another signal writes the register + * file (has_rf0_flops_conflict). We will use that information when + * scheduling thrsw instructions to avoid putting anything in their + * last delay slot which has a signal that writes to the register file. + */ + + /* Reset tracking if we have an explicit rf0 write or we are starting + * a new thread section. 
+ */ + if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || + scoreboard->tick - scoreboard->last_thrsw_tick == 3) { + scoreboard->last_implicit_rf0_write_tick = -10; + scoreboard->has_rf0_flops_conflict = false; + } + + if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) { + scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ? + scoreboard->tick + 1 : scoreboard->tick; + } + + set_has_rf0_flops_conflict(scoreboard, inst, devinfo); +} + +static void update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, - const struct v3d_qpu_instr *inst, + const struct qinst *qinst, const struct v3d_device_info *devinfo) { + const struct v3d_qpu_instr *inst = &qinst->qpu; + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) return; @@ -1271,11 +1735,18 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, } } + if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && inst->sig_magic) { + update_scoreboard_for_magic_waddr(scoreboard, + inst->sig_addr, + devinfo); + } + if (inst->sig.ldvary) scoreboard->last_ldvary_tick = scoreboard->tick; - if (qpu_inst_is_tlb(inst)) - scoreboard->tlb_locked = true; + update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo); + + update_scoreboard_tmu_tracking(scoreboard, qinst); } static void @@ -1352,23 +1823,25 @@ instruction_latency(const struct v3d_device_info *devinfo, after_inst->type != V3D_QPU_INSTR_TYPE_ALU) return latency; - if (before_inst->alu.add.magic_write) { + if (v3d_qpu_instr_is_sfu(before_inst)) + return 2; + + if (before_inst->alu.add.op != V3D_QPU_A_NOP && + before_inst->alu.add.magic_write) { latency = MAX2(latency, magic_waddr_latency(devinfo, before_inst->alu.add.waddr, after_inst)); } - if (before_inst->alu.mul.magic_write) { + if (before_inst->alu.mul.op != V3D_QPU_M_NOP && + before_inst->alu.mul.magic_write) { latency = MAX2(latency, magic_waddr_latency(devinfo, before_inst->alu.mul.waddr, after_inst)); } - if (v3d_qpu_instr_is_sfu(before_inst)) - return 2; - return latency; } @@ -1437,7 
+1910,7 @@ insert_scheduled_instruction(struct v3d_compile *c, { list_addtail(&inst->link, &block->instructions); - update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo); + update_scoreboard_for_chosen(scoreboard, inst, c->devinfo); c->qpu_inst_count++; scoreboard->tick++; } @@ -1464,16 +1937,13 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, { const struct v3d_qpu_instr *inst = &qinst->qpu; - /* Only TLB Z writes are prohibited in the last slot, but we don't - * have those flagged so prohibit all TLB ops for now. - */ - if (slot == 2 && qpu_inst_is_tlb(inst)) + if (slot == 2 && qinst->is_tlb_z_write) return false; if (slot > 0 && qinst->uniform != ~0) return false; - if (v3d_qpu_uses_vpm(inst)) + if (c->devinfo->ver == 42 && v3d_qpu_waits_vpm(inst)) return false; if (inst->sig.ldvary) @@ -1481,36 +1951,64 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { /* GFXH-1625: TMUWT not allowed in the final instruction. */ - if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) + if (c->devinfo->ver == 42 && slot == 2 && + inst->alu.add.op == V3D_QPU_A_TMUWT) { return false; + } - /* No writing physical registers at the end. */ - if (!inst->alu.add.magic_write || - !inst->alu.mul.magic_write) { - return false; + if (c->devinfo->ver == 42) { + /* No writing physical registers at the end. */ + bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP; + bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP; + if ((!add_is_nop && !inst->alu.add.magic_write) || + (!mul_is_nop && !inst->alu.mul.magic_write)) { + return false; + } + + if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) && + !inst->sig_magic) { + return false; + } } - if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF) - return false; + if (c->devinfo->ver >= 71) { + /* The thread end instruction must not write to the + * register file via the add/mul ALUs. 
+ */ + if (slot == 0 && + (!inst->alu.add.magic_write || + !inst->alu.mul.magic_write)) { + return false; + } + } - /* RF0-2 might be overwritten during the delay slots by - * fragment shader setup. - */ - if (inst->raddr_a < 3 && - (inst->alu.add.a == V3D_QPU_MUX_A || - inst->alu.add.b == V3D_QPU_MUX_A || - inst->alu.mul.a == V3D_QPU_MUX_A || - inst->alu.mul.b == V3D_QPU_MUX_A)) { - return false; + if (c->devinfo->ver == 42) { + /* RF0-2 might be overwritten during the delay slots by + * fragment shader setup. + */ + if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A)) + return false; + + if (inst->raddr_b < 3 && + !inst->sig.small_imm_b && + v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) { + return false; + } } - if (inst->raddr_b < 3 && - !inst->sig.small_imm && - (inst->alu.add.a == V3D_QPU_MUX_B || - inst->alu.add.b == V3D_QPU_MUX_B || - inst->alu.mul.a == V3D_QPU_MUX_B || - inst->alu.mul.b == V3D_QPU_MUX_B)) { - return false; + if (c->devinfo->ver >= 71) { + /* RF2-3 might be overwritten during the delay slots by + * fragment shader setup. + */ + if (v3d71_qpu_reads_raddr(inst, 2) || + v3d71_qpu_reads_raddr(inst, 3)) { + return false; + } + + if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) || + v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) { + return false; + } } } @@ -1526,6 +2024,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, */ static bool qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, const struct qinst *qinst, uint32_t slot) { @@ -1533,15 +2032,19 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, * thread. The simulator complains for safety, though it * would only occur for dead code in our case. 
*/ - if (slot > 0 && - qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && - (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) || - v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) { - return false; + if (slot > 0) { + if (c->devinfo->ver == 42 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu)) + return false; + if (c->devinfo->ver >= 71 && v3d_qpu_instr_is_sfu(&qinst->qpu)) + return false; } - if (slot > 0 && qinst->qpu.sig.ldvary) - return false; + if (qinst->qpu.sig.ldvary) { + if (c->devinfo->ver == 42 && slot > 0) + return false; + if (c->devinfo->ver >= 71 && slot == 2) + return false; + } /* unifa and the following 3 instructions can't overlap a * thread switch/end. The docs further clarify that this means @@ -1560,6 +2063,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu)) return false; + /* See comment when we set has_rf0_flops_conflict for details */ + if (c->devinfo->ver >= 71 && + slot == 2 && + v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) && + !qinst->qpu.sig_magic) { + if (scoreboard->has_rf0_flops_conflict) + return false; + if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick) + return false; + } + return true; } @@ -1579,7 +2093,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, assert(slot <= 2); /* We merge thrsw instructions back into the instruction stream - * manually, so any instructions scheduled after a thrsw shold be + * manually, so any instructions scheduled after a thrsw should be * in the actual delay slots and not in the same slot as the thrsw. */ assert(slot >= 1); @@ -1592,7 +2106,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, * also apply to instructions scheduled after the thrsw that we want * to place in its delay slots. 
*/ - if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) + if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot)) return false; /* TLB access is disallowed until scoreboard wait is executed, which @@ -1648,6 +2162,14 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, if (v3d_qpu_writes_flags(&qinst->qpu)) return false; + /* TSY sync ops materialize at the point of the next thread switch, + * therefore, if we have a TSY sync right after a thread switch, we + * cannot place it in its delay slots, or we would be moving the sync + * to the thrsw before it instead. + */ + if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID) + return false; + return true; } @@ -1656,15 +2178,11 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard struct qinst *qinst, int instructions_in_sequence, bool is_thrend) { - /* No emitting our thrsw while the previous thrsw hasn't happened yet. */ - if (scoreboard->last_thrsw_tick + 3 > - scoreboard->tick - instructions_in_sequence) { - return false; - } - for (int slot = 0; slot < instructions_in_sequence; slot++) { - if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) + if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, + qinst, slot)) { return false; + } if (is_thrend && !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) { @@ -1714,26 +2232,77 @@ emit_thrsw(struct v3d_compile *c, /* Find how far back into previous instructions we can put the THRSW. */ int slots_filled = 0; + int invalid_sig_count = 0; + int invalid_seq_count = 0; + bool last_thrsw_after_invalid_ok = false; struct qinst *merge_inst = NULL; vir_for_each_inst_rev(prev_inst, block) { - struct v3d_qpu_sig sig = prev_inst->qpu.sig; - sig.thrsw = true; - uint32_t packed_sig; - - if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) + /* No emitting our thrsw while the previous thrsw hasn't + * happened yet. 
+ */ + if (scoreboard->last_thrsw_tick + 3 > + scoreboard->tick - (slots_filled + 1)) { break; + } + if (!valid_thrsw_sequence(c, scoreboard, prev_inst, slots_filled + 1, is_thrend)) { - break; + /* Even if the current sequence isn't valid, we may + * be able to get a valid sequence by trying to move the + * thrsw earlier, so keep going. + */ + invalid_seq_count++; + goto cont_block; + } + + struct v3d_qpu_sig sig = prev_inst->qpu.sig; + sig.thrsw = true; + uint32_t packed_sig; + if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) { + /* If we can't merge the thrsw here because of signal + * incompatibility, keep going, we might be able to + * merge it in an earlier instruction. + */ + invalid_sig_count++; + goto cont_block; } + /* For last thrsw we need 2 consecutive slots that are + * thrsw compatible, so if we have previously jumped over + * an incompatible signal, flag that we have found the first + * valid slot here and keep going. + */ + if (inst->is_last_thrsw && invalid_sig_count > 0 && + !last_thrsw_after_invalid_ok) { + last_thrsw_after_invalid_ok = true; + invalid_sig_count++; + goto cont_block; + } + + /* We can merge the thrsw in this instruction */ + last_thrsw_after_invalid_ok = false; + invalid_sig_count = 0; + invalid_seq_count = 0; merge_inst = prev_inst; + +cont_block: if (++slots_filled == 3) break; } + /* If we jumped over a signal incompatibility and did not manage to + * merge the thrsw in the end, we need to adjust slots filled to match + * the last valid merge point. 
+ */ + assert((invalid_sig_count == 0 && invalid_seq_count == 0) || + slots_filled >= invalid_sig_count + invalid_seq_count); + if (invalid_sig_count > 0) + slots_filled -= invalid_sig_count; + if (invalid_seq_count > 0) + slots_filled -= invalid_seq_count; + bool needs_free = false; if (merge_inst) { merge_inst->qpu.sig.thrsw = true; @@ -1747,6 +2316,8 @@ emit_thrsw(struct v3d_compile *c, merge_inst = inst; } + scoreboard->first_thrsw_emitted = true; + /* If we're emitting the last THRSW (other than program end), then * signal that to the HW by emitting two THRSWs in a row. */ @@ -1758,6 +2329,7 @@ emit_thrsw(struct v3d_compile *c, struct qinst *second_inst = (struct qinst *)merge_inst->link.next; second_inst->qpu.sig.thrsw = true; + scoreboard->last_thrsw_emitted = true; } /* Make sure the thread end executes within the program lifespan */ @@ -1811,10 +2383,11 @@ emit_branch(struct v3d_compile *c, assert(scoreboard->last_branch_tick + 3 < branch_tick); assert(scoreboard->last_unifa_write_tick + 3 < branch_tick); - /* Can't place a branch with msfign != 0 and cond != 0,2,3 after + /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after * setmsf. */ bool is_safe_msf_branch = + c->devinfo->ver >= 71 || inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE || inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS || inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 || @@ -1851,6 +2424,14 @@ emit_branch(struct v3d_compile *c, break; } + /* Do not move up a branch if it can disrupt an ldvary sequence + * as that can cause stomping of the r5 register. + */ + if (scoreboard->last_ldvary_tick + 2 >= + branch_tick - slots_filled) { + break; + } + /* Can't move a conditional branch before the instruction * that writes the flags for its condition. 
*/ @@ -1890,46 +2471,72 @@ emit_branch(struct v3d_compile *c, } static bool -alu_reads_register(struct v3d_qpu_instr *inst, +alu_reads_register(const struct v3d_device_info *devinfo, + struct v3d_qpu_instr *inst, bool add, bool magic, uint32_t index) { uint32_t num_src; - enum v3d_qpu_mux mux_a, mux_b; - - if (add) { + if (add) num_src = v3d_qpu_add_op_num_src(inst->alu.add.op); - mux_a = inst->alu.add.a; - mux_b = inst->alu.add.b; - } else { + else num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op); - mux_a = inst->alu.mul.a; - mux_b = inst->alu.mul.b; - } - for (int i = 0; i < num_src; i++) { - if (magic) { - if (i == 0 && mux_a == index) - return true; - if (i == 1 && mux_b == index) - return true; + if (devinfo->ver == 42) { + enum v3d_qpu_mux mux_a, mux_b; + if (add) { + mux_a = inst->alu.add.a.mux; + mux_b = inst->alu.add.b.mux; } else { - if (i == 0 && mux_a == V3D_QPU_MUX_A && - inst->raddr_a == index) { - return true; - } - if (i == 0 && mux_a == V3D_QPU_MUX_B && - inst->raddr_b == index) { - return true; - } - if (i == 1 && mux_b == V3D_QPU_MUX_A && - inst->raddr_a == index) { - return true; - } - if (i == 1 && mux_b == V3D_QPU_MUX_B && - inst->raddr_b == index) { - return true; + mux_a = inst->alu.mul.a.mux; + mux_b = inst->alu.mul.b.mux; + } + + for (int i = 0; i < num_src; i++) { + if (magic) { + if (i == 0 && mux_a == index) + return true; + if (i == 1 && mux_b == index) + return true; + } else { + if (i == 0 && mux_a == V3D_QPU_MUX_A && + inst->raddr_a == index) { + return true; + } + if (i == 0 && mux_a == V3D_QPU_MUX_B && + inst->raddr_b == index) { + return true; + } + if (i == 1 && mux_b == V3D_QPU_MUX_A && + inst->raddr_a == index) { + return true; + } + if (i == 1 && mux_b == V3D_QPU_MUX_B && + inst->raddr_b == index) { + return true; + } } } + + return false; + } + + assert(devinfo->ver >= 71); + assert(!magic); + + uint32_t raddr_a, raddr_b; + if (add) { + raddr_a = inst->alu.add.a.raddr; + raddr_b = inst->alu.add.b.raddr; + } else { + 
raddr_a = inst->alu.mul.a.raddr; + raddr_b = inst->alu.mul.b.raddr; + } + + for (int i = 0; i < num_src; i++) { + if (i == 0 && raddr_a == index) + return true; + if (i == 1 && raddr_b == index) + return true; } return false; @@ -1964,7 +2571,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c, struct qblock *block, struct v3d_qpu_instr *inst) { - /* We only call this if we have successfuly merged an ldvary into a + const struct v3d_device_info *devinfo = c->devinfo; + + /* We only call this if we have successfully merged an ldvary into a * previous instruction. */ assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); @@ -1976,9 +2585,20 @@ fixup_pipelined_ldvary(struct v3d_compile *c, * the ldvary destination, if it does, then moving the ldvary before * it would overwrite it. */ - if (alu_reads_register(inst, true, ldvary_magic, ldvary_index)) + if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index)) return false; - if (alu_reads_register(inst, false, ldvary_magic, ldvary_index)) + if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index)) + return false; + + /* The implicit ldvary destination may not be written to by a signal + * in the instruction following ldvary. Since we are planning to move + * ldvary to the previous instruction, this means we need to check if + * the current instruction has any other signal that could create this + * conflict. The only other signal that can write to the implicit + * ldvary destination that is compatible with ldvary in the same + * instruction is ldunif. 
+ */ + if (inst->sig.ldunif) return false; /* The previous instruction can't write to the same destination as the @@ -2003,7 +2623,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c, } /* The previous instruction cannot have a conflicting signal */ - if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig)) + if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig)) + return false; + + uint32_t sig; + struct v3d_qpu_sig new_sig = prev->qpu.sig; + new_sig.ldvary = true; + if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig)) return false; /* The previous instruction cannot use flags since ldvary uses the @@ -2016,9 +2642,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c, /* We can't put an ldvary in the delay slots of a thrsw. We should've * prevented this when pairing up the ldvary with another instruction - * and flagging it for a fixup. + * and flagging it for a fixup. In V3D 7.x this is limited only to the + * second delay slot. */ - assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1); + assert((devinfo->ver == 42 && + scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) || + (devinfo->ver >= 71 && + scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1)); /* Move the ldvary to the previous instruction and remove it from the * current one. @@ -2032,14 +2662,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c, inst->sig_magic = false; inst->sig_addr = 0; - /* By moving ldvary to the previous instruction we make it update - * r5 in the current one, so nothing else in it should write r5. + /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */ + if (devinfo->ver >= 71) { + scoreboard->last_implicit_rf0_write_tick = scoreboard->tick; + set_has_rf0_flops_conflict(scoreboard, inst, devinfo); + } + + /* By moving ldvary to the previous instruction we make it update r5 + * (rf0 for ver >= 71) in the current one, so nothing else in it + * should write this register. 
+ * * This should've been prevented by our depedency tracking, which * would not allow ldvary to be paired up with an instruction that - * writes r5 (since our dependency tracking doesn't know that the - * ldvary write r5 happens in the next instruction). + * writes r5/rf0 (since our dependency tracking doesn't know that the + * ldvary write to r5/rf0 happens in the next instruction). */ - assert(!v3d_qpu_writes_r5(c->devinfo, inst)); + assert(!v3d_qpu_writes_r5(devinfo, inst)); + assert(devinfo->ver == 42 || + (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) && + !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0))); return true; } @@ -2102,6 +2743,9 @@ schedule_instructions(struct v3d_compile *c, merge->inst->uniform; } + chosen->inst->ldtmu_count += + merge->inst->ldtmu_count; + if (debug) { fprintf(stderr, "t=%4d: merging: ", time); @@ -2127,7 +2771,7 @@ schedule_instructions(struct v3d_compile *c, } } } - if (mux_read_stalls(scoreboard, inst)) + if (read_stalls(c->devinfo, scoreboard, inst)) c->qpu_inst_stalled_count++; } @@ -2351,6 +2995,8 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c) scoreboard.last_branch_tick = -10; scoreboard.last_setmsf_tick = -10; scoreboard.last_stallable_sfu_tick = -10; + scoreboard.first_ldtmu_after_thrsw = true; + scoreboard.last_implicit_rf0_write_tick = - 10; if (debug) { fprintf(stderr, "Pre-schedule instructions\n"); |