Diffstat (limited to 'src/broadcom/compiler/qpu_schedule.c')
-rw-r--r--  src/broadcom/compiler/qpu_schedule.c  1158
1 files changed, 902 insertions, 256 deletions
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index c559814b9ea..ba76ac87e1e 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -85,6 +85,7 @@ struct schedule_state {
struct schedule_node *last_unif;
struct schedule_node *last_rtop;
struct schedule_node *last_unifa;
+ struct schedule_node *last_setmsf;
enum direction dir;
/* Estimated cycle when the current instruction would start. */
uint32_t time;
@@ -97,7 +98,7 @@ add_dep(struct schedule_state *state,
bool write)
{
bool write_after_read = !write && state->dir == R;
- void *edge_data = (void *)(uintptr_t)write_after_read;
+ uintptr_t edge_data = write_after_read;
if (!before || !after)
return;
@@ -136,12 +137,14 @@ qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return false;
- if (inst->alu.add.magic_write &&
+ if (inst->alu.add.op != V3D_QPU_A_NOP &&
+ inst->alu.add.magic_write &&
(inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
return true;
- if (inst->alu.mul.magic_write &&
+ if (inst->alu.mul.op != V3D_QPU_M_NOP &&
+ inst->alu.mul.magic_write &&
(inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
return true;
@@ -153,12 +156,13 @@ static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
enum v3d_qpu_mux mux)
{
+ assert(state->devinfo->ver < 71);
switch (mux) {
case V3D_QPU_MUX_A:
add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
break;
case V3D_QPU_MUX_B:
- if (!n->inst->qpu.sig.small_imm) {
+ if (!n->inst->qpu.sig.small_imm_b) {
add_read_dep(state,
state->last_rf[n->inst->qpu.raddr_b], n);
}
@@ -169,6 +173,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
}
}
+
+static void
+process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
+ uint8_t raddr, bool is_small_imm)
+{
+ assert(state->devinfo->ver >= 71);
+
+ if (!is_small_imm)
+ add_read_dep(state, state->last_rf[raddr], n);
+}
+
static bool
tmu_write_is_sequence_terminator(uint32_t waddr)
{
@@ -188,9 +203,6 @@ tmu_write_is_sequence_terminator(uint32_t waddr)
static bool
can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
{
- if (devinfo->ver < 40)
- return false;
-
if (tmu_write_is_sequence_terminator(waddr))
return false;
@@ -253,8 +265,7 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
break;
case V3D_QPU_WADDR_UNIFA:
- if (state->devinfo->ver >= 40)
- add_write_dep(state, &state->last_unifa, n);
+ add_write_dep(state, &state->last_unifa, n);
break;
case V3D_QPU_WADDR_NOP:
@@ -283,6 +294,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
/* If the input and output segments are shared, then all VPM reads to
* a location need to happen before all writes. We handle this by
* serializing all VPM operations for now.
+ *
+ * FIXME: we are assuming that the segments are shared. That is
+ * correct right now, since we only ever use shared segments, but
+ * technically they are configurable.
*/
bool separate_vpm_segment = false;
@@ -303,15 +318,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
/* XXX: LOAD_IMM */
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
- process_mux_deps(state, n, inst->alu.add.a);
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
- process_mux_deps(state, n, inst->alu.add.b);
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.add.a.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.add.a.raddr,
+ inst->sig.small_imm_a);
+ }
+ }
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.add.b.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.add.b.raddr,
+ inst->sig.small_imm_b);
+ }
+ }
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
- process_mux_deps(state, n, inst->alu.mul.a);
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
- process_mux_deps(state, n, inst->alu.mul.b);
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.mul.a.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.mul.a.raddr,
+ inst->sig.small_imm_c);
+ }
+ }
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.mul.b.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.mul.b.raddr,
+ inst->sig.small_imm_d);
+ }
+ }
switch (inst->alu.add.op) {
case V3D_QPU_A_VPMSETUP:
@@ -340,13 +379,24 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
case V3D_QPU_A_MSF:
add_read_dep(state, state->last_tlb, n);
+ add_read_dep(state, state->last_setmsf, n);
break;
case V3D_QPU_A_SETMSF:
+ add_write_dep(state, &state->last_setmsf, n);
+ add_write_dep(state, &state->last_tmu_write, n);
+ FALLTHROUGH;
case V3D_QPU_A_SETREVF:
add_write_dep(state, &state->last_tlb, n);
break;
+ case V3D_QPU_A_BALLOT:
+ case V3D_QPU_A_BCASTF:
+ case V3D_QPU_A_ALLEQ:
+ case V3D_QPU_A_ALLFEQ:
+ add_read_dep(state, state->last_setmsf, n);
+ break;
+
default:
break;
}
@@ -384,6 +434,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
add_write_dep(state, &state->last_r[4], n);
if (v3d_qpu_writes_r5(devinfo, inst))
add_write_dep(state, &state->last_r[5], n);
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst))
+ add_write_dep(state, &state->last_rf[0], n);
/* If we add any more dependencies here we should consider whether we
* also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
@@ -492,9 +544,16 @@ struct choose_scoreboard {
int last_thrsw_tick;
int last_branch_tick;
int last_setmsf_tick;
- bool tlb_locked;
+ bool first_thrsw_emitted;
+ bool last_thrsw_emitted;
bool fixup_ldvary;
int ldvary_count;
+ int pending_ldtmu_count;
+ bool first_ldtmu_after_thrsw;
+
+ /* V3D 7.x */
+ int last_implicit_rf0_write_tick;
+ bool has_rf0_flops_conflict;
};
static bool
@@ -519,7 +578,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard,
}
static bool
-reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
+reads_too_soon(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst, uint8_t raddr)
+{
+ switch (raddr) {
+ case 0: /* ldvary delayed write of C coefficient to rf0 */
+ if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static bool
+reads_too_soon_after_write(const struct v3d_device_info *devinfo,
+ struct choose_scoreboard *scoreboard,
struct qinst *qinst)
{
const struct v3d_qpu_instr *inst = &qinst->qpu;
@@ -531,24 +607,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
if (inst->alu.add.op != V3D_QPU_A_NOP) {
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
- return true;
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr))
+ return true;
+ }
}
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
- return true;
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr))
+ return true;
+ }
}
}
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
- return true;
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr))
+ return true;
+ }
}
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
- return true;
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr))
+ return true;
+ }
}
}
@@ -572,45 +668,83 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo,
v3d_qpu_writes_r4(devinfo, inst))
return true;
+ if (devinfo->ver == 42)
+ return false;
+
+ /* Don't schedule anything that writes rf0 right after ldvary, since
+ * that would clash with the ldvary's delayed rf0 write (the exception
+ * is another ldvary, since its implicit rf0 write would also have
+ * one cycle of delay and would not clash).
+ */
+ if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick &&
+ (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
+ (v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
+ !inst->sig.ldvary))) {
+ return true;
+ }
+
return false;
}
static bool
-pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
+scoreboard_is_locked(struct choose_scoreboard *scoreboard,
+ bool lock_scoreboard_on_first_thrsw)
+{
+ if (lock_scoreboard_on_first_thrsw) {
+ return scoreboard->first_thrsw_emitted &&
+ scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
+ }
+
+ return scoreboard->last_thrsw_emitted &&
+ scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
+}
+
+static bool
+pixel_scoreboard_too_soon(struct v3d_compile *c,
+ struct choose_scoreboard *scoreboard,
const struct v3d_qpu_instr *inst)
{
- return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
+ return qpu_inst_is_tlb(inst) &&
+ !scoreboard_is_locked(scoreboard,
+ c->lock_scoreboard_on_first_thrsw);
}
static bool
-qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
+qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
uint32_t waddr) {
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return false;
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
- inst->raddr_a == waddr)
- return true;
+ if (devinfo->ver < 71) {
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
+ inst->raddr_a == waddr)
+ return true;
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
- !inst->sig.small_imm && (inst->raddr_b == waddr))
- return true;
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
+ !inst->sig.small_imm_b && (inst->raddr_b == waddr))
+ return true;
+ } else {
+ if (v3d71_qpu_reads_raddr(inst, waddr))
+ return true;
+ }
return false;
}
static bool
-mux_read_stalls(struct choose_scoreboard *scoreboard,
- const struct v3d_qpu_instr *inst)
+read_stalls(const struct v3d_device_info *devinfo,
+ struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst)
{
return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
- qpu_instruction_uses_rf(inst,
+ qpu_instruction_uses_rf(devinfo, inst,
scoreboard->last_stallable_sfu_reg);
}
/* We define a max schedule priority to allow negative priorities as result of
- * substracting this max when an instruction stalls. So instructions that
+ * subtracting this max when an instruction stalls. So instructions that
* stall have lower priority than regular instructions. */
#define MAX_SCHEDULE_PRIORITY 16
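
The subtraction trick above is worth spelling out: baseline priorities live in [0, MAX_SCHEDULE_PRIORITY), so subtracting the max pushes every stalling candidate strictly below every non-stalling one while keeping their relative order. A minimal sketch (hypothetical helper name, not part of this file):

    static int
    effective_priority(int base_prio, bool stalls)
    {
            /* base_prio is in [0, MAX_SCHEDULE_PRIORITY), so the result
             * for a stalling instruction is in [-MAX_SCHEDULE_PRIORITY, 0):
             * negative, hence below any non-stalling candidate.
             */
            return stalls ? base_prio - MAX_SCHEDULE_PRIORITY : base_prio;
    }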
@@ -628,19 +762,32 @@ get_instruction_priority(const struct v3d_device_info *devinfo,
return next_score;
next_score++;
+ /* Empirical testing shows that using priorities to hide latency of
+ * TMU operations when scheduling QPU leads to slightly worse
+ * performance, even at 2 threads. We think this is because the thread
+ * switching is already quite effective at hiding latency and NIR
+ * scheduling (and possibly TMU pipelining too) are sufficient to hide
+ * TMU latency, so piling up on that here doesn't provide any benefits
+ * and instead may cause us to postpone critical paths that depend on
+ * the TMU results.
+ */
+#if 0
/* Schedule texture read results collection late to hide latency. */
if (v3d_qpu_waits_on_tmu(inst))
return next_score;
next_score++;
+#endif
/* Default score for things that aren't otherwise special. */
baseline_score = next_score;
next_score++;
+#if 0
/* Schedule texture read setup early to hide their latency better. */
if (v3d_qpu_writes_tmu(devinfo, inst))
return next_score;
next_score++;
+#endif
/* We should increase the maximum if we assert here */
assert(next_score < MAX_SCHEDULE_PRIORITY);
@@ -648,48 +795,59 @@ get_instruction_priority(const struct v3d_device_info *devinfo,
return baseline_score;
}
-static bool
-qpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo,
- enum v3d_qpu_waddr waddr)
-{
- return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) ||
- v3d_qpu_magic_waddr_is_sfu(waddr) ||
- v3d_qpu_magic_waddr_is_tlb(waddr) ||
- v3d_qpu_magic_waddr_is_vpm(waddr) ||
- v3d_qpu_magic_waddr_is_tsy(waddr));
-}
+enum {
+ V3D_PERIPHERAL_VPM_READ = (1 << 0),
+ V3D_PERIPHERAL_VPM_WRITE = (1 << 1),
+ V3D_PERIPHERAL_VPM_WAIT = (1 << 2),
+ V3D_PERIPHERAL_SFU = (1 << 3),
+ V3D_PERIPHERAL_TMU_WRITE = (1 << 4),
+ V3D_PERIPHERAL_TMU_READ = (1 << 5),
+ V3D_PERIPHERAL_TMU_WAIT = (1 << 6),
+ V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7),
+ V3D_PERIPHERAL_TSY = (1 << 8),
+ V3D_PERIPHERAL_TLB_READ = (1 << 9),
+ V3D_PERIPHERAL_TLB_WRITE = (1 << 10),
+};
-static bool
-qpu_accesses_peripheral(const struct v3d_device_info *devinfo,
- const struct v3d_qpu_instr *inst)
+static uint32_t
+qpu_peripherals(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst)
{
- if (v3d_qpu_uses_vpm(inst))
- return true;
+ uint32_t result = 0;
+ if (v3d_qpu_reads_vpm(inst))
+ result |= V3D_PERIPHERAL_VPM_READ;
+ if (v3d_qpu_writes_vpm(inst))
+ result |= V3D_PERIPHERAL_VPM_WRITE;
+ if (v3d_qpu_waits_vpm(inst))
+ result |= V3D_PERIPHERAL_VPM_WAIT;
+
+ if (v3d_qpu_writes_tmu(devinfo, inst))
+ result |= V3D_PERIPHERAL_TMU_WRITE;
+ if (inst->sig.ldtmu)
+ result |= V3D_PERIPHERAL_TMU_READ;
+ if (inst->sig.wrtmuc)
+ result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG;
+
if (v3d_qpu_uses_sfu(inst))
- return true;
+ result |= V3D_PERIPHERAL_SFU;
+
+ if (v3d_qpu_reads_tlb(inst))
+ result |= V3D_PERIPHERAL_TLB_READ;
+ if (v3d_qpu_writes_tlb(inst))
+ result |= V3D_PERIPHERAL_TLB_WRITE;
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.op != V3D_QPU_A_NOP &&
inst->alu.add.magic_write &&
- qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) {
- return true;
+ v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) {
+ result |= V3D_PERIPHERAL_TSY;
}
if (inst->alu.add.op == V3D_QPU_A_TMUWT)
- return true;
-
- if (inst->alu.mul.op != V3D_QPU_M_NOP &&
- inst->alu.mul.magic_write &&
- qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) {
- return true;
- }
+ result |= V3D_PERIPHERAL_TMU_WAIT;
}
- return (inst->sig.ldvpm ||
- inst->sig.ldtmu ||
- inst->sig.ldtlb ||
- inst->sig.ldtlbu ||
- inst->sig.wrtmuc);
+ return result;
}
static bool
@@ -697,30 +855,82 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *a,
const struct v3d_qpu_instr *b)
{
- const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a);
- const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b);
+ const uint32_t a_peripherals = qpu_peripherals(devinfo, a);
+ const uint32_t b_peripherals = qpu_peripherals(devinfo, b);
/* We can always do one peripheral access per instruction. */
- if (!a_uses_peripheral || !b_uses_peripheral)
+ if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1)
return true;
- if (devinfo->ver < 41)
+ /* V3D 4.x can't do more than one peripheral access except in a
+ * few cases:
+ */
+ if (devinfo->ver == 42) {
+ /* WRTMUC signal with TMU register write (other than tmuc). */
+ if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
+ }
+ if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ a_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
+ }
+
+ /* TMU read with VPM read/write. */
+ if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
+ (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
+ b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+ return true;
+ }
+ if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
+ (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
+ a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+ return true;
+ }
+
return false;
+ }
- /* V3D 4.1 and later allow TMU read along with a VPM read or write, and
- * WRTMUC with a TMU magic register write (other than tmuc).
- */
- if ((a->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(b)) ||
- (b->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(a))) {
- return true;
+ /* V3D 7.x can't have more than one of these restricted peripherals */
+ const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE |
+ V3D_PERIPHERAL_TMU_WRTMUC_SIG |
+ V3D_PERIPHERAL_TSY |
+ V3D_PERIPHERAL_TLB_READ |
+ V3D_PERIPHERAL_SFU |
+ V3D_PERIPHERAL_VPM_READ |
+ V3D_PERIPHERAL_VPM_WRITE;
+
+ const uint32_t a_restricted = a_peripherals & restricted;
+ const uint32_t b_restricted = b_peripherals & restricted;
+ if (a_restricted && b_restricted) {
+ /* WRTMUC signal with TMU register write (other than tmuc) is
+ * allowed though.
+ */
+ if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ b_restricted == V3D_PERIPHERAL_TMU_WRITE &&
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
+ (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ a_restricted == V3D_PERIPHERAL_TMU_WRITE &&
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) {
+ return false;
+ }
}
- if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
- (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) {
- return true;
+ /* Only one TMU read per instruction */
+ if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) &&
+ (b_peripherals & V3D_PERIPHERAL_TMU_READ)) {
+ return false;
}
- return false;
+ /* Only one TLB access per instruction */
+ if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
+ V3D_PERIPHERAL_TLB_READ)) &&
+ (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
+ V3D_PERIPHERAL_TLB_READ))) {
+ return false;
+ }
+
+ return true;
}
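
The refactor from a single qpu_accesses_peripheral() boolean to a qpu_peripherals() bitmask is what makes the per-version rules above expressible. A self-contained sketch of the idea, with simplified hypothetical names and only two of the real pairing rules modeled:

    #include <stdbool.h>
    #include <stdint.h>

    enum {
            PERIPH_TMU_READ  = 1 << 0,
            PERIPH_VPM_READ  = 1 << 1,
            PERIPH_VPM_WRITE = 1 << 2,
            PERIPH_TLB_READ  = 1 << 3,
            PERIPH_TLB_WRITE = 1 << 4,
    };

    static bool
    can_pair(uint32_t a, uint32_t b)
    {
            /* One peripheral access between the two is always allowed. */
            if (__builtin_popcount(a) + __builtin_popcount(b) <= 1)
                    return true;

            /* Only one TLB access per instruction. */
            const uint32_t tlb = PERIPH_TLB_READ | PERIPH_TLB_WRITE;
            if ((a & tlb) && (b & tlb))
                    return false;

            /* A TMU read can pair with a VPM access (4.x special case). */
            const uint32_t vpm = PERIPH_VPM_READ | PERIPH_VPM_WRITE;
            if ((a == PERIPH_TMU_READ && (b & vpm)) ||
                (b == PERIPH_TMU_READ && (a & vpm)))
                    return true;

            return false;
    }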
/* Compute a bitmask of which rf registers are used between
@@ -736,42 +946,67 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a,
uint64_t raddrs_used = 0;
if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
raddrs_used |= (1ll << a->raddr_a);
- if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
+ if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
raddrs_used |= (1ll << a->raddr_b);
if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
raddrs_used |= (1ll << b->raddr_a);
- if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
+ if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
raddrs_used |= (1ll << b->raddr_b);
return raddrs_used;
}
-/* Take two instructions and attempt to merge their raddr fields
- * into one merged instruction. Returns false if the two instructions
- * access more than two different rf registers between them, or more
- * than one rf register and one small immediate.
+/* Takes two instructions and attempts to merge their raddr fields (including
+ * small immediates) into one merged instruction. For V3D 4.x, returns false
+ * if the two instructions access more than two different rf registers between
+ * them, or more than one rf register and one small immediate. For 7.x returns
+ * false if both instructions use small immediates.
*/
static bool
qpu_merge_raddrs(struct v3d_qpu_instr *result,
const struct v3d_qpu_instr *add_instr,
- const struct v3d_qpu_instr *mul_instr)
+ const struct v3d_qpu_instr *mul_instr,
+ const struct v3d_device_info *devinfo)
{
+ if (devinfo->ver >= 71) {
+ assert(add_instr->sig.small_imm_a +
+ add_instr->sig.small_imm_b <= 1);
+ assert(add_instr->sig.small_imm_c +
+ add_instr->sig.small_imm_d == 0);
+ assert(mul_instr->sig.small_imm_a +
+ mul_instr->sig.small_imm_b == 0);
+ assert(mul_instr->sig.small_imm_c +
+ mul_instr->sig.small_imm_d <= 1);
+
+ result->sig.small_imm_a = add_instr->sig.small_imm_a;
+ result->sig.small_imm_b = add_instr->sig.small_imm_b;
+ result->sig.small_imm_c = mul_instr->sig.small_imm_c;
+ result->sig.small_imm_d = mul_instr->sig.small_imm_d;
+
+ return (result->sig.small_imm_a +
+ result->sig.small_imm_b +
+ result->sig.small_imm_c +
+ result->sig.small_imm_d) <= 1;
+ }
+
+ assert(devinfo->ver == 42);
+
uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
int naddrs = util_bitcount64(raddrs_used);
if (naddrs > 2)
return false;
- if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
+ if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) {
if (naddrs > 1)
return false;
- if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
+ if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b)
if (add_instr->raddr_b != mul_instr->raddr_b)
return false;
- result->sig.small_imm = true;
- result->raddr_b = add_instr->sig.small_imm ?
+ result->sig.small_imm_b = true;
+ result->raddr_b = add_instr->sig.small_imm_b ?
add_instr->raddr_b : mul_instr->raddr_b;
}
@@ -782,23 +1017,23 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
raddrs_used &= ~(1ll << raddr_a);
result->raddr_a = raddr_a;
- if (!result->sig.small_imm) {
+ if (!result->sig.small_imm_b) {
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
raddr_a == add_instr->raddr_b) {
- if (add_instr->alu.add.a == V3D_QPU_MUX_B)
- result->alu.add.a = V3D_QPU_MUX_A;
- if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B)
+ result->alu.add.a.mux = V3D_QPU_MUX_A;
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B &&
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
- result->alu.add.b = V3D_QPU_MUX_A;
+ result->alu.add.b.mux = V3D_QPU_MUX_A;
}
}
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
raddr_a == mul_instr->raddr_b) {
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
- result->alu.mul.a = V3D_QPU_MUX_A;
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B)
+ result->alu.mul.a.mux = V3D_QPU_MUX_A;
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B &&
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
- result->alu.mul.b = V3D_QPU_MUX_A;
+ result->alu.mul.b.mux = V3D_QPU_MUX_A;
}
}
}
@@ -809,20 +1044,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
result->raddr_b = raddr_b;
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
raddr_b == add_instr->raddr_a) {
- if (add_instr->alu.add.a == V3D_QPU_MUX_A)
- result->alu.add.a = V3D_QPU_MUX_B;
- if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A)
+ result->alu.add.a.mux = V3D_QPU_MUX_B;
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A &&
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
- result->alu.add.b = V3D_QPU_MUX_B;
+ result->alu.add.b.mux = V3D_QPU_MUX_B;
}
}
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
raddr_b == mul_instr->raddr_a) {
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
- result->alu.mul.a = V3D_QPU_MUX_B;
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A)
+ result->alu.mul.a.mux = V3D_QPU_MUX_B;
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A &&
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
- result->alu.mul.b = V3D_QPU_MUX_B;
+ result->alu.mul.b.mux = V3D_QPU_MUX_B;
}
}
@@ -855,7 +1090,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op)
}
static void
-qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
+qpu_convert_add_to_mul(const struct v3d_device_info *devinfo,
+ struct v3d_qpu_instr *inst)
{
STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
assert(inst->alu.add.op != V3D_QPU_A_NOP);
@@ -871,6 +1107,87 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
inst->flags.ac = V3D_QPU_COND_NONE;
inst->flags.apf = V3D_QPU_PF_NONE;
inst->flags.auf = V3D_QPU_UF_NONE;
+
+ inst->alu.mul.output_pack = inst->alu.add.output_pack;
+
+ inst->alu.mul.a.unpack = inst->alu.add.a.unpack;
+ inst->alu.mul.b.unpack = inst->alu.add.b.unpack;
+ inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ if (devinfo->ver >= 71) {
+ assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d);
+ assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1);
+ if (inst->sig.small_imm_a) {
+ inst->sig.small_imm_c = true;
+ inst->sig.small_imm_a = false;
+ } else if (inst->sig.small_imm_b) {
+ inst->sig.small_imm_d = true;
+ inst->sig.small_imm_b = false;
+ }
+ }
+}
+
+static bool
+can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op)
+{
+ switch (op) {
+ case V3D_QPU_M_MOV:
+ case V3D_QPU_M_FMOV:
+ return devinfo->ver >= 71;
+ default:
+ return false;
+ }
+}
+
+static enum v3d_qpu_mul_op
+mul_op_as_add_op(enum v3d_qpu_mul_op op)
+{
+ switch (op) {
+ case V3D_QPU_M_MOV:
+ return V3D_QPU_A_MOV;
+ case V3D_QPU_M_FMOV:
+ return V3D_QPU_A_FMOV;
+ default:
+ unreachable("unexpected mov opcode");
+ }
+}
+
+static void
+qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
+{
+ STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul));
+ assert(inst->alu.mul.op != V3D_QPU_M_NOP);
+ assert(inst->alu.add.op == V3D_QPU_A_NOP);
+
+ memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add));
+ inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op);
+ inst->alu.mul.op = V3D_QPU_M_NOP;
+
+ inst->flags.ac = inst->flags.mc;
+ inst->flags.apf = inst->flags.mpf;
+ inst->flags.auf = inst->flags.muf;
+ inst->flags.mc = V3D_QPU_COND_NONE;
+ inst->flags.mpf = V3D_QPU_PF_NONE;
+ inst->flags.muf = V3D_QPU_UF_NONE;
+
+ inst->alu.add.output_pack = inst->alu.mul.output_pack;
+ inst->alu.add.a.unpack = inst->alu.mul.a.unpack;
+ inst->alu.add.b.unpack = inst->alu.mul.b.unpack;
+ inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+ inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b);
+ assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1);
+ if (inst->sig.small_imm_c) {
+ inst->sig.small_imm_a = true;
+ inst->sig.small_imm_c = false;
+ } else if (inst->sig.small_imm_d) {
+ inst->sig.small_imm_b = true;
+ inst->sig.small_imm_d = false;
+ }
}
static bool
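
The motivation for the new mul-to-add path mirrors the existing add-to-mul one: on V3D 7.x a MOV/FMOV emitted on the mul ALU can be retargeted to the idle add ALU (carrying its condition flags, pack/unpack modifiers and small-immediate slots along) so that two mul-only candidates can still dual-issue. A hypothetical illustration (not real QPU syntax):

    /* Two candidates that both want the mul ALU:
     *
     *     mov  rf10, rf1      ; M_MOV on the mul ALU
     *     fmov rf11, rf2      ; M_FMOV, also on the mul ALU
     *
     * After qpu_convert_mul_to_add() turns one of them into A_MOV on
     * the add ALU, the pair can merge into a single instruction:
     *
     *     mov rf10, rf1 ; fmov rf11, rf2
     */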
@@ -909,20 +1226,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(b->alu.add.op)) {
mul_inst = *b;
- qpu_convert_add_to_mul(&mul_inst);
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
merge.alu.mul = mul_inst.alu.mul;
- merge.flags.mc = b->flags.ac;
- merge.flags.mpf = b->flags.apf;
- merge.flags.muf = b->flags.auf;
+ merge.flags.mc = mul_inst.flags.mc;
+ merge.flags.mpf = mul_inst.flags.mpf;
+ merge.flags.muf = mul_inst.flags.muf;
add_instr = a;
mul_instr = &mul_inst;
} else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(a->alu.add.op)) {
mul_inst = *a;
- qpu_convert_add_to_mul(&mul_inst);
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
merge = mul_inst;
merge.alu.add = b->alu.add;
@@ -938,22 +1255,62 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
}
}
+ struct v3d_qpu_instr add_inst;
if (b->alu.mul.op != V3D_QPU_M_NOP) {
- if (a->alu.mul.op != V3D_QPU_M_NOP)
- return false;
- merge.alu.mul = b->alu.mul;
+ if (a->alu.mul.op == V3D_QPU_M_NOP) {
+ merge.alu.mul = b->alu.mul;
- merge.flags.mc = b->flags.mc;
- merge.flags.mpf = b->flags.mpf;
- merge.flags.muf = b->flags.muf;
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
- mul_instr = b;
- add_instr = a;
+ mul_instr = b;
+ add_instr = a;
+ }
+ /* If a's mul op is used but its add op is not, then see if we
+ * can convert either a's mul op or b's mul op to an add op
+ * so we can merge.
+ */
+ else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, b->alu.mul.op)) {
+ add_inst = *b;
+ qpu_convert_mul_to_add(&add_inst);
+
+ merge.alu.add = add_inst.alu.add;
+
+ merge.flags.ac = add_inst.flags.ac;
+ merge.flags.apf = add_inst.flags.apf;
+ merge.flags.auf = add_inst.flags.auf;
+
+ mul_instr = a;
+ add_instr = &add_inst;
+ } else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, a->alu.mul.op)) {
+ add_inst = *a;
+ qpu_convert_mul_to_add(&add_inst);
+
+ merge = add_inst;
+ merge.alu.mul = b->alu.mul;
+
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
+
+ mul_instr = b;
+ add_instr = &add_inst;
+ } else {
+ return false;
+ }
}
+ /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
+ * they have restrictions on the number of raddrs that can be addressed
+ * in a single instruction. In V3D 7.x, we don't have that restriction,
+ * but we are still limited to a single small immediate per instruction.
+ */
if (add_instr && mul_instr &&
- !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
- return false;
+ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
+ return false;
}
merge.sig.thrsw |= b->sig.thrsw;
@@ -964,7 +1321,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
merge.sig.ldtmu |= b->sig.ldtmu;
merge.sig.ldvary |= b->sig.ldvary;
merge.sig.ldvpm |= b->sig.ldvpm;
- merge.sig.small_imm |= b->sig.small_imm;
merge.sig.ldtlb |= b->sig.ldtlb;
merge.sig.ldtlbu |= b->sig.ldtlbu;
merge.sig.ucb |= b->sig.ucb;
@@ -1047,24 +1403,25 @@ retry:
* regfile A or B that was written to by the previous
* instruction."
*/
- if (reads_too_soon_after_write(scoreboard, n->inst))
+ if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
continue;
if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
continue;
- /* "A scoreboard wait must not occur in the first two
- * instructions of a fragment shader. This is either the
- * explicit Wait for Scoreboard signal or an implicit wait
- * with the first tile-buffer read or write instruction."
+ /* "Before doing a TLB access a scoreboard wait must have been
+ * done. This happens either on the first or last thread
+ * switch, depending on a setting (scb_wait_on_first_thrsw) in
+ * the shader state."
*/
- if (pixel_scoreboard_too_soon(scoreboard, inst))
+ if (pixel_scoreboard_too_soon(c, scoreboard, inst))
continue;
- /* ldunif and ldvary both write r5, but ldunif does so a tick
- * sooner. If the ldvary's r5 wasn't used, then ldunif might
+ /* ldunif and ldvary both write the same register (r5 for v42
+ * and below, rf0 for v71), but ldunif does so a tick sooner.
+ * If the ldvary's register wasn't used, then ldunif might
* otherwise get scheduled so ldunif and ldvary try to update
- * r5 in the same tick.
+ * the register in the same tick.
*/
if ((inst->sig.ldunif || inst->sig.ldunifa) &&
scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
@@ -1131,24 +1488,54 @@ retry:
continue;
}
- /* Don't merge in something that will lock the TLB.
- * Hopwefully what we have in inst will release some
- * other instructions, allowing us to delay the
- * TLB-locking instruction until later.
+ /* Don't merge TLB instructions before we have acquired
+ * the scoreboard lock.
*/
- if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
+ if (pixel_scoreboard_too_soon(c, scoreboard, inst))
continue;
- /* When we succesfully pair up an ldvary we then try
+ /* When we successfully pair up an ldvary we then try
* to merge it into the previous instruction if
* possible to improve pipelining. Don't pick up the
* ldvary now if the follow-up fixup would place
* it in the delay slots of a thrsw, which is not
* allowed and would prevent the fixup from being
- * successul.
+ * successful. In V3D 7.x we can allow this to happen
+ * as long as it is not the last delay slot.
*/
- if (inst->sig.ldvary &&
- scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
+ if (inst->sig.ldvary) {
+ if (c->devinfo->ver == 42 &&
+ scoreboard->last_thrsw_tick + 2 >=
+ scoreboard->tick - 1) {
+ continue;
+ }
+ if (c->devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 ==
+ scoreboard->tick - 1) {
+ continue;
+ }
+ }
+
+ /* We can emit a new tmu lookup with a previous ldtmu
+ * if doing this would free just enough space in the
+ * TMU output fifo so we don't overflow, however, this
+ * is only safe if the ldtmu cannot stall.
+ *
+ * A ldtmu can stall if it is not the first following a
+ * thread switch and corresponds to the first word of a
+ * read request.
+ *
+ * FIXME: For now we forbid pairing up a new lookup
+ * with a previous ldtmu that is not the first after a
+ * thrsw if that could overflow the TMU output fifo
+ * regardless of whether the ldtmu is reading the first
+ * word of a TMU result or not, since we don't track
+ * this aspect in the compiler yet.
+ */
+ if (prev_inst->inst->qpu.sig.ldtmu &&
+ !scoreboard->first_ldtmu_after_thrsw &&
+ (scoreboard->pending_ldtmu_count +
+ n->inst->ldtmu_count > 16 / c->threads)) {
continue;
}
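
The `16 / c->threads` bound comes from the TMU output fifo being statically partitioned across threads (16 words total on these parts, per the comment above, with each ldtmu consuming one word). A worked sketch of the check, with hypothetical names:

    /* Per-thread TMU output fifo budget: 16 words split across threads. */
    static bool
    tmu_pairing_would_overflow(int pending_ldtmu_words, int new_lookup_words,
                               int threads)
    {
            return pending_ldtmu_words + new_lookup_words > 16 / threads;
    }

    /* e.g. threads == 2 gives a budget of 8 words: with 7 words pending,
     * pairing in a lookup that returns 2 more words would need 9 > 8, so
     * the scheduler refuses the merge. The restriction only applies when
     * the paired ldtmu is not the first one after a thread switch, since
     * that one cannot stall.
     */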
@@ -1161,7 +1548,7 @@ retry:
int prio = get_instruction_priority(c->devinfo, inst);
- if (mux_read_stalls(scoreboard, inst)) {
+ if (read_stalls(c->devinfo, scoreboard, inst)) {
/* Don't merge an instruction that stalls */
if (prev_inst)
continue;
@@ -1225,7 +1612,7 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
{
if (v3d_qpu_magic_waddr_is_sfu(waddr))
scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
- else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
+ else if (waddr == V3D_QPU_WADDR_UNIFA)
scoreboard->last_unifa_write_tick = scoreboard->tick;
}
@@ -1240,10 +1627,87 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
}
static void
+update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
+ const struct qinst *inst)
+{
+ /* Track if we have seen any ldtmu after the last thread switch */
+ if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
+ scoreboard->first_ldtmu_after_thrsw = true;
+
+ /* Track the number of pending ldtmu instructions for outstanding
+ * TMU lookups.
+ */
+ scoreboard->pending_ldtmu_count += inst->ldtmu_count;
+ if (inst->qpu.sig.ldtmu) {
+ assert(scoreboard->pending_ldtmu_count > 0);
+ scoreboard->pending_ldtmu_count--;
+ scoreboard->first_ldtmu_after_thrsw = false;
+ }
+}
+
+static void
+set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst,
+ const struct v3d_device_info *devinfo)
+{
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick &&
+ v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
+ !inst->sig_magic) {
+ scoreboard->has_rf0_flops_conflict = true;
+ }
+}
+
+static void
+update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst,
+ const struct v3d_device_info *devinfo)
+{
+ if (devinfo->ver < 71)
+ return;
+
+ /* Thread switch restrictions:
+ *
+ * At the point of a thread switch or thread end (when the actual
+ * thread switch or thread end happens, not when the signalling
+ * instruction is processed):
+ *
+ * - If the most recent write to rf0 was from a ldunif, ldunifa, or
+ * ldvary instruction in which another signal also wrote to the
+ * register file, and the final instruction of the thread section
+ * contained a signal which wrote to the register file, then the
+ * value of rf0 is undefined at the start of the new section
+ *
+ * Here we use the scoreboard to track if our last rf0 implicit write
+ * happens at the same time that another signal writes the register
+ * file (has_rf0_flops_conflict). We will use that information when
+ * scheduling thrsw instructions to avoid putting anything in their
+ * last delay slot which has a signal that writes to the register file.
+ */
+
+ /* Reset tracking if we have an explicit rf0 write or we are starting
+ * a new thread section.
+ */
+ if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
+ scoreboard->tick - scoreboard->last_thrsw_tick == 3) {
+ scoreboard->last_implicit_rf0_write_tick = -10;
+ scoreboard->has_rf0_flops_conflict = false;
+ }
+
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) {
+ scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ?
+ scoreboard->tick + 1 : scoreboard->tick;
+ }
+
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+}
+
+static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
- const struct v3d_qpu_instr *inst,
+ const struct qinst *qinst,
const struct v3d_device_info *devinfo)
{
+ const struct v3d_qpu_instr *inst = &qinst->qpu;
+
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
return;
@@ -1271,11 +1735,18 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
}
}
+ if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && inst->sig_magic) {
+ update_scoreboard_for_magic_waddr(scoreboard,
+ inst->sig_addr,
+ devinfo);
+ }
+
if (inst->sig.ldvary)
scoreboard->last_ldvary_tick = scoreboard->tick;
- if (qpu_inst_is_tlb(inst))
- scoreboard->tlb_locked = true;
+ update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo);
+
+ update_scoreboard_tmu_tracking(scoreboard, qinst);
}
static void
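
To make the rf0 "flops conflict" tracking above concrete, here is a hypothetical V3D 7.x schedule that trips it (the mnemonics are illustrative, not real QPU syntax):

    /* tick N  :  ... ; ldvary        -> implicit rf0 write lands at N+1
     * tick N+1:  ... ; ldunif.rf12   -> a signal writes the register
     *                                   file on the same tick as the
     *                                   delayed rf0 write: this sets
     *                                   has_rf0_flops_conflict
     *
     * If the last delay slot of a later thrsw in this thread section
     * also contains a signal that writes the register file, rf0 is
     * undefined at the start of the next section, so the scheduler must
     * keep such instructions out of that slot (see
     * qpu_inst_before_thrsw_valid_in_delay_slot).
     */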
@@ -1352,23 +1823,25 @@ instruction_latency(const struct v3d_device_info *devinfo,
after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
return latency;
- if (before_inst->alu.add.magic_write) {
+ if (v3d_qpu_instr_is_sfu(before_inst))
+ return 2;
+
+ if (before_inst->alu.add.op != V3D_QPU_A_NOP &&
+ before_inst->alu.add.magic_write) {
latency = MAX2(latency,
magic_waddr_latency(devinfo,
before_inst->alu.add.waddr,
after_inst));
}
- if (before_inst->alu.mul.magic_write) {
+ if (before_inst->alu.mul.op != V3D_QPU_M_NOP &&
+ before_inst->alu.mul.magic_write) {
latency = MAX2(latency,
magic_waddr_latency(devinfo,
before_inst->alu.mul.waddr,
after_inst));
}
- if (v3d_qpu_instr_is_sfu(before_inst))
- return 2;
-
return latency;
}
@@ -1437,7 +1910,7 @@ insert_scheduled_instruction(struct v3d_compile *c,
{
list_addtail(&inst->link, &block->instructions);
- update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
+ update_scoreboard_for_chosen(scoreboard, inst, c->devinfo);
c->qpu_inst_count++;
scoreboard->tick++;
}
@@ -1464,16 +1937,13 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
{
const struct v3d_qpu_instr *inst = &qinst->qpu;
- /* Only TLB Z writes are prohibited in the last slot, but we don't
- * have those flagged so prohibit all TLB ops for now.
- */
- if (slot == 2 && qpu_inst_is_tlb(inst))
+ if (slot == 2 && qinst->is_tlb_z_write)
return false;
if (slot > 0 && qinst->uniform != ~0)
return false;
- if (v3d_qpu_uses_vpm(inst))
+ if (c->devinfo->ver == 42 && v3d_qpu_waits_vpm(inst))
return false;
if (inst->sig.ldvary)
@@ -1481,36 +1951,64 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
/* GFXH-1625: TMUWT not allowed in the final instruction. */
- if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
+ if (c->devinfo->ver == 42 && slot == 2 &&
+ inst->alu.add.op == V3D_QPU_A_TMUWT) {
return false;
+ }
- /* No writing physical registers at the end. */
- if (!inst->alu.add.magic_write ||
- !inst->alu.mul.magic_write) {
- return false;
+ if (c->devinfo->ver == 42) {
+ /* No writing physical registers at the end. */
+ bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
+ bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
+ if ((!add_is_nop && !inst->alu.add.magic_write) ||
+ (!mul_is_nop && !inst->alu.mul.magic_write)) {
+ return false;
+ }
+
+ if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
+ !inst->sig_magic) {
+ return false;
+ }
}
- if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
- return false;
+ if (c->devinfo->ver >= 71) {
+ /* The thread end instruction must not write to the
+ * register file via the add/mul ALUs.
+ */
+ if (slot == 0 &&
+ (!inst->alu.add.magic_write ||
+ !inst->alu.mul.magic_write)) {
+ return false;
+ }
+ }
- /* RF0-2 might be overwritten during the delay slots by
- * fragment shader setup.
- */
- if (inst->raddr_a < 3 &&
- (inst->alu.add.a == V3D_QPU_MUX_A ||
- inst->alu.add.b == V3D_QPU_MUX_A ||
- inst->alu.mul.a == V3D_QPU_MUX_A ||
- inst->alu.mul.b == V3D_QPU_MUX_A)) {
- return false;
+ if (c->devinfo->ver == 42) {
+ /* RF0-2 might be overwritten during the delay slots by
+ * fragment shader setup.
+ */
+ if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
+ return false;
+
+ if (inst->raddr_b < 3 &&
+ !inst->sig.small_imm_b &&
+ v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
+ return false;
+ }
}
- if (inst->raddr_b < 3 &&
- !inst->sig.small_imm &&
- (inst->alu.add.a == V3D_QPU_MUX_B ||
- inst->alu.add.b == V3D_QPU_MUX_B ||
- inst->alu.mul.a == V3D_QPU_MUX_B ||
- inst->alu.mul.b == V3D_QPU_MUX_B)) {
- return false;
+ if (c->devinfo->ver >= 71) {
+ /* RF2-3 might be overwritten during the delay slots by
+ * fragment shader setup.
+ */
+ if (v3d71_qpu_reads_raddr(inst, 2) ||
+ v3d71_qpu_reads_raddr(inst, 3)) {
+ return false;
+ }
+
+ if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) ||
+ v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) {
+ return false;
+ }
}
}
@@ -1526,6 +2024,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
*/
static bool
qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
+ struct choose_scoreboard *scoreboard,
const struct qinst *qinst,
uint32_t slot)
{
@@ -1533,15 +2032,19 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
* thread. The simulator complains for safety, though it
* would only occur for dead code in our case.
*/
- if (slot > 0 &&
- qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
- (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
- v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
- return false;
+ if (slot > 0) {
+ if (c->devinfo->ver == 42 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu))
+ return false;
+ if (c->devinfo->ver >= 71 && v3d_qpu_instr_is_sfu(&qinst->qpu))
+ return false;
}
- if (slot > 0 && qinst->qpu.sig.ldvary)
- return false;
+ if (qinst->qpu.sig.ldvary) {
+ if (c->devinfo->ver == 42 && slot > 0)
+ return false;
+ if (c->devinfo->ver >= 71 && slot == 2)
+ return false;
+ }
/* unifa and the following 3 instructions can't overlap a
* thread switch/end. The docs further clarify that this means
@@ -1560,6 +2063,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
return false;
+ /* See comment when we set has_rf0_flops_conflict for details */
+ if (c->devinfo->ver >= 71 &&
+ slot == 2 &&
+ v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) &&
+ !qinst->qpu.sig_magic) {
+ if (scoreboard->has_rf0_flops_conflict)
+ return false;
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick)
+ return false;
+ }
+
return true;
}
@@ -1579,7 +2093,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
assert(slot <= 2);
/* We merge thrsw instructions back into the instruction stream
- * manually, so any instructions scheduled after a thrsw shold be
+ * manually, so any instructions scheduled after a thrsw should be
* in the actual delay slots and not in the same slot as the thrsw.
*/
assert(slot >= 1);
@@ -1592,7 +2106,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
* also apply to instructions scheduled after the thrsw that we want
* to place in its delay slots.
*/
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot))
return false;
/* TLB access is disallowed until scoreboard wait is executed, which
@@ -1648,6 +2162,14 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
if (v3d_qpu_writes_flags(&qinst->qpu))
return false;
+ /* TSY sync ops materialize at the point of the next thread switch;
+ * therefore, if we have a TSY sync right after a thread switch, we
+ * cannot place it in its delay slots, or we would be moving the sync
+ * to the thrsw before it instead.
+ */
+ if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID)
+ return false;
+
return true;
}
@@ -1656,15 +2178,11 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard
struct qinst *qinst, int instructions_in_sequence,
bool is_thrend)
{
- /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
- if (scoreboard->last_thrsw_tick + 3 >
- scoreboard->tick - instructions_in_sequence) {
- return false;
- }
-
for (int slot = 0; slot < instructions_in_sequence; slot++) {
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard,
+ qinst, slot)) {
return false;
+ }
if (is_thrend &&
!qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
@@ -1714,26 +2232,77 @@ emit_thrsw(struct v3d_compile *c,
/* Find how far back into previous instructions we can put the THRSW. */
int slots_filled = 0;
+ int invalid_sig_count = 0;
+ int invalid_seq_count = 0;
+ bool last_thrsw_after_invalid_ok = false;
struct qinst *merge_inst = NULL;
vir_for_each_inst_rev(prev_inst, block) {
- struct v3d_qpu_sig sig = prev_inst->qpu.sig;
- sig.thrsw = true;
- uint32_t packed_sig;
-
- if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
+ /* No emitting our thrsw while the previous thrsw hasn't
+ * happened yet.
+ */
+ if (scoreboard->last_thrsw_tick + 3 >
+ scoreboard->tick - (slots_filled + 1)) {
break;
+ }
+
if (!valid_thrsw_sequence(c, scoreboard,
prev_inst, slots_filled + 1,
is_thrend)) {
- break;
+ /* Even if the current sequence isn't valid, we may
+ * be able to get a valid sequence by trying to move the
+ * thrsw earlier, so keep going.
+ */
+ invalid_seq_count++;
+ goto cont_block;
+ }
+
+ struct v3d_qpu_sig sig = prev_inst->qpu.sig;
+ sig.thrsw = true;
+ uint32_t packed_sig;
+ if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) {
+ /* If we can't merge the thrsw here because of signal
+ * incompatibility, keep going, we might be able to
+ * merge it in an earlier instruction.
+ */
+ invalid_sig_count++;
+ goto cont_block;
}
+ /* For last thrsw we need 2 consecutive slots that are
+ * thrsw compatible, so if we have previously jumped over
+ * an incompatible signal, flag that we have found the first
+ * valid slot here and keep going.
+ */
+ if (inst->is_last_thrsw && invalid_sig_count > 0 &&
+ !last_thrsw_after_invalid_ok) {
+ last_thrsw_after_invalid_ok = true;
+ invalid_sig_count++;
+ goto cont_block;
+ }
+
+ /* We can merge the thrsw in this instruction */
+ last_thrsw_after_invalid_ok = false;
+ invalid_sig_count = 0;
+ invalid_seq_count = 0;
merge_inst = prev_inst;
+
+cont_block:
if (++slots_filled == 3)
break;
}
+ /* If we jumped over a signal incompatibility and did not manage to
+ * merge the thrsw in the end, we need to adjust slots_filled to match
+ * the last valid merge point.
+ */
+ assert((invalid_sig_count == 0 && invalid_seq_count == 0) ||
+ slots_filled >= invalid_sig_count + invalid_seq_count);
+ if (invalid_sig_count > 0)
+ slots_filled -= invalid_sig_count;
+ if (invalid_seq_count > 0)
+ slots_filled -= invalid_seq_count;
+
bool needs_free = false;
if (merge_inst) {
merge_inst->qpu.sig.thrsw = true;
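
A worked example of the accounting above, assuming the backwards scan visits three instructions:

    /* Worked example (hypothetical): scanning backwards from the thrsw,
     *   slot 1: signal-incompatible        -> invalid_sig_count = 1
     *   slot 2: valid merge point          -> merge_inst set, counters reset
     *   slot 3: fails valid_thrsw_sequence -> invalid_seq_count = 1
     * The loop ends with slots_filled == 3, but the merge actually
     * happened two slots back, so slots_filled -= invalid_seq_count
     * restores slots_filled == 2 to match merge_inst.
     */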
@@ -1747,6 +2316,8 @@ emit_thrsw(struct v3d_compile *c,
merge_inst = inst;
}
+ scoreboard->first_thrsw_emitted = true;
+
/* If we're emitting the last THRSW (other than program end), then
* signal that to the HW by emitting two THRSWs in a row.
*/
@@ -1758,6 +2329,7 @@ emit_thrsw(struct v3d_compile *c,
struct qinst *second_inst =
(struct qinst *)merge_inst->link.next;
second_inst->qpu.sig.thrsw = true;
+ scoreboard->last_thrsw_emitted = true;
}
/* Make sure the thread end executes within the program lifespan */
@@ -1811,10 +2383,11 @@ emit_branch(struct v3d_compile *c,
assert(scoreboard->last_branch_tick + 3 < branch_tick);
assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
- /* Can't place a branch with msfign != 0 and cond != 0,2,3 after
+ /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after
* setmsf.
*/
bool is_safe_msf_branch =
+ c->devinfo->ver >= 71 ||
inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
@@ -1851,6 +2424,14 @@ emit_branch(struct v3d_compile *c,
break;
}
+ /* Do not move up a branch if it can disrupt an ldvary sequence
+ * as that can cause stomping of the r5 register.
+ */
+ if (scoreboard->last_ldvary_tick + 2 >=
+ branch_tick - slots_filled) {
+ break;
+ }
+
/* Can't move a conditional branch before the instruction
* that writes the flags for its condition.
*/
@@ -1890,46 +2471,72 @@ emit_branch(struct v3d_compile *c,
}
static bool
-alu_reads_register(struct v3d_qpu_instr *inst,
+alu_reads_register(const struct v3d_device_info *devinfo,
+ struct v3d_qpu_instr *inst,
bool add, bool magic, uint32_t index)
{
uint32_t num_src;
- enum v3d_qpu_mux mux_a, mux_b;
-
- if (add) {
+ if (add)
num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
- mux_a = inst->alu.add.a;
- mux_b = inst->alu.add.b;
- } else {
+ else
num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
- mux_a = inst->alu.mul.a;
- mux_b = inst->alu.mul.b;
- }
- for (int i = 0; i < num_src; i++) {
- if (magic) {
- if (i == 0 && mux_a == index)
- return true;
- if (i == 1 && mux_b == index)
- return true;
+ if (devinfo->ver == 42) {
+ enum v3d_qpu_mux mux_a, mux_b;
+ if (add) {
+ mux_a = inst->alu.add.a.mux;
+ mux_b = inst->alu.add.b.mux;
} else {
- if (i == 0 && mux_a == V3D_QPU_MUX_A &&
- inst->raddr_a == index) {
- return true;
- }
- if (i == 0 && mux_a == V3D_QPU_MUX_B &&
- inst->raddr_b == index) {
- return true;
- }
- if (i == 1 && mux_b == V3D_QPU_MUX_A &&
- inst->raddr_a == index) {
- return true;
- }
- if (i == 1 && mux_b == V3D_QPU_MUX_B &&
- inst->raddr_b == index) {
- return true;
+ mux_a = inst->alu.mul.a.mux;
+ mux_b = inst->alu.mul.b.mux;
+ }
+
+ for (int i = 0; i < num_src; i++) {
+ if (magic) {
+ if (i == 0 && mux_a == index)
+ return true;
+ if (i == 1 && mux_b == index)
+ return true;
+ } else {
+ if (i == 0 && mux_a == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 0 && mux_a == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
}
}
+
+ return false;
+ }
+
+ assert(devinfo->ver >= 71);
+ assert(!magic);
+
+ uint32_t raddr_a, raddr_b;
+ if (add) {
+ raddr_a = inst->alu.add.a.raddr;
+ raddr_b = inst->alu.add.b.raddr;
+ } else {
+ raddr_a = inst->alu.mul.a.raddr;
+ raddr_b = inst->alu.mul.b.raddr;
+ }
+
+ for (int i = 0; i < num_src; i++) {
+ if (i == 0 && raddr_a == index)
+ return true;
+ if (i == 1 && raddr_b == index)
+ return true;
}
return false;
@@ -1964,7 +2571,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
struct qblock *block,
struct v3d_qpu_instr *inst)
{
- /* We only call this if we have successfuly merged an ldvary into a
+ const struct v3d_device_info *devinfo = c->devinfo;
+
+ /* We only call this if we have successfully merged an ldvary into a
* previous instruction.
*/
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
@@ -1976,9 +2585,20 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
* the ldvary destination, if it does, then moving the ldvary before
* it would overwrite it.
*/
- if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
+ if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index))
return false;
- if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
+ if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index))
+ return false;
+
+ /* The implicit ldvary destination may not be written to by a signal
+ * in the instruction following ldvary. Since we are planning to move
+ * ldvary to the previous instruction, this means we need to check if
+ * the current instruction has any other signal that could create this
+ * conflict. The only other signal that can write to the implicit
+ * ldvary destination that is compatible with ldvary in the same
+ * instruction is ldunif.
+ */
+ if (inst->sig.ldunif)
return false;
/* The previous instruction can't write to the same destination as the
@@ -2003,7 +2623,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
}
/* The previous instruction cannot have a conflicting signal */
- if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
+ if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
+ return false;
+
+ uint32_t sig;
+ struct v3d_qpu_sig new_sig = prev->qpu.sig;
+ new_sig.ldvary = true;
+ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
return false;
/* The previous instruction cannot use flags since ldvary uses the
@@ -2016,9 +2642,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
/* We can't put an ldvary in the delay slots of a thrsw. We should've
* prevented this when pairing up the ldvary with another instruction
- * and flagging it for a fixup.
+ * and flagging it for a fixup. In V3D 7.x this is limited only to the
+ * second delay slot.
*/
- assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
+ assert((devinfo->ver == 42 &&
+ scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
+ (devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));
/* Move the ldvary to the previous instruction and remove it from the
* current one.
@@ -2032,14 +2662,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
inst->sig_magic = false;
inst->sig_addr = 0;
- /* By moving ldvary to the previous instruction we make it update
- * r5 in the current one, so nothing else in it should write r5.
+ /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */
+ if (devinfo->ver >= 71) {
+ scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+ }
+
+ /* By moving ldvary to the previous instruction we make it update r5
+ * (rf0 for ver >= 71) in the current one, so nothing else in it
+ * should write this register.
+ *
* This should've been prevented by our dependency tracking, which
* would not allow ldvary to be paired up with an instruction that
- * writes r5 (since our dependency tracking doesn't know that the
- * ldvary write r5 happens in the next instruction).
+ * writes r5/rf0 (since our dependency tracking doesn't know that the
+ * ldvary write to r5/rf0 happens in the next instruction).
*/
- assert(!v3d_qpu_writes_r5(c->devinfo, inst));
+ assert(!v3d_qpu_writes_r5(devinfo, inst));
+ assert(devinfo->ver == 42 ||
+ (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
+ !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0)));
return true;
}
@@ -2102,6 +2743,9 @@ schedule_instructions(struct v3d_compile *c,
merge->inst->uniform;
}
+ chosen->inst->ldtmu_count +=
+ merge->inst->ldtmu_count;
+
if (debug) {
fprintf(stderr, "t=%4d: merging: ",
time);
@@ -2127,7 +2771,7 @@ schedule_instructions(struct v3d_compile *c,
}
}
}
- if (mux_read_stalls(scoreboard, inst))
+ if (read_stalls(c->devinfo, scoreboard, inst))
c->qpu_inst_stalled_count++;
}
@@ -2351,6 +2995,8 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
scoreboard.last_branch_tick = -10;
scoreboard.last_setmsf_tick = -10;
scoreboard.last_stallable_sfu_tick = -10;
+ scoreboard.first_ldtmu_after_thrsw = true;
+ scoreboard.last_implicit_rf0_write_tick = -10;
if (debug) {
fprintf(stderr, "Pre-schedule instructions\n");