author     Connor Abbott <cwabbott0@gmail.com>    2021-02-19 10:53:08 +0100
committer  Emma Anholt <emma@anholt.net>          2021-06-10 12:20:38 -0700
commit     58d82add87ede4dc6533f97f7e23e4ba09e1d242
tree       32e1eb91c084e9218b1e209b20611c7b1b6d65f7 /src/freedreno
parent     c0823a2d31c995395a8d2567b0c14793e8b569ca
ir3: Rewrite delay calculation
The old delay calculation relied on the SSA information staying around, and
wouldn't work once we start introducing phi nodes and making "normal" values
defined in multiple blocks not array regs anymore. What's worse, properly
inserting phi nodes when splitting live ranges would make that code even more
complicated, and this was the last place post-RA that actually needed that
information. The new version only compares the physical registers of sources
and destinations. It works by going backwards up to a maximum number of
cycles, so it might be slightly slower when the definition is closer but
should be faster when it is farther away.

To avoid complicating the new method, the old method is kept around, but only
for pre-RA scheduling, and it can therefore be drastically simplified since
the array case can be dropped. ir3_delay_calc() is split into a few variants
to avoid an explosion of boolean arguments in users, especially now that
mergedregs has to be passed to it.

The new method is a little more complicated when it comes to handling (rptN),
because both the assigner and the consumer may be (rptN). This adds some unit
tests for those cases, in addition to dropping the to-SSA code in the test
harness since it's no longer needed.

Finally, ir3_legalize has to be switched to using physical registers for the
branch condition. This was the one place where IR3_REG_SSA remained after RA.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9842>
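
For orientation, here is a rough, editorial C sketch of the backward walk
described above. It is not part of the patch: reg_overlaps() and delay_slots()
are hypothetical stand-ins for the real delay_calc_srcn_postra() and
ir3_delayslots() logic in the diff below, dest_regs() is the file-local helper
that appears there, and the (rptN) offset handling and predecessor-block
recursion are omitted.

    #include "ir3.h"

    /* Hypothetical helpers, standing in for the patch's real ones: */
    static bool reg_overlaps(struct ir3_register *dst, struct ir3_register *src);
    static unsigned delay_slots(struct ir3_instruction *assigner,
                                struct ir3_instruction *consumer, unsigned n);

    #define SKETCH_MAX_NOPS 6

    static unsigned
    sketch_delay(struct ir3_block *block, struct ir3_instruction *consumer)
    {
       unsigned distance = 0, delay = 0;

       /* Walk the already-emitted instructions from the end of the block
        * backwards (the consumer has not been appended yet):
        */
       foreach_instr_rev (assigner, &block->instr_list) {
          /* Beyond this distance nothing can still require a nop: */
          if (distance >= SKETCH_MAX_NOPS)
             break;

          if (dest_regs(assigner) != 0) {
             /* Physical-register compare: does this instruction write a
              * register that one of the consumer's sources reads?
              */
             foreach_src_n (src, n, consumer) {
                if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
                   continue;
                if (!reg_overlaps(assigner->regs[0], src))
                   continue;

                unsigned need = delay_slots(assigner, consumer, n + 1);
                if (need > distance)
                   delay = MAX2(delay, need - distance);
             }
          }

          /* A (rptN) instruction occupies N+1 cycles: */
          distance += 1 + assigner->repeat;
       }

       return delay;
    }

The actual delay_calc_postra() in the diff additionally counts (nopN) cycles,
skips meta instructions, handles the address register, applies the (rptN)
offset correction, and (for the exact variant) recurses into predecessor
blocks.
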
Diffstat (limited to 'src/freedreno')
-rw-r--r--  src/freedreno/ir3/ir3.h            |   7
-rw-r--r--  src/freedreno/ir3/ir3_delay.c      | 351
-rw-r--r--  src/freedreno/ir3/ir3_legalize.c   |   7
-rw-r--r--  src/freedreno/ir3/ir3_postsched.c  |  18
-rw-r--r--  src/freedreno/ir3/ir3_sched.c      |  11
-rw-r--r--  src/freedreno/ir3/tests/delay.c    |  80
6 files changed, 256 insertions, 218 deletions
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 4b804b8ccb5..f7d8dc1377f 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -1465,8 +1465,11 @@ void ir3_print_instr(struct ir3_instruction *instr);
/* delay calculation: */
int ir3_delayslots(struct ir3_instruction *assigner,
struct ir3_instruction *consumer, unsigned n, bool soft);
-unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
- bool soft, bool pred);
+unsigned ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr);
+unsigned ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
+ bool soft, bool mergedregs);
+unsigned ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
+ bool mergedregs);
void ir3_remove_nops(struct ir3 *ir);
/* dead code elimination: */
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c
index 8a76601e536..1d382b45a80 100644
--- a/src/freedreno/ir3/ir3_delay.c
+++ b/src/freedreno/ir3/ir3_delay.c
@@ -26,6 +26,23 @@
#include "ir3.h"
+/* The maximum number of nop's we may need to insert between two instructions.
+ */
+#define MAX_NOPS 6
+
+/* The soft delay for approximating the cost of (ss). On a6xx, the number of
+ * delay slots it takes to get an SFU result back (ie. using nop's instead of
+ * (ss)) is:
+ *
+ * 8 - single warp
+ * 9 - two warps
+ * 10 - four warps
+ *
+ * and so on. Not quite sure where it tapers out (ie. how many warps share an
+ * SFU unit). But 10 seems like a reasonable # to choose:
+ */
+#define SOFT_SS_NOPS 10
+
/*
* Helpers to figure out the necessary delay slots between instructions. Used
* both in scheduling pass(es) and the final pass to insert any required nop's
@@ -59,19 +76,8 @@ ir3_delayslots(struct ir3_instruction *assigner,
if (writes_addr0(assigner) || writes_addr1(assigner))
return 6;
- /* On a6xx, it takes the number of delay slots to get a SFU result
- * back (ie. using nop's instead of (ss) is:
- *
- * 8 - single warp
- * 9 - two warps
- * 10 - four warps
- *
- * and so on. Not quite sure where it tapers out (ie. how many
- * warps share an SFU unit). But 10 seems like a reasonable #
- * to choose:
- */
if (soft && is_sfu(assigner))
- return 10;
+ return SOFT_SS_NOPS;
/* handled via sync flags: */
if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
@@ -120,23 +126,9 @@ count_instruction(struct ir3_instruction *n)
return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
}
-/**
- * @block: the block to search in, starting from end; in first pass,
- * this will be the block the instruction would be inserted into
- * (but has not yet, ie. it only contains already scheduled
- * instructions). For intra-block scheduling (second pass), this
- * would be one of the predecessor blocks.
- * @instr: the instruction to search for
- * @maxd: max distance, bail after searching this # of instruction
- * slots, since it means the instruction we are looking for is
- * far enough away
- * @pred: if true, recursively search into predecessor blocks to
- * find the worst case (shortest) distance (only possible after
- * individual blocks are all scheduled)
- */
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
- unsigned maxd, bool pred)
+ unsigned maxd)
{
unsigned d = 0;
@@ -151,46 +143,20 @@ distance(struct ir3_block *block, struct ir3_instruction *instr,
d = MIN2(maxd, d + 1 + n->repeat + n->nop);
}
- /* if coming from a predecessor block, assume it is assigned far
- * enough away.. we'll fix up later.
- */
- if (!pred)
- return maxd;
-
- if (pred && (block->data != block)) {
- /* Search into predecessor blocks, finding the one with the
- * shortest distance, since that will be the worst case
- */
- unsigned min = maxd - d;
-
- /* (ab)use block->data to prevent recursion: */
- block->data = block;
-
- for (unsigned i = 0; i < block->predecessors_count; i++) {
- struct ir3_block *pred = block->predecessors[i];
- unsigned n;
-
- n = distance(pred, instr, min, pred);
-
- min = MIN2(min, n);
- }
-
- block->data = NULL;
- d += min;
- }
-
- return d;
+ return maxd;
}
-/* calculate delay for specified src: */
static unsigned
-delay_calc_srcn(struct ir3_block *block,
+delay_calc_srcn_prera(struct ir3_block *block,
struct ir3_instruction *assigner,
struct ir3_instruction *consumer,
- unsigned srcn, bool soft, bool pred)
+ unsigned srcn)
{
unsigned delay = 0;
+ if (assigner->opc == OPC_META_PHI)
+ return 0;
+
if (is_meta(assigner)) {
foreach_src_n (src, n, assigner) {
unsigned d;
@@ -198,7 +164,7 @@ delay_calc_srcn(struct ir3_block *block,
if (!src->def)
continue;
- d = delay_calc_srcn(block, src->def->instr, consumer, srcn, soft, pred);
+ d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn);
/* A (rptN) instruction executes in consecutive cycles so
* it's outputs are written in successive cycles. And
@@ -224,136 +190,235 @@ delay_calc_srcn(struct ir3_block *block,
delay = MAX2(delay, d);
}
} else {
- delay = ir3_delayslots(assigner, consumer, srcn, soft);
- delay -= distance(block, assigner, delay, pred);
+ delay = ir3_delayslots(assigner, consumer, srcn, false);
+ delay -= distance(block, assigner, delay);
}
return delay;
}
-static struct ir3_instruction *
-find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd)
+/**
+ * Calculate delay for instruction before register allocation, using SSA
+ * source pointers. This can't handle inter-block dependencies.
+ */
+unsigned
+ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr)
{
- unsigned d = 0;
+ unsigned delay = 0;
- /* Note that this relies on incrementally building up the block's
- * instruction list.. but this is how scheduling and nopsched
- * work.
- */
- foreach_instr_rev (n, &block->instr_list) {
- if (d >= maxd)
- return NULL;
- if (count_instruction(n))
- d++;
- if (dest_regs(n) == 0)
- continue;
+ foreach_src_n (src, i, instr) {
+ unsigned d = 0;
+
+ if (src->def && src->def->instr->block == block) {
+ d = delay_calc_srcn_prera(block, src->def->instr, instr, i+1);
+ }
- /* note that a dest reg will never be an immediate */
- if (n->regs[0]->array.id == array_id)
- return n;
+ delay = MAX2(delay, d);
}
- return NULL;
+ if (instr->address) {
+ unsigned d = delay_calc_srcn_prera(block, instr->address, instr, 0);
+ delay = MAX2(delay, d);
+ }
+
+ return delay;
}
-/* like list_length() but only counts instructions which count in the
- * delay determination:
+/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
+ * and have to handle relative accesses specially.
*/
+
static unsigned
-count_block_delay(struct ir3_block *block)
+post_ra_reg_elems(struct ir3_register *reg)
{
- unsigned delay = 0;
- foreach_instr (n, &block->instr_list) {
- if (!count_instruction(n))
- continue;
- delay++;
- }
- return delay;
+ if (reg->flags & IR3_REG_RELATIV)
+ return reg->size;
+ return reg_elems(reg);
}
static unsigned
-delay_calc_array(struct ir3_block *block, unsigned array_id,
- struct ir3_instruction *consumer, unsigned srcn,
- bool soft, bool pred, unsigned maxd)
+post_ra_reg_num(struct ir3_register *reg)
{
- struct ir3_instruction *assigner;
+ if (reg->flags & IR3_REG_RELATIV)
+ return reg->array.base;
+ return reg->num;
+}
- assigner = find_array_write(block, array_id, maxd);
- if (assigner)
- return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred);
+static unsigned
+delay_calc_srcn_postra(struct ir3_instruction *assigner, struct ir3_instruction *consumer,
+ unsigned n, bool soft, bool mergedregs)
+{
+ struct ir3_register *src = consumer->regs[n];
+ struct ir3_register *dst = assigner->regs[0];
+ bool mismatched_half =
+ (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
- if (!pred)
+ if (!mergedregs && mismatched_half)
return 0;
- unsigned len = count_block_delay(block);
- if (maxd <= len)
+ unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
+ unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
+ unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
+ unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
+
+ if (dst_start >= src_end || src_start >= dst_end)
return 0;
- maxd -= len;
+ unsigned delay = ir3_delayslots(assigner, consumer, n, soft);
- if (block->data == block) {
- /* we have a loop, return worst case: */
- return maxd;
- }
+ if (assigner->repeat == 0 && consumer->repeat == 0)
+ return delay;
+
+ /* If either side is a relative access, we can't really apply most of the
+ * reasoning below because we don't know which component aliases which.
+ * Just bail in this case.
+ */
+ if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
+ return delay;
- /* If we need to search into predecessors, find the one with the
- * max delay.. the resulting delay is that minus the number of
- * counted instructions in this block:
+ /* TODO: Handle the combination of (rpt) and different component sizes
+ * better like below. This complicates things significantly because the
+ * components don't line up.
*/
- unsigned max = 0;
+ if (mismatched_half)
+ return delay;
- /* (ab)use block->data to prevent recursion: */
- block->data = block;
+ /* If an instruction has a (rpt), then it acts as a sequence of
+ * instructions, reading its non-(r) sources at each cycle. First, get the
+ * register num for the first instruction where they interfere:
+ */
+
+ unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);
- for (unsigned i = 0; i < block->predecessors_count; i++) {
- struct ir3_block *pred = block->predecessors[i];
- unsigned delay =
- delay_calc_array(pred, array_id, consumer, srcn, soft, pred, maxd);
+ /* Now, for that first conflicting half/full register, figure out the
+ * sub-instruction within assigner/consumer it corresponds to. For (r)
+ * sources, this should already return the correct answer of 0.
+ */
+ unsigned first_src_instr = first_num - src->num;
+ unsigned first_dst_instr = first_num - dst->num;
+
+ /* The delay we return is relative to the *end* of assigner and the
+ * *beginning* of consumer, because it's the number of nops (or other
+ * things) needed between them. Any instructions after first_dst_instr
+ * subtract from the delay, and so do any instructions before
+ * first_src_instr. Calculate an offset to subtract from the non-rpt-aware
+ * delay to account for that.
+ *
+ * Now, a priori, we need to go through this process for every
+ * conflicting regnum and take the minimum of the offsets to make sure
+ * that the appropriate number of nop's is inserted for every conflicting
+ * pair of sub-instructions. However, as we go to the next conflicting
+ * regnum (if any), the number of instructions after first_dst_instr
+ * decreases by 1 and the number of source instructions before
+ * first_src_instr correspondingly increases by 1, so the offset stays the
+ * same for all conflicting registers.
+ */
+ unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
+ return offset > delay ? 0 : delay - offset;
+}
+
+static unsigned
+delay_calc_postra(struct ir3_block *block,
+ struct ir3_instruction *start,
+ struct ir3_instruction *consumer,
+ unsigned distance, bool soft, bool pred, bool mergedregs)
+{
+ unsigned delay = 0;
+ /* Search backwards starting at the instruction before start, unless it's
+ * NULL then search backwards from the block end.
+ */
+ struct list_head *start_list = start ? start->node.prev : block->instr_list.prev;
+ list_for_each_entry_from_rev(struct ir3_instruction, assigner, start_list, &block->instr_list, node) {
+ if (count_instruction(assigner))
+ distance += assigner->nop;
- max = MAX2(max, delay);
+ if (distance + delay >= (soft ? SOFT_SS_NOPS : MAX_NOPS))
+ return delay;
+
+ if (is_meta(assigner))
+ continue;
+
+ unsigned new_delay = 0;
+
+ if (consumer->address == assigner) {
+ unsigned addr_delay = ir3_delayslots(assigner, consumer, 0, soft);
+ new_delay = MAX2(new_delay, addr_delay);
+ }
+
+ if (dest_regs(assigner) != 0) {
+ foreach_src_n (src, n, consumer) {
+ if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
+ continue;
+
+ unsigned src_delay = delay_calc_srcn_postra(assigner, consumer, n+1, soft, mergedregs);
+ new_delay = MAX2(new_delay, src_delay);
+ }
+ }
+
+ new_delay = new_delay > distance ? new_delay - distance : 0;
+ delay = MAX2(delay, new_delay);
+
+ if (count_instruction(assigner))
+ distance += 1 + assigner->repeat;
}
- block->data = NULL;
+ /* Note: this allows recursion into "block" if it has already been
+ * visited, but *not* recursion into its predecessors. We may have to
+ * visit the original block twice, for the loop case where we have to
+ * consider definitions in an earlier iteration of the same loop:
+ *
+ * while (...) {
+ * mov.u32u32 ..., r0.x
+ * ...
+ * mov.u32u32 r0.x, ...
+ * }
+ *
+ * However any other recursion would be unnecessary.
+ */
+
+ if (pred && block->data != block) {
+ block->data = block;
- if (max < len)
- return 0;
+ for (unsigned i = 0; i < block->predecessors_count; i++) {
+ struct ir3_block *pred = block->predecessors[i];
+ unsigned pred_delay =
+ delay_calc_postra(pred, NULL, consumer, distance, soft, pred, mergedregs);
+ delay = MAX2(delay, pred_delay);
+ }
- return max - len;
+ block->data = NULL;
+ }
+
+ return delay;
}
/**
- * Calculate delay for instruction (maximum of delay for all srcs):
+ * Calculate delay for post-RA scheduling based on physical registers but not
+ * exact (i.e. don't recurse into predecessors, and make it possible to
+ * estimate impact of sync flags).
*
* @soft: If true, add additional delay for situations where they
* would not be strictly required because a sync flag would be
* used (but scheduler would prefer to schedule some other
* instructions first to avoid stalling on sync flag)
- * @pred: If true, recurse into predecessor blocks
+ * @mergedregs: True if mergedregs is enabled.
*/
unsigned
-ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
- bool soft, bool pred)
+ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
+ bool soft, bool mergedregs)
{
- unsigned delay = 0;
-
- foreach_src_n (src, i, instr) {
- unsigned d = 0;
-
- if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) {
- d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6);
- } else if (src->def) {
- d = delay_calc_srcn(block, src->def->instr, instr, i+1, soft, pred);
- }
-
- delay = MAX2(delay, d);
- }
-
- if (instr->address) {
- unsigned d = delay_calc_srcn(block, instr->address, instr, 0, soft, pred);
- delay = MAX2(delay, d);
- }
+ return delay_calc_postra(block, NULL, instr, 0, soft, false, mergedregs);
+}
- return delay;
+/**
+ * Calculate delay for nop insertion. This must exactly match hardware
+ * requirements, including recursing into predecessor blocks.
+ */
+unsigned
+ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
+ bool mergedregs)
+{
+ return delay_calc_postra(block, NULL, instr, 0, false, true, mergedregs);
}
/**
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index caeb4456b8a..e48fd8be2da 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -718,7 +718,7 @@ kill_sched(struct ir3 *ir, struct ir3_shader_variant *so)
/* Insert nop's required to make this a legal/valid shader program: */
static void
-nop_sched(struct ir3 *ir)
+nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
{
foreach_block (block, &ir->block_list) {
struct ir3_instruction *last = NULL;
@@ -731,7 +731,8 @@ nop_sched(struct ir3 *ir)
list_inithead(&block->instr_list);
foreach_instr_safe (instr, &instr_list) {
- unsigned delay = ir3_delay_calc(block, instr, false, true);
+ unsigned delay =
+ ir3_delay_calc_exact(block, instr, so->mergedregs);
/* NOTE: I think the nopN encoding works for a5xx and
* probably a4xx, but not a3xx. So far only tested on
@@ -827,7 +828,7 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
progress |= apply_fine_deriv_macro(ctx, block);
}
- nop_sched(ir);
+ nop_sched(ir, so);
do {
ir3_count_instructions(ir);
diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c
index c59c3b94e93..d02926fa711 100644
--- a/src/freedreno/ir3/ir3_postsched.c
+++ b/src/freedreno/ir3/ir3_postsched.c
@@ -192,7 +192,8 @@ choose_instr(struct ir3_postsched_ctx *ctx)
/* Next prioritize discards: */
foreach_sched_node (n, &ctx->dag->heads) {
- unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+ unsigned d =
+ ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
if (d > 0)
continue;
@@ -211,7 +212,8 @@ choose_instr(struct ir3_postsched_ctx *ctx)
/* Next prioritize expensive instructions: */
foreach_sched_node (n, &ctx->dag->heads) {
- unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+ unsigned d =
+ ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
if (d > 0)
continue;
@@ -241,7 +243,8 @@ choose_instr(struct ir3_postsched_ctx *ctx)
if (would_sync(ctx, n->instr))
continue;
- unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);
+ unsigned d =
+ ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);
if (d > delay)
continue;
@@ -262,7 +265,8 @@ choose_instr(struct ir3_postsched_ctx *ctx)
* while we wait)
*/
foreach_sched_node (n, &ctx->dag->heads) {
- unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);
+ unsigned d =
+ ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);
if (d > 0)
continue;
@@ -281,7 +285,8 @@ choose_instr(struct ir3_postsched_ctx *ctx)
* stalls.. but we've already decided there is not a better option.
*/
foreach_sched_node (n, &ctx->dag->heads) {
- unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+ unsigned d =
+ ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
if (d > 0)
continue;
@@ -649,7 +654,8 @@ sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
while (!list_is_empty(&ctx->unscheduled_list)) {
struct ir3_instruction *instr = choose_instr(ctx);
- unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
+ unsigned delay =
+ ir3_delay_calc_postra(ctx->block, instr, false, ctx->v->mergedregs);
d("delay=%u", delay);
/* and if we run out of instructions that can be scheduled,
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index d4969188732..22d1f887e3e 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -567,7 +567,8 @@ choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
if (defer && should_defer(ctx, n->instr))
continue;
- unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+ /* Note: mergedregs is only used post-RA, just set it to false */
+ unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
if (d > 0)
continue;
@@ -620,7 +621,7 @@ choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
if (defer && should_defer(ctx, n->instr))
continue;
- unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+ unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
if (d > 0)
continue;
@@ -688,7 +689,7 @@ choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
if (defer && should_defer(ctx, n->instr))
continue;
- unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+ unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
if (d > 0)
continue;
@@ -769,7 +770,7 @@ dump_state(struct ir3_sched_ctx *ctx)
foreach_sched_node (n, &ctx->dag->heads) {
di(n->instr, "maxdel=%3d le=%d del=%u ",
n->max_delay, live_effect(n->instr),
- ir3_delay_calc(ctx->block, n->instr, false, false));
+ ir3_delay_calc_prera(ctx->block, n->instr));
util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
@@ -1132,7 +1133,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
instr = choose_instr(ctx, &notes);
if (instr) {
- unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
+ unsigned delay = ir3_delay_calc_prera(ctx->block, instr);
d("delay=%u", delay);
/* and if we run out of instructions that can be scheduled,
diff --git a/src/freedreno/ir3/tests/delay.c b/src/freedreno/ir3/tests/delay.c
index d1cff80f27c..2da619dcca7 100644
--- a/src/freedreno/ir3/tests/delay.c
+++ b/src/freedreno/ir3/tests/delay.c
@@ -80,9 +80,17 @@ static const struct test {
(rpt2)mov.f32f32 r0.x, (r)c0.x
add.f r0.x, r0.x, r0.y
),
+ TEST(2,
+ (rpt1)mov.f32f32 r0.x, (r)c0.x
+ (rpt1)add.f r0.x, (r)r0.x, c0.x
+ ),
TEST(1,
- (rpt2)mov.f32f32 r0.x, (r)c0.x
- (rpt2)add.f r0.x, (r)r0.x, c0.x
+ (rpt1)mov.f32f32 r0.y, (r)c0.x
+ (rpt1)add.f r0.x, (r)r0.x, c0.x
+ ),
+ TEST(3,
+ (rpt1)mov.f32f32 r0.x, (r)c0.x
+ (rpt1)add.f r0.x, (r)r0.y, c0.x
),
};
@@ -101,75 +109,29 @@ parse_asm(struct ir3_compiler *c, const char *asmstr)
return shader;
}
-static unsigned
-regn(struct ir3_register *reg)
-{
- unsigned regn = reg->num;
- if (reg->flags & IR3_REG_HALF)
- regn += MAX_REG;
- return regn;
-}
-
/**
- * Super-cheezy into-ssa pass, doesn't handle flow control or anything
- * hard. Just enough to figure out the SSA srcs of the last instruction.
+ * ir3_delay_calc_* relies on the src/dst wrmask being correct even for ALU
+ * instructions, so this sets it here.
*
* Note that this is not clever enough to know how many src/dst there are
* for various tex/mem instructions. But the rules for tex consuming alu
* are the same as sfu consuming alu.
*/
static void
-regs_to_ssa(struct ir3 *ir)
+fixup_wrmask(struct ir3 *ir)
{
- struct ir3_instruction *regfile[2 * MAX_REG] = {};
struct ir3_block *block = ir3_start_block(ir);
foreach_instr_safe (instr, &block->instr_list) {
+ instr->regs[0]->wrmask = MASK(instr->repeat + 1);
foreach_src (reg, instr) {
if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
continue;
- struct ir3_instruction *src = regfile[regn(reg)];
-
- if (!src)
- continue;
-
- if (reg->flags & IR3_REG_R) {
- unsigned nsrc = 1 + instr->repeat;
- unsigned flags = src->regs[0]->flags & IR3_REG_HALF;
- struct ir3_instruction *collect =
- ir3_instr_create(block, OPC_META_COLLECT, 1 + nsrc);
- __ssa_dst(collect)->flags |= flags;
- for (unsigned i = 0; i < nsrc; i++)
- __ssa_src(collect, regfile[regn(reg) + i], flags);
-
- ir3_instr_move_before(collect, instr);
-
- src = collect;
- }
-
- reg->def = src->regs[0];
- reg->flags |= IR3_REG_SSA;
- }
-
- if (instr->repeat) {
- unsigned ndst = 1 + instr->repeat;
- unsigned flags = instr->regs[0]->flags & IR3_REG_HALF;
-
- for (unsigned i = 0; i < ndst; i++) {
- struct ir3_instruction *split =
- ir3_instr_create(block, OPC_META_SPLIT, 2);
- __ssa_dst(split)->flags |= flags;
- __ssa_src(split, instr, flags);
- split->split.off = i;
-
- ir3_instr_move_after(split, instr);
-
- regfile[regn(instr->regs[0]) + i] = split;
- }
- } else {
- instr->regs[0]->instr = instr;
- regfile[regn(instr->regs[0])] = instr;
+ if (reg->flags & IR3_REG_R)
+ reg->wrmask = MASK(instr->repeat + 1);
+ else
+ reg->wrmask = 1;
}
}
}
@@ -188,9 +150,9 @@ main(int argc, char **argv)
struct ir3_shader *shader = parse_asm(c, test->asmstr);
struct ir3 *ir = shader->variants->ir;
- regs_to_ssa(ir);
+ fixup_wrmask(ir);
- ir3_debug_print(ir, "AFTER REGS->SSA");
+ ir3_debug_print(ir, "AFTER fixup_wrmask");
struct ir3_block *block =
list_first_entry(&ir->block_list, struct ir3_block, node);
@@ -209,7 +171,7 @@ main(int argc, char **argv)
*/
list_delinit(&last->node);
- unsigned n = ir3_delay_calc(block, last, false, false);
+ unsigned n = ir3_delay_calc_exact(block, last, true);
if (n != test->expected_delay) {
printf("%d: FAIL: Expected delay %u, but got %u, for:\n%s\n",