author    Connor Abbott <cwabbott0@gmail.com>    2020-11-11 15:31:09 +0100
committer Marge Bot <eric+marge@anholt.net>      2021-04-14 17:33:58 +0000
commit    2deead184cfbd84a617f64bffcdd7dcaaf2bd6f1 (patch)
tree      02077f372d2fb5f3c6b5d9f65319063fbcd55e21
parent    7821e5a3f8d593e1e9738924f5f4dc5996583518 (diff)
ir3/sched: Don't schedule too many tex/SFU instructions
Consider a simple loop that does a series of texture instructions and
then reduces the results:

    vec4 sum = vec4(0);
    for (int i = 0; i < N; i++) {
        sum += texture(...);
    }

Assume that the loop is unrolled and we schedule the resulting basic
block. Right now, after we schedule the first texture instruction, the
only instructions available to schedule that don't incur a sync are the
instructions that set up the second texture instruction. So we keep
picking the texture instructions, no matter how large N is, resulting
in a pathological schedule for register pressure when N is very large:

    sum1 = texture(...);
    sum2 = texture(...);
    sum3 = texture(...);
    ...
    sum = sum1 + sum2 + sum3 + ...;

In particular this happens with some CTS tests for VK_EXT_robustness2,
where a loop like that with many iterations is marked as [[unroll]],
forcing NIR to unroll it.

This solution is a balance between the current approach and always
scheduling for register pressure (and ignoring syncs). We only allow a
certain number of texture fetches to be in flight before considering
further textures to "sync", even though they don't really, both because
they likely *will* sync in reality (overflowing the internal queue of
waiting texture instructions) and because at some point we need the
normal algorithm to kick in and start lowering register pressure.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7571>
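To illustrate the in-flight cap described above, here is a minimal C
sketch. The identifiers and the standalone helper are assumptions for
the illustration, not the exact ir3 bookkeeping; only the subtraction
against a fixed cap of 8 mirrors the patch below.

    /* Hypothetical sketch: one counter bumped when a tex is scheduled,
     * another advanced as results are consumed; defer a new fetch once
     * the gap between them reaches the cap. */
    #define MAX_OUTSTANDING_TEX 8

    struct sched_counters {
       unsigned tex_index;                   /* bumped per scheduled tex */
       unsigned first_outstanding_tex_index; /* bumped as results are used */
    };

    /* True when MAX_OUTSTANDING_TEX fetches are already in flight, so
     * the normal register-pressure heuristics get a chance to run. */
    static bool
    defer_tex(const struct sched_counters *c)
    {
       return c->tex_index - c->first_outstanding_tex_index >=
              MAX_OUTSTANDING_TEX;
    }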
-rw-r--r--  src/freedreno/ir3/ir3_sched.c | 41
1 file changed, 27 insertions(+), 14 deletions(-)
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index 46678fe75de..51b39dc1ee9 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -496,10 +496,11 @@ live_effect(struct ir3_instruction *instr)
/* Determine if this is an instruction that we'd prefer not to schedule
* yet, in order to avoid an (ss)/(sy) sync. This is limited by the
* sfu_delay/tex_delay counters, ie. the more cycles it has been since
- * the last SFU/tex, the less costly a sync would be.
+ * the last SFU/tex, the less costly a sync would be, and by the number of
+ * outstanding SFU/tex instructions, to prevent a blowup in register pressure.
*/
static bool
-would_sync(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+should_defer(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
if (ctx->sfu_delay) {
if (sched_check_src_cond(instr, is_outstanding_sfu, ctx))
@@ -516,12 +517,24 @@ would_sync(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
return true;
}
+ /* Avoid scheduling too many outstanding texture or sfu instructions at
+ * once by deferring further tex/SFU instructions. This both prevents
+ * stalls when the queue of texture/sfu instructions becomes too large,
+ * and prevents unacceptably large increases in register pressure from too
+ * many outstanding texture instructions.
+ */
+ if (ctx->tex_index - ctx->first_outstanding_tex_index >= 8 && is_tex(instr))
+ return true;
+
+ if (ctx->sfu_index - ctx->first_outstanding_sfu_index >= 8 && is_sfu(instr))
+ return true;
+
return false;
}
static struct ir3_sched_node *
choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
- bool avoid_sync, bool avoid_output);
+ bool defer, bool avoid_output);
/**
* Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
@@ -532,14 +545,14 @@ choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
*/
static struct ir3_sched_node *
choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
- bool avoid_sync)
+ bool defer)
{
- const char *mode = avoid_sync ? "-as" : "";
+ const char *mode = defer ? "-d" : "";
struct ir3_sched_node *chosen = NULL;
/* Find a ready inst with regs freed and pick the one with max cost. */
foreach_sched_node (n, &ctx->dag->heads) {
- if (avoid_sync && would_sync(ctx, n->instr))
+ if (defer && should_defer(ctx, n->instr))
continue;
unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
@@ -565,7 +578,7 @@ choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
/* Find a leader with regs freed and pick the one with max cost. */
foreach_sched_node (n, &ctx->dag->heads) {
- if (avoid_sync && would_sync(ctx, n->instr))
+ if (defer && should_defer(ctx, n->instr))
continue;
if (live_effect(n->instr) > -1)
@@ -592,7 +605,7 @@ choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
* XXX: Should this prioritize ready?
*/
foreach_sched_node (n, &ctx->dag->heads) {
- if (avoid_sync && would_sync(ctx, n->instr))
+ if (defer && should_defer(ctx, n->instr))
continue;
unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
@@ -616,7 +629,7 @@ choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
}
foreach_sched_node (n, &ctx->dag->heads) {
- if (avoid_sync && would_sync(ctx, n->instr))
+ if (defer && should_defer(ctx, n->instr))
continue;
if (live_effect(n->instr) > 0)
@@ -634,7 +647,7 @@ choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
return chosen;
}
- return choose_instr_inc(ctx, notes, avoid_sync, true);
+ return choose_instr_inc(ctx, notes, defer, true);
}
/**
@@ -643,9 +656,9 @@ choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
*/
static struct ir3_sched_node *
choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
- bool avoid_sync, bool avoid_output)
+ bool defer, bool avoid_output)
{
- const char *mode = avoid_sync ? "-as" : "";
+ const char *mode = defer ? "-d" : "";
struct ir3_sched_node *chosen = NULL;
/*
@@ -660,7 +673,7 @@ choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
if (avoid_output && n->output)
continue;
- if (avoid_sync && would_sync(ctx, n->instr))
+ if (defer && should_defer(ctx, n->instr))
continue;
unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
@@ -689,7 +702,7 @@ choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
if (avoid_output && n->output)
continue;
- if (avoid_sync && would_sync(ctx, n->instr))
+ if (defer && should_defer(ctx, n->instr))
continue;
if (!check_instr(ctx, notes, n->instr))
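
For context on how the renamed defer flag is meant to be used, here is a
hedged sketch of the two-pass selection pattern the signatures above
imply. The wrapper function and its name are assumptions, not code
quoted from ir3_sched.c:

    /* Sketch of the two-pass pattern implied by the defer parameter:
     * first prefer instructions that neither sync nor grow the tex/SFU
     * queues, then relax the restriction if nothing else is ready. */
    static struct ir3_sched_node *
    pick_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
    {
       struct ir3_sched_node *chosen = choose_instr_dec(ctx, notes, true);
       if (chosen)
          return chosen;

       /* Nothing schedulable without a sync or queue overflow: allow it. */
       return choose_instr_dec(ctx, notes, false);
    }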