Diffstat (limited to 'src/freedreno/ir3/ir3_legalize.c')
-rw-r--r--  src/freedreno/ir3/ir3_legalize.c  1008
1 file changed, 836 insertions, 172 deletions
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index 8d56efccfc6..b3c34ba5f3b 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -39,7 +39,7 @@
* 1) Iteratively determine where sync ((sy)/(ss)) flags are needed,
* based on state flowing out of predecessor blocks until there is
* no further change. In some cases this requires inserting nops.
- * 2) Mark (ei) on last varying input, and (ul) on last use of a0.x
+ * 2) Mark (ei) on last varying input
* 3) Final nop scheduling for instruction latency
* 4) Resolve jumps and schedule blocks, marking potential convergence
* points with (jp)
@@ -51,19 +51,239 @@ struct ir3_legalize_ctx {
gl_shader_stage type;
int max_bary;
bool early_input_release;
+ bool has_inputs;
+};
+
+struct ir3_nop_state {
+ unsigned full_ready[4 * 48];
+ unsigned half_ready[4 * 48];
};
struct ir3_legalize_state {
regmask_t needs_ss;
regmask_t needs_ss_war; /* write after read */
regmask_t needs_sy;
+ bool needs_ss_for_const;
+
+ /* Each of these arrays contains the cycle when the corresponding register
+ * becomes "ready" i.e. does not require any more nops. There is a special
+ * mechanism to let ALU instructions read compatible (i.e. same halfness)
+ * destinations of another ALU instruction with less delay, so this can
+ * depend on what type the consuming instruction is, which is why there are
+ * multiple arrays. The cycle is counted relative to the start of the block.
+ */
+
+ /* When ALU instructions reading the given full/half register will be ready.
+ */
+ struct ir3_nop_state alu_nop;
+
+ /* When non-ALU (e.g. cat5) instructions reading the given full/half register
+ * will be ready.
+ */
+ struct ir3_nop_state non_alu_nop;
+
+ /* When p0.x-w, a0.x, and a1.x are ready. */
+ unsigned pred_ready[4];
+ unsigned addr_ready[2];
};
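
As an illustrative aside on the bookkeeping above: each slot simply records the block-relative cycle at which its register may be consumed again, and the nop count for a consumer falls out of a clamp against the current cycle. A minimal standalone sketch with invented numbers (not ir3 API; the 3- and 6-cycle figures are the values used later in delay_update()):

#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
   /* A hypothetical ALU write to r0.x issued at cycle 10: same-size ALU
    * consumers can read it 3 cycles later, everything else 6 cycles later. */
   unsigned write_cycle = 10;
   unsigned alu_ready = write_cycle + 3;      /* 13 */
   unsigned non_alu_ready = write_cycle + 6;  /* 16 */

   /* A consumer that would issue at cycle 14 needs this many nops: */
   unsigned cycle = 14;
   printf("alu consumer: %u nops, non-alu consumer: %u nops\n",
          MAX2(alu_ready, cycle) - cycle,       /* 0 */
          MAX2(non_alu_ready, cycle) - cycle);  /* 2 */
   return 0;
}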
struct ir3_legalize_block_data {
bool valid;
+ struct ir3_legalize_state begin_state;
struct ir3_legalize_state state;
};
+static inline void
+apply_ss(struct ir3_instruction *instr,
+ struct ir3_legalize_state *state,
+ bool mergedregs)
+{
+ instr->flags |= IR3_INSTR_SS;
+ regmask_init(&state->needs_ss_war, mergedregs);
+ regmask_init(&state->needs_ss, mergedregs);
+ state->needs_ss_for_const = false;
+}
+
+static inline void
+apply_sy(struct ir3_instruction *instr,
+ struct ir3_legalize_state *state,
+ bool mergedregs)
+{
+ instr->flags |= IR3_INSTR_SY;
+ regmask_init(&state->needs_sy, mergedregs);
+}
+
+static bool
+count_instruction(struct ir3_instruction *n)
+{
+ /* NOTE: don't count branch/jump since we don't know yet if they will
+ * be eliminated later in resolve_jumps().. really should do that
+ * earlier so we don't have this constraint.
+ */
+ return is_alu(n) ||
+ (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
+ (n->opc != OPC_BRAA) && (n->opc != OPC_BRAO));
+}
+
+static unsigned *
+get_ready_slot(struct ir3_legalize_state *state,
+ struct ir3_register *reg, unsigned num,
+ bool consumer_alu, bool matching_size)
+{
+ if (reg->flags & IR3_REG_PREDICATE) {
+ assert(num == reg->num);
+ assert(reg_num(reg) == REG_P0);
+ return &state->pred_ready[reg_comp(reg)];
+ }
+ if (reg->num == regid(REG_A0, 0))
+ return &state->addr_ready[0];
+ if (reg->num == regid(REG_A0, 1))
+ return &state->addr_ready[1];
+ struct ir3_nop_state *nop =
+ consumer_alu ? &state->alu_nop : &state->non_alu_nop;
+ assert(!(reg->flags & IR3_REG_SHARED));
+ if (reg->flags & IR3_REG_HALF) {
+ if (matching_size)
+ return &nop->half_ready[num];
+ else
+ return &nop->full_ready[num / 2];
+ } else {
+ if (matching_size)
+ return &nop->full_ready[num];
+ /* If "num" is large enough, then it can't alias a half-reg because only
+ * the first half of the full reg space aliases half regs. Return NULL in
+ * this case.
+ */
+ else if (num * 2 < ARRAY_SIZE(nop->half_ready))
+ return &nop->half_ready[num * 2];
+ else
+ return NULL;
+ }
+}
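
A short aside on the half/full aliasing handled above, assuming the merged-register layout where hr(2n) and hr(2n+1) occupy the two halves of r(n); the helper names below are invented for illustration, not part of the patch:

#include <stdio.h>

/* Toy model of the index mapping used by get_ready_slot(): a full register
 * rN overlaps half registers hr(2N) and hr(2N+1), so a write through one
 * view has to be visible through the other view's ready slot too. */
static unsigned half_index_for_full(unsigned full_num) { return full_num * 2; }
static unsigned full_index_for_half(unsigned half_num) { return half_num / 2; }

int main(void)
{
   printf("r5 overlaps hr%u..hr%u\n",
          half_index_for_full(5), half_index_for_full(5) + 1); /* hr10..hr11 */
   printf("hr11 overlaps r%u\n", full_index_for_half(11));     /* r5 */
   return 0;
}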
+
+static unsigned
+delay_calc(struct ir3_legalize_state *state,
+ struct ir3_instruction *instr,
+ unsigned cycle)
+{
+ /* As far as we know, shader outputs don't need any delay. */
+ if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
+ return 0;
+
+ unsigned delay = 0;
+ foreach_src_n (src, n, instr) {
+ if (src->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED))
+ continue;
+
+ unsigned elems = post_ra_reg_elems(src);
+ unsigned num = post_ra_reg_num(src);
+ unsigned src_cycle = cycle;
+
+ /* gat and swz have scalar sources and each source is read in a
+ * subsequent cycle.
+ */
+ if (instr->opc == OPC_GAT || instr->opc == OPC_SWZ)
+ src_cycle += n;
+
+ /* cat3 instructions consume their last source two cycles later, so they
+ * only need a delay of 1.
+ */
+ if ((is_mad(instr->opc) || is_madsh(instr->opc)) && n == 2)
+ src_cycle += 2;
+
+ for (unsigned elem = 0; elem < elems; elem++, num++) {
+ unsigned ready_cycle =
+ *get_ready_slot(state, src, num, is_alu(instr), true);
+ delay = MAX2(delay, MAX2(ready_cycle, src_cycle) - src_cycle);
+
+ /* Increment cycle for ALU instructions with (rptN) where sources are
+ * read each subsequent cycle.
+ */
+ if (instr->repeat && !(src->flags & IR3_REG_RELATIV))
+ src_cycle++;
+ }
+ }
+
+ return delay;
+}
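
To make the cat3 special case concrete, a hedged standalone sketch with made-up ready cycles (not the real IR): the third source of a mad is consumed two cycles after issue, so its ready cycle is measured against cycle+2 and contributes correspondingly less delay.

#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
   unsigned cycle = 20;
   /* ready cycles for the three sources of a hypothetical mad */
   unsigned ready[3] = {21, 20, 23};
   unsigned delay = 0;

   for (unsigned n = 0; n < 3; n++) {
      /* the third source of a cat3 is read two cycles after issue */
      unsigned src_cycle = cycle + (n == 2 ? 2 : 0);
      delay = MAX2(delay, MAX2(ready[n], src_cycle) - src_cycle);
   }

   printf("delay = %u\n", delay); /* 1: sources 0 and 2 each need one cycle */
   return 0;
}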
+
+static void
+delay_update(struct ir3_legalize_state *state,
+ struct ir3_instruction *instr,
+ unsigned cycle,
+ bool mergedregs)
+{
+ foreach_dst_n (dst, n, instr) {
+ unsigned elems = post_ra_reg_elems(dst);
+ unsigned num = post_ra_reg_num(dst);
+ unsigned dst_cycle = cycle;
+
+ /* sct and swz have scalar destinations and each destination is written in
+ * a subsequent cycle.
+ */
+ if (instr->opc == OPC_SCT || instr->opc == OPC_SWZ)
+ dst_cycle += n;
+
+ /* For relative accesses with (rptN), we have no way of knowing which
+ * component is accessed when, so we have to assume the worst and mark
+ * every array member as being written at the end.
+ */
+ if (dst->flags & IR3_REG_RELATIV)
+ dst_cycle += instr->repeat;
+
+ if (dst->flags & IR3_REG_SHARED)
+ continue;
+
+ for (unsigned elem = 0; elem < elems; elem++, num++) {
+ for (unsigned consumer_alu = 0; consumer_alu < 2; consumer_alu++) {
+ for (unsigned matching_size = 0; matching_size < 2; matching_size++) {
+ unsigned *ready_slot =
+ get_ready_slot(state, dst, num, consumer_alu, matching_size);
+
+ if (!ready_slot)
+ continue;
+
+ bool reset_ready_slot = false;
+ unsigned delay = 0;
+ if (!is_alu(instr)) {
+ /* Apparently writes that require (ss) or (sy) are
+ * synchronized against previous writes, so consumers don't
+ * have to wait for any previous overlapping ALU instructions
+ * to complete.
+ */
+ reset_ready_slot = true;
+ } else if ((dst->flags & IR3_REG_PREDICATE) ||
+ reg_num(dst) == REG_A0) {
+ delay = 6;
+ if (!matching_size)
+ continue;
+ } else {
+ delay = (consumer_alu && matching_size) ? 3 : 6;
+ }
+
+ if (!matching_size) {
+ for (unsigned i = 0; i < reg_elem_size(dst); i++) {
+ ready_slot[i] =
+ reset_ready_slot ? 0 :
+ MAX2(ready_slot[i], dst_cycle + delay);
+ }
+ } else {
+ *ready_slot =
+ reset_ready_slot ? 0 :
+ MAX2(*ready_slot, dst_cycle + delay);
+ }
+ }
+ }
+
+ /* Increment cycle for ALU instructions with (rptN) where destinations
+ * are written each subsequent cycle.
+ */
+ if (instr->repeat && !(dst->flags & IR3_REG_RELATIV))
+ dst_cycle++;
+ }
+ }
+}
+
/* We want to evaluate each block from the position of any other
* predecessor block, in order that the flags set are the union of
* all possible program paths.
@@ -87,16 +307,23 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
if (bd->valid)
return false;
- struct ir3_instruction *last_rel = NULL;
struct ir3_instruction *last_n = NULL;
struct list_head instr_list;
struct ir3_legalize_state prev_state = bd->state;
- struct ir3_legalize_state *state = &bd->state;
+ struct ir3_legalize_state *state = &bd->begin_state;
bool last_input_needs_ss = false;
bool has_tex_prefetch = false;
bool mergedregs = ctx->so->mergedregs;
- /* our input state is the OR of all predecessor blocks' state: */
+ /* Our input state is the OR of all predecessor blocks' state.
+ *
+ * Why don't we just zero the state at the beginning before merging in the
+ * predecessors? Because otherwise updates may not be a "lattice refinement",
+ * i.e. needs_ss may go from true to false for some register due to a (ss) we
+ * inserted the second time around (and the same for (sy)). This means that
+ * there's no solid guarantee the algorithm will converge, and in theory
+ * there may be infinite loops where we fight over the placement of an (ss).
+ */
for (unsigned i = 0; i < block->predecessors_count; i++) {
struct ir3_block *predecessor = block->predecessors[i];
struct ir3_legalize_block_data *pbd = predecessor->data;
@@ -109,8 +336,38 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
regmask_or(&state->needs_ss_war, &state->needs_ss_war,
&pstate->needs_ss_war);
regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
+ state->needs_ss_for_const |= pstate->needs_ss_for_const;
+
+ /* Our nop state is the max of the predecessor blocks */
+ for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
+ state->pred_ready[i] = MAX2(state->pred_ready[i],
+ pstate->pred_ready[i]);
+ for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
+ state->alu_nop.full_ready[i] = MAX2(state->alu_nop.full_ready[i],
+ pstate->alu_nop.full_ready[i]);
+ state->alu_nop.half_ready[i] = MAX2(state->alu_nop.half_ready[i],
+ pstate->alu_nop.half_ready[i]);
+ state->non_alu_nop.full_ready[i] = MAX2(state->non_alu_nop.full_ready[i],
+ pstate->non_alu_nop.full_ready[i]);
+ state->non_alu_nop.half_ready[i] = MAX2(state->non_alu_nop.half_ready[i],
+ pstate->non_alu_nop.half_ready[i]);
+ }
}
+ /* We need to take physical-only edges into account when tracking shared
+ * registers.
+ */
+ for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
+ struct ir3_block *predecessor = block->physical_predecessors[i];
+ struct ir3_legalize_block_data *pbd = predecessor->data;
+ struct ir3_legalize_state *pstate = &pbd->state;
+
+ regmask_or_shared(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
+ }
+
+ memcpy(&bd->state, state, sizeof(*state));
+ state = &bd->state;
+
unsigned input_count = 0;
foreach_instr (n, &block->instr_list) {
@@ -125,7 +382,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
* with the end of the program.
*/
assert(input_count == 0 || !ctx->early_input_release ||
- block == ir3_start_block(block->shader));
+ block == ir3_after_preamble(block->shader));
/* remove all the instructions from the list, we'll be adding
* them back in as we go
@@ -133,6 +390,8 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
list_replace(&block->instr_list, &instr_list);
list_inithead(&block->instr_list);
+ unsigned cycle = 0;
+
foreach_instr_safe (n, &instr_list) {
unsigned i;
@@ -150,18 +409,14 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
}
- if (last_n && is_barrier(last_n)) {
- n->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+ if ((last_n && is_barrier(last_n)) || n->opc == OPC_SHPE) {
+ apply_ss(n, state, mergedregs);
+ apply_sy(n, state, mergedregs);
last_input_needs_ss = false;
- regmask_init(&state->needs_ss_war, mergedregs);
- regmask_init(&state->needs_ss, mergedregs);
- regmask_init(&state->needs_sy, mergedregs);
}
if (last_n && (last_n->opc == OPC_PREDT)) {
- n->flags |= IR3_INSTR_SS;
- regmask_init(&state->needs_ss_war, mergedregs);
- regmask_init(&state->needs_ss, mergedregs);
+ apply_ss(n, state, mergedregs);
}
/* NOTE: consider dst register too.. it could happen that
@@ -184,37 +439,25 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
* some tests for both this and (sy)..
*/
if (regmask_get(&state->needs_ss, reg)) {
- n->flags |= IR3_INSTR_SS;
+ apply_ss(n, state, mergedregs);
last_input_needs_ss = false;
- regmask_init(&state->needs_ss_war, mergedregs);
- regmask_init(&state->needs_ss, mergedregs);
}
if (regmask_get(&state->needs_sy, reg)) {
- n->flags |= IR3_INSTR_SY;
- regmask_init(&state->needs_sy, mergedregs);
+ apply_sy(n, state, mergedregs);
+ }
+ } else if ((reg->flags & IR3_REG_CONST)) {
+ if (state->needs_ss_for_const) {
+ apply_ss(n, state, mergedregs);
+ last_input_needs_ss = false;
}
}
-
- /* TODO: is it valid to have address reg loaded from a
- * relative src (ie. mova a0, c<a0.x+4>)? If so, the
- * last_rel check below should be moved ahead of this:
- */
- if (reg->flags & IR3_REG_RELATIV)
- last_rel = n;
}
foreach_dst (reg, n) {
if (regmask_get(&state->needs_ss_war, reg)) {
- n->flags |= IR3_INSTR_SS;
+ apply_ss(n, state, mergedregs);
last_input_needs_ss = false;
- regmask_init(&state->needs_ss_war, mergedregs);
- regmask_init(&state->needs_ss, mergedregs);
- }
-
- if (last_rel && (reg->num == regid(REG_A0, 0))) {
- last_rel->flags |= IR3_INSTR_UL;
- last_rel = NULL;
}
}
@@ -228,11 +471,40 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
nop = ir3_NOP(block);
nop->flags |= IR3_INSTR_SS;
n->flags &= ~IR3_INSTR_SS;
+ last_n = nop;
+ cycle++;
+ }
+
+ unsigned delay = delay_calc(state, n, cycle);
+
+ /* NOTE: I think the nopN encoding works for a5xx and
+ * probably a4xx, but not a3xx. So far only tested on
+ * a6xx.
+ */
+
+ if ((delay > 0) && (ctx->compiler->gen >= 6) && last_n &&
+ ((opc_cat(last_n->opc) == 2) || (opc_cat(last_n->opc) == 3)) &&
+ (last_n->repeat == 0)) {
+ /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
+ unsigned transfer = MIN2(delay, 3 - last_n->nop);
+ last_n->nop += transfer;
+ delay -= transfer;
+ cycle += transfer;
}
- /* need to be able to set (ss) on first instruction: */
- if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
- ir3_NOP(block);
+ if ((delay > 0) && last_n && (last_n->opc == OPC_NOP)) {
+ /* the previous nop can encode at most 5 repeats: */
+ unsigned transfer = MIN2(delay, 5 - last_n->repeat);
+ last_n->repeat += transfer;
+ delay -= transfer;
+ cycle += transfer;
+ }
+
+ if (delay > 0) {
+ assert(delay <= 6);
+ ir3_NOP(block)->repeat = delay - 1;
+ cycle += delay;
+ }
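
A hedged standalone sketch of the folding above, with invented numbers: pending delay is first absorbed into the previous cat2/cat3 instruction's (nopN) suffix (at most 3 total), and whatever cannot be folded there (or into a preceding nop's (rptN), which allows at most 5 repeats) becomes a fresh nop with rpt = delay - 1.

#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
   unsigned delay = 5;     /* cycles still needed before the next instruction */
   unsigned prev_nop = 1;  /* (nopN) already encoded on the previous cat2/cat3 */

   /* fold as much as possible onto the previous instruction, up to (nop3) */
   unsigned transfer = MIN2(delay, 3 - prev_nop);
   prev_nop += transfer;
   delay -= transfer;

   /* the remainder becomes a standalone nop with (rptN) */
   if (delay > 0)
      printf("(nop%u) on the previous instr, then a nop with rpt=%u\n",
             prev_nop, delay - 1);
   else
      printf("(nop%u) on the previous instr, no extra nop needed\n", prev_nop);
   return 0;
}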
if (ctx->compiler->samgq_workaround &&
ctx->type != MESA_SHADER_FRAGMENT &&
@@ -255,6 +527,11 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
if (is_sfu(n))
regmask_set(&state->needs_ss, n->dsts[0]);
+ foreach_dst (dst, n) {
+ if (dst->flags & IR3_REG_SHARED)
+ regmask_set(&state->needs_ss, dst);
+ }
+
if (is_tex_or_prefetch(n)) {
regmask_set(&state->needs_sy, n->dsts[0]);
if (n->opc == OPC_META_TEX_PREFETCH)
@@ -264,28 +541,25 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
ir3_NOP(block)->flags |= IR3_INSTR_SS;
last_input_needs_ss = false;
} else if (is_load(n)) {
- /* seems like ldlv needs (ss) bit instead?? which is odd but
- * makes a bunch of flat-varying tests start working on a4xx.
- */
- if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL) ||
- (n->opc == OPC_LDLW))
+ if (is_local_mem_load(n))
regmask_set(&state->needs_ss, n->dsts[0]);
else
regmask_set(&state->needs_sy, n->dsts[0]);
} else if (is_atomic(n->opc)) {
- if (n->flags & IR3_INSTR_G) {
- if (ctx->compiler->gen >= 6) {
- /* New encoding, returns result via second src: */
- regmask_set(&state->needs_sy, n->srcs[2]);
- } else {
- regmask_set(&state->needs_sy, n->dsts[0]);
- }
+ if (is_bindless_atomic(n->opc)) {
+ regmask_set(&state->needs_sy, n->srcs[2]);
+ } else if (is_global_a3xx_atomic(n->opc) ||
+ is_global_a6xx_atomic(n->opc)) {
+ regmask_set(&state->needs_sy, n->dsts[0]);
} else {
regmask_set(&state->needs_ss, n->dsts[0]);
}
+ } else if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
+ state->needs_ss_for_const = true;
}
- if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
+ if (is_ssbo(n->opc) || is_global_a3xx_atomic(n->opc) ||
+ is_bindless_atomic(n->opc))
ctx->so->has_ssbo = true;
/* both tex/sfu appear to not always immediately consume
@@ -293,11 +567,18 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
*/
if (is_tex(n) || is_sfu(n) || is_mem(n)) {
foreach_src (reg, n) {
- if (reg_gpr(reg))
- regmask_set(&state->needs_ss_war, reg);
+ regmask_set(&state->needs_ss_war, reg);
}
}
+ if (count_instruction(n))
+ cycle += 1;
+
+ delay_update(state, n, cycle, mergedregs);
+
+ if (count_instruction(n))
+ cycle += n->repeat;
+
if (ctx->early_input_release && is_input(n)) {
last_input_needs_ss |= (n->opc == OPC_LDLV);
@@ -326,9 +607,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
last_input->dsts[0]->flags |= IR3_REG_EI;
if (last_input_needs_ss) {
- last_input->flags |= IR3_INSTR_SS;
- regmask_init(&state->needs_ss_war, mergedregs);
- regmask_init(&state->needs_ss, mergedregs);
+ apply_ss(last_input, state, mergedregs);
}
}
}
@@ -338,7 +617,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
assert(inputs_remaining == 0 || !ctx->early_input_release);
- if (has_tex_prefetch && input_count == 0) {
+ if (has_tex_prefetch && !ctx->has_inputs) {
/* texture prefetch, but *no* inputs.. we need to insert a
* dummy bary.f at the top of the shader to unblock varying
* storage:
@@ -356,8 +635,23 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
list_add(&baryf->node, &block->instr_list);
}
- if (last_rel)
- last_rel->flags |= IR3_INSTR_UL;
+ /* Currently our nop state contains the cycle offset from the start of this
+ * block when each register becomes ready. But successor blocks need the
+ * cycle offset from their start, which is this block's end. Translate the
+ * cycle offset.
+ */
+ for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
+ state->pred_ready[i] = MAX2(state->pred_ready[i], cycle) - cycle;
+ for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
+ state->alu_nop.full_ready[i] =
+ MAX2(state->alu_nop.full_ready[i], cycle) - cycle;
+ state->alu_nop.half_ready[i] =
+ MAX2(state->alu_nop.half_ready[i], cycle) - cycle;
+ state->non_alu_nop.full_ready[i] =
+ MAX2(state->non_alu_nop.full_ready[i], cycle) - cycle;
+ state->non_alu_nop.half_ready[i] =
+ MAX2(state->non_alu_nop.half_ready[i], cycle) - cycle;
+ }
bd->valid = true;
@@ -382,8 +676,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
* dsxpp.1.p dst, src
*
* We apply this after flags syncing, as we don't want to sync in between the
- * two (which might happen if dst == src). We do it before nop scheduling
- * because that needs to count actual instructions.
+ * two (which might happen if dst == src).
*/
static bool
apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
@@ -405,13 +698,43 @@ apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
struct ir3_instruction *op_p = ir3_instr_clone(n);
op_p->flags = IR3_INSTR_P;
- ctx->so->need_fine_derivatives = true;
+ ctx->so->need_full_quad = true;
}
}
return true;
}
+static void
+apply_push_consts_load_macro(struct ir3_legalize_ctx *ctx,
+ struct ir3_block *block)
+{
+ foreach_instr (n, &block->instr_list) {
+ if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
+ struct ir3_instruction *stsc = ir3_instr_create(block, OPC_STSC, 0, 2);
+ ir3_instr_move_after(stsc, n);
+ ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
+ n->push_consts.dst_base;
+ ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
+ n->push_consts.src_base;
+ stsc->cat6.iim_val = n->push_consts.src_size;
+ stsc->cat6.type = TYPE_U32;
+
+ if (ctx->compiler->stsc_duplication_quirk) {
+ struct ir3_instruction *nop = ir3_NOP(block);
+ ir3_instr_move_after(nop, stsc);
+ nop->flags |= IR3_INSTR_SS;
+ ir3_instr_move_after(ir3_instr_clone(stsc), nop);
+ }
+
+ list_delinit(&n->node);
+ break;
+ } else if (!is_meta(n)) {
+ break;
+ }
+ }
+}
+
/* NOTE: branch instructions are always the last instruction(s)
* in the block. We take advantage of this as we resolve the
* branches, since "if (foo) break;" constructs turn into
@@ -507,26 +830,21 @@ retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target)
if (cur_block->successors[0] == old_target) {
cur_block->successors[0] = new_target;
} else {
- debug_assert(cur_block->successors[1] == old_target);
+ assert(cur_block->successors[1] == old_target);
cur_block->successors[1] = new_target;
}
- /* also update physical_successors.. we don't really need them at
- * this stage, but it keeps ir3_validate happy:
- */
- if (cur_block->physical_successors[0] == old_target) {
- cur_block->physical_successors[0] = new_target;
- } else {
- debug_assert(cur_block->physical_successors[1] == old_target);
- cur_block->physical_successors[1] = new_target;
- }
-
/* update new target's predecessors: */
ir3_block_add_predecessor(new_target, cur_block);
/* and remove old_target's predecessor: */
ir3_block_remove_predecessor(old_target, cur_block);
+ /* If we reconverged at the old target, we'll reconverge at the new target
+ * too:
+ */
+ new_target->reconvergence_point |= old_target->reconvergence_point;
+
instr->cat0.target = new_target;
if (old_target->predecessors_count == 0) {
@@ -538,6 +856,21 @@ retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target)
}
static bool
+is_invertible_branch(struct ir3_instruction *instr)
+{
+ switch (instr->opc) {
+ case OPC_BR:
+ case OPC_BRAA:
+ case OPC_BRAO:
+ case OPC_BANY:
+ case OPC_BALL:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool
opt_jump(struct ir3 *ir)
{
bool progress = false;
@@ -547,6 +880,12 @@ opt_jump(struct ir3 *ir)
block->index = index++;
foreach_block (block, &ir->block_list) {
+ /* This pass destroys the physical CFG so don't keep it around to avoid
+ * validation errors.
+ */
+ block->physical_successors_count = 0;
+ block->physical_predecessors_count = 0;
+
foreach_instr (instr, &block->instr_list) {
if (!is_flow(instr) || !instr->cat0.target)
continue;
@@ -581,13 +920,14 @@ opt_jump(struct ir3 *ir)
if (jumps[0]->opc == OPC_JUMP)
jumps[1] = NULL;
- else if (jumps[0]->opc != OPC_B || !jumps[1] || jumps[1]->opc != OPC_B)
+ else if (!is_invertible_branch(jumps[0]) || !jumps[1] ||
+ !is_invertible_branch(jumps[1])) {
continue;
+ }
for (unsigned i = 0; i < 2; i++) {
if (!jumps[i])
continue;
-
struct ir3_block *tblock = jumps[i]->cat0.target;
if (&tblock->node == block->node.next) {
list_delinit(&jumps[i]->node);
@@ -616,109 +956,214 @@ resolve_jumps(struct ir3 *ir)
static void
mark_jp(struct ir3_block *block)
{
+ /* We only call this on the end block (in kill_sched) or after retargeting
+ * all jumps to empty blocks (in mark_xvergence_points) so there's no need to
+ * worry about empty blocks.
+ */
+ assert(!list_is_empty(&block->instr_list));
+
struct ir3_instruction *target =
list_first_entry(&block->instr_list, struct ir3_instruction, node);
target->flags |= IR3_INSTR_JP;
}
-/* Mark points where control flow converges or diverges.
+/* Mark points where control flow reconverges.
*
- * Divergence points could actually be re-convergence points where
- * "parked" threads are recoverged with threads that took the opposite
- * path last time around. Possibly it is easier to think of (jp) as
- * "the execution mask might have changed".
+ * Re-convergence points are where "parked" threads are reconverged with threads
+ * that took the opposite path last time around. We already calculated them, we
+ * just need to mark them with (jp).
*/
static void
mark_xvergence_points(struct ir3 *ir)
{
foreach_block (block, &ir->block_list) {
- if (block->predecessors_count > 1) {
- /* if a block has more than one possible predecessor, then
- * the first instruction is a convergence point.
- */
+ if (block->reconvergence_point)
mark_jp(block);
- } else if (block->predecessors_count == 1) {
- /* If a block has one predecessor, which has multiple possible
- * successors, it is a divergence point.
- */
- for (unsigned i = 0; i < block->predecessors_count; i++) {
- struct ir3_block *predecessor = block->predecessors[i];
- if (predecessor->successors[1]) {
- mark_jp(block);
- }
- }
- }
}
}
+static void
+invert_branch(struct ir3_instruction *branch)
+{
+ switch (branch->opc) {
+ case OPC_BR:
+ break;
+ case OPC_BALL:
+ branch->opc = OPC_BANY;
+ break;
+ case OPC_BANY:
+ branch->opc = OPC_BALL;
+ break;
+ case OPC_BRAA:
+ branch->opc = OPC_BRAO;
+ break;
+ case OPC_BRAO:
+ branch->opc = OPC_BRAA;
+ break;
+ default:
+ unreachable("can't get here");
+ }
+
+ branch->cat0.inv1 = !branch->cat0.inv1;
+ branch->cat0.inv2 = !branch->cat0.inv2;
+ branch->cat0.target = branch->block->successors[1];
+}
+
/* Insert the branch/jump instructions for flow control between blocks.
* Initially this is done naively, without considering if the successor
* block immediately follows the current block (ie. so no jump required),
* but that is cleaned up in opt_jump().
- *
- * TODO what ensures that the last write to p0.x in a block is the
- * branch condition? Have we been getting lucky all this time?
*/
static void
block_sched(struct ir3 *ir)
{
foreach_block (block, &ir->block_list) {
+ struct ir3_instruction *terminator = ir3_block_get_terminator(block);
+
if (block->successors[1]) {
/* if/else, conditional branches to "then" or "else": */
struct ir3_instruction *br1, *br2;
- if (block->brtype == IR3_BRANCH_GETONE) {
- /* getone can't be inverted, and it wouldn't even make sense
+ assert(terminator);
+ unsigned opc = terminator->opc;
+
+ if (opc == OPC_GETONE || opc == OPC_SHPS || opc == OPC_GETLAST) {
+ /* getone/shps/getlast can't be inverted, and it wouldn't even make sense
* to follow it with an inverted branch, so follow it by an
* unconditional branch.
*/
- debug_assert(!block->condition);
- br1 = ir3_GETONE(block);
+ assert(terminator->srcs_count == 0);
+ br1 = terminator;
br1->cat0.target = block->successors[1];
br2 = ir3_JUMP(block);
br2->cat0.target = block->successors[0];
- } else {
- debug_assert(block->condition);
-
+ } else if (opc == OPC_BR || opc == OPC_BRAA || opc == OPC_BRAO ||
+ opc == OPC_BALL || opc == OPC_BANY) {
/* create "else" branch first (since "then" block should
* frequently/always end up being a fall-thru):
*/
- br1 = ir3_instr_create(block, OPC_B, 0, 1);
- ir3_src_create(br1, regid(REG_P0, 0), 0)->def =
- block->condition->dsts[0];
- br1->cat0.inv1 = true;
- br1->cat0.target = block->successors[1];
-
- /* "then" branch: */
- br2 = ir3_instr_create(block, OPC_B, 0, 1);
- ir3_src_create(br2, regid(REG_P0, 0), 0)->def =
- block->condition->dsts[0];
+ br1 = terminator;
+ br2 = ir3_instr_clone(br1);
+ invert_branch(br1);
br2->cat0.target = block->successors[0];
+ } else {
+ assert(opc == OPC_PREDT || opc == OPC_PREDF);
- switch (block->brtype) {
- case IR3_BRANCH_COND:
- br1->cat0.brtype = br2->cat0.brtype = BRANCH_PLAIN;
- break;
- case IR3_BRANCH_ALL:
- br1->cat0.brtype = BRANCH_ANY;
- br2->cat0.brtype = BRANCH_ALL;
- break;
- case IR3_BRANCH_ANY:
- br1->cat0.brtype = BRANCH_ALL;
- br2->cat0.brtype = BRANCH_ANY;
- break;
- case IR3_BRANCH_GETONE:
- unreachable("can't get here");
- }
+ /* Handled by prede_sched. */
+ terminator->cat0.target = block->successors[0];
+ continue;
}
+
+ /* Creating br2 caused it to be moved before the terminator b1, move it
+ * back.
+ */
+ ir3_instr_move_after(br2, br1);
} else if (block->successors[0]) {
- /* otherwise unconditional jump to next block: */
- struct ir3_instruction *jmp;
+ /* otherwise unconditional jump or predt/predf to next block which
+ * should already have been inserted.
+ */
+ assert(terminator);
+ assert(terminator->opc == OPC_JUMP || terminator->opc == OPC_PREDT ||
+ terminator->opc == OPC_PREDF);
+ terminator->cat0.target = block->successors[0];
+ }
+ }
+}
+
+static void
+prede_sched(struct ir3 *ir)
+{
+ unsigned index = 0;
+ foreach_block (block, &ir->block_list)
+ block->index = index++;
- jmp = ir3_JUMP(block);
- jmp->cat0.target = block->successors[0];
+ foreach_block (block, &ir->block_list) {
+ /* Look for the following pattern generated by NIR lowering. The numbers
+ * at the top of blocks are their index.
+ * |--- i ----|
+ * | ... |
+ * | pred[tf] |
+ * |----------|
+ * succ0 / \ succ1
+ * |-- i+1 ---| |-- i+2 ---|
+ * | ... | | ... |
+ * | pred[ft] | | ... |
+ * |----------| |----------|
+ * succ0 \ / succ0
+ * |--- j ----|
+ * | ... |
+ * |----------|
+ */
+ struct ir3_block *succ0 = block->successors[0];
+ struct ir3_block *succ1 = block->successors[1];
+
+ if (!succ1)
+ continue;
+
+ struct ir3_instruction *terminator = ir3_block_get_terminator(block);
+ if (!terminator)
+ continue;
+ if (terminator->opc != OPC_PREDT && terminator->opc != OPC_PREDF)
+ continue;
+
+ assert(!succ0->successors[1] && !succ1->successors[1]);
+ assert(succ0->successors[0] == succ1->successors[0]);
+ assert(succ0->predecessors_count == 1 && succ1->predecessors_count == 1);
+ assert(succ0->index == (block->index + 1));
+ assert(succ1->index == (block->index + 2));
+
+ struct ir3_instruction *succ0_terminator =
+ ir3_block_get_terminator(succ0);
+ assert(succ0_terminator);
+ assert(succ0_terminator->opc ==
+ (terminator->opc == OPC_PREDT ? OPC_PREDF : OPC_PREDT));
+
+ ASSERTED struct ir3_instruction *succ1_terminator =
+ ir3_block_get_terminator(succ1);
+ assert(!succ1_terminator || (succ1_terminator->opc == OPC_JUMP));
+
+ /* Simple case: both successors contain instructions. Keep both blocks and
+ * insert prede before the second successor's terminator:
+ * |--- i ----|
+ * | ... |
+ * | pred[tf] |
+ * |----------|
+ * succ0 / \ succ1
+ * |-- i+1 ---| |-- i+2 ---|
+ * | ... | | ... |
+ * | pred[ft] | | prede |
+ * |----------| |----------|
+ * succ0 \ / succ0
+ * |--- j ----|
+ * | ... |
+ * |----------|
+ */
+ if (!list_is_empty(&succ1->instr_list)) {
+ ir3_PREDE(succ1);
+ continue;
}
+
+ /* Second successor is empty so we can remove it:
+ * |--- i ----|
+ * | ... |
+ * | pred[tf] |
+ * |----------|
+ * succ0 / \ succ1
+ * |-- i+1 ---| |
+ * | ... | |
+ * | prede | |
+ * |----------| |
+ * succ0 \ /
+ * |--- j ----|
+ * | ... |
+ * |----------|
+ */
+ list_delinit(&succ0_terminator->node);
+ ir3_PREDE(succ0);
+ remove_unused_block(succ1);
+ block->successors[1] = succ0->successors[0];
+ ir3_block_add_predecessor(succ0->successors[0], block);
}
}
@@ -742,6 +1187,8 @@ block_sched(struct ir3 *ir)
static void
kill_sched(struct ir3 *ir, struct ir3_shader_variant *so)
{
+ ir3_count_instructions(ir);
+
/* True if we know that this block will always eventually lead to the end
* block:
*/
@@ -763,7 +1210,7 @@ kill_sched(struct ir3 *ir, struct ir3_shader_variant *so)
if (instr->opc != OPC_KILL)
continue;
- struct ir3_instruction *br = ir3_instr_create(block, OPC_B, 0, 1);
+ struct ir3_instruction *br = ir3_instr_create(block, OPC_BR, 0, 1);
ir3_src_create(br, instr->srcs[0]->num, instr->srcs[0]->flags)->wrmask =
1;
br->cat0.target =
@@ -790,51 +1237,243 @@ kill_sched(struct ir3 *ir, struct ir3_shader_variant *so)
}
}
-/* Insert nop's required to make this a legal/valid shader program: */
static void
-nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
+dbg_sync_sched(struct ir3 *ir, struct ir3_shader_variant *so)
{
foreach_block (block, &ir->block_list) {
- struct ir3_instruction *last = NULL;
- struct list_head instr_list;
+ foreach_instr_safe (instr, &block->instr_list) {
+ if (opc_cat(instr->opc) == 4 || opc_cat(instr->opc) == 5 ||
+ opc_cat(instr->opc) == 6) {
+ struct ir3_instruction *nop = ir3_NOP(block);
+ nop->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+ ir3_instr_move_after(nop, instr);
+ }
+ }
+ }
+}
- /* remove all the instructions from the list, we'll be adding
- * them back in as we go
- */
- list_replace(&block->instr_list, &instr_list);
- list_inithead(&block->instr_list);
+static void
+dbg_nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
+{
+ foreach_block (block, &ir->block_list) {
+ foreach_instr_safe (instr, &block->instr_list) {
+ struct ir3_instruction *nop = ir3_NOP(block);
+ nop->repeat = 5;
+ ir3_instr_move_before(nop, instr);
+ }
+ }
+}
+
+struct ir3_helper_block_data {
+ /* Whether helper invocations may be used on any path starting at the
+ * beginning of the block.
+ */
+ bool uses_helpers_beginning;
- foreach_instr_safe (instr, &instr_list) {
- unsigned delay = ir3_delay_calc_exact(block, instr, so->mergedregs);
+ /* Whether helper invocations may be used by the end of the block. Branch
+ * instructions are considered to be "between" blocks, because (eq) has to be
+ * inserted after them in the successor blocks, so branch instructions using
+ * helpers will result in uses_helpers_end = true for their block.
+ */
+ bool uses_helpers_end;
+};
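
As a hedged aside, the backward dataflow these two flags drive can be illustrated with a toy straight-line CFG A -> B -> C in which only B reads helper invocations; the arrays and block names below are invented for the example, not ir3 data structures:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
   /* Toy CFG A -> B -> C; only block B contains a helper-using instruction. */
   bool uses_beginning[3] = {false, true, false};
   bool uses_end[3] = {false, false, false};
   int pred[3] = {-1, 0, 1}; /* single predecessor per block, -1 for none */

   bool progress;
   do {
      progress = false;
      for (int b = 2; b >= 0; b--) {
         if (!uses_beginning[b] || pred[b] < 0)
            continue;
         /* a use reachable from our start keeps helpers live through the
          * predecessor's end, and therefore through its beginning too */
         uses_end[pred[b]] = true;
         if (!uses_beginning[pred[b]]) {
            uses_beginning[pred[b]] = true;
            progress = true;
         }
      }
   } while (progress);

   /* Result: A begin+end, B begin only, C neither -- so the last use is
    * somewhere inside B and that is where (eq) would be inserted. */
   for (int b = 0; b < 3; b++)
      printf("block %c: beginning=%d end=%d\n", 'A' + b,
             uses_beginning[b], uses_end[b]);
   return 0;
}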
- /* NOTE: I think the nopN encoding works for a5xx and
- * probably a4xx, but not a3xx. So far only tested on
- * a6xx.
- */
+/* Insert (eq) after the last instruction using the results of helper
+ * invocations. Use a backwards dataflow analysis to determine at which points
+ * in the program helper invocations are definitely never used, and then insert
+ * (eq) at the point where we cross from a point where they may be used to a
+ * point where they are never used.
+ */
+static void
+helper_sched(struct ir3_legalize_ctx *ctx, struct ir3 *ir,
+ struct ir3_shader_variant *so)
+{
+ bool non_prefetch_helpers = false;
+
+ foreach_block (block, &ir->block_list) {
+ struct ir3_helper_block_data *bd =
+ rzalloc(ctx, struct ir3_helper_block_data);
+ foreach_instr (instr, &block->instr_list) {
+ if (uses_helpers(instr)) {
+ bd->uses_helpers_beginning = true;
+ if (instr->opc != OPC_META_TEX_PREFETCH) {
+ non_prefetch_helpers = true;
+ break;
+ }
+ }
- if ((delay > 0) && (ir->compiler->gen >= 6) && last &&
- ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3)) &&
- (last->repeat == 0)) {
- /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
- unsigned transfer = MIN2(delay, 3 - last->nop);
- last->nop += transfer;
- delay -= transfer;
+ if (instr->opc == OPC_SHPE) {
+ /* (eq) is not allowed in preambles, mark the whole preamble as
+ * requiring helpers to avoid putting it there.
+ */
+ bd->uses_helpers_beginning = true;
+ bd->uses_helpers_end = true;
}
+ }
- if ((delay > 0) && last && (last->opc == OPC_NOP)) {
- /* the previous nop can encode at most 5 repeats: */
- unsigned transfer = MIN2(delay, 5 - last->repeat);
- last->repeat += transfer;
- delay -= transfer;
+ struct ir3_instruction *terminator = ir3_block_get_terminator(block);
+ if (terminator) {
+ if (terminator->opc == OPC_BALL || terminator->opc == OPC_BANY ||
+ terminator->opc == OPC_GETONE) {
+ bd->uses_helpers_beginning = true;
+ bd->uses_helpers_end = true;
}
+ }
- if (delay > 0) {
- debug_assert(delay <= 6);
- ir3_NOP(block)->repeat = delay - 1;
+ block->data = bd;
+ }
+
+ /* If only prefetches use helpers then we can disable them in the shader via
+ * a register setting.
+ */
+ if (!non_prefetch_helpers) {
+ so->prefetch_end_of_quad = true;
+ return;
+ }
+
+ bool progress;
+ do {
+ progress = false;
+ foreach_block_rev (block, &ir->block_list) {
+ struct ir3_helper_block_data *bd = block->data;
+
+ if (!bd->uses_helpers_beginning)
+ continue;
+
+ for (unsigned i = 0; i < block->predecessors_count; i++) {
+ struct ir3_block *pred = block->predecessors[i];
+ struct ir3_helper_block_data *pred_bd = pred->data;
+ if (!pred_bd->uses_helpers_end) {
+ pred_bd->uses_helpers_end = true;
+ }
+ if (!pred_bd->uses_helpers_beginning) {
+ pred_bd->uses_helpers_beginning = true;
+ progress = true;
+ }
}
+ }
+ } while (progress);
- list_addtail(&instr->node, &block->instr_list);
- last = instr;
+ /* Now, we need to determine the points where helper invocations become
+ * unused.
+ */
+ foreach_block (block, &ir->block_list) {
+ struct ir3_helper_block_data *bd = block->data;
+ if (bd->uses_helpers_end)
+ continue;
+
+ /* We need to check the predecessors because of situations with critical
+ * edges like this that can occur after optimizing jumps:
+ *
+ * br p0.x, #endif
+ * ...
+ * sam ...
+ * ...
+ * endif:
+ * ...
+ * end
+ *
+ * The endif block will have uses_helpers_beginning = false and
+ * uses_helpers_end = false, but because we jump to there from the
+ * beginning of the if where uses_helpers_end = true, we still want to
+ * add an (eq) at the beginning of the block:
+ *
+ * br p0.x, #endif
+ * ...
+ * sam ...
+ * (eq)nop
+ * ...
+ * endif:
+ * (eq)nop
+ * ...
+ * end
+ *
+ * This adds an extra nop in the case where the branch isn't taken, but that's
+ * probably preferable to adding an extra jump instruction which is what
+ * would happen if we ran this pass before optimizing jumps:
+ *
+ * br p0.x, #else
+ * ...
+ * sam ...
+ * (eq)nop
+ * ...
+ * jump #endif
+ * else:
+ * (eq)nop
+ * endif:
+ * ...
+ * end
+ *
+ * We also need this to make sure we insert (eq) after branches which use
+ * helper invocations.
+ */
+ bool pred_uses_helpers = bd->uses_helpers_beginning;
+ for (unsigned i = 0; i < block->predecessors_count; i++) {
+ struct ir3_block *pred = block->predecessors[i];
+ struct ir3_helper_block_data *pred_bd = pred->data;
+ if (pred_bd->uses_helpers_end) {
+ pred_uses_helpers = true;
+ break;
+ }
+ }
+
+ if (!pred_uses_helpers)
+ continue;
+
+ /* The last use of helpers is somewhere between the beginning and the
+ * end. first_instr will be the first instruction where helpers are no
+ * longer required, or NULL if helpers are not required just at the end.
+ */
+ struct ir3_instruction *first_instr = NULL;
+ foreach_instr_rev (instr, &block->instr_list) {
+ /* Skip prefetches because they actually execute before the block
+ * starts and at this stage they aren't guaranteed to be at the start
+ * of the block.
+ */
+ if (uses_helpers(instr) && instr->opc != OPC_META_TEX_PREFETCH)
+ break;
+ first_instr = instr;
+ }
+
+ bool killed = false;
+ bool expensive_instruction_in_block = false;
+ if (first_instr) {
+ foreach_instr_from (instr, first_instr, &block->instr_list) {
+ /* If there's already a nop, we don't have to worry about whether to
+ * insert one.
+ */
+ if (instr->opc == OPC_NOP) {
+ instr->flags |= IR3_INSTR_EQ;
+ killed = true;
+ break;
+ }
+
+ /* ALU and SFU instructions probably aren't going to benefit much
+ * from killing helper invocations, because they complete at least
+ * an entire quad in a cycle and don't access any quad-divergent
+ * memory, so delay emitting (eq) in the hopes that we find a nop
+ * afterwards.
+ */
+ if (is_alu(instr) || is_sfu(instr))
+ continue;
+ if (instr->opc == OPC_PREDE)
+ continue;
+
+ expensive_instruction_in_block = true;
+ break;
+ }
+ }
+
+ /* If this block isn't the last block before the end instruction, assume
+ * that there may be expensive instructions in later blocks so it's worth
+ * it to insert a nop.
+ */
+ if (!killed && (expensive_instruction_in_block ||
+ block->successors[0] != ir3_end_block(ir))) {
+ struct ir3_instruction *nop = ir3_NOP(block);
+ nop->flags |= IR3_INSTR_EQ;
+ if (first_instr)
+ ir3_instr_move_before(nop, first_instr);
}
}
}
@@ -859,24 +1498,28 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
regmask_init(&bd->state.needs_ss_war, mergedregs);
regmask_init(&bd->state.needs_ss, mergedregs);
regmask_init(&bd->state.needs_sy, mergedregs);
+ regmask_init(&bd->begin_state.needs_ss_war, mergedregs);
+ regmask_init(&bd->begin_state.needs_ss, mergedregs);
+ regmask_init(&bd->begin_state.needs_sy, mergedregs);
block->data = bd;
}
- ir3_remove_nops(ir);
-
/* We may have failed to pull all input loads into the first block.
* In such case at the moment we aren't able to find a better place
* to for (ei) than the end of the program.
* a5xx and a6xx do automatically release varying storage at the end.
*/
ctx->early_input_release = true;
- struct ir3_block *start_block = ir3_start_block(ir);
+ struct ir3_block *start_block = ir3_after_preamble(ir);
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
- if (is_input(instr) && block != start_block) {
- ctx->early_input_release = false;
- break;
+ if (is_input(instr)) {
+ ctx->has_inputs = true;
+ if (block != start_block) {
+ ctx->early_input_release = false;
+ break;
+ }
}
}
}
@@ -893,6 +1536,14 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
*max_bary = ctx->max_bary;
+ foreach_block (block, &ir->block_list) {
+ struct ir3_instruction *terminator = ir3_block_get_terminator(block);
+ if (terminator && terminator->opc == OPC_GETONE) {
+ apply_push_consts_load_macro(ctx, block->successors[0]);
+ break;
+ }
+ }
+
block_sched(ir);
if (so->type == MESA_SHADER_FRAGMENT)
kill_sched(ir, so);
@@ -901,11 +1552,24 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
progress |= apply_fine_deriv_macro(ctx, block);
}
- nop_sched(ir, so);
+ if (ir3_shader_debug & IR3_DBG_FULLSYNC) {
+ dbg_sync_sched(ir, so);
+ }
+
+ if (ir3_shader_debug & IR3_DBG_FULLNOP) {
+ dbg_nop_sched(ir, so);
+ }
while (opt_jump(ir))
;
+ prede_sched(ir);
+
+ /* TODO: does (eq) exist before a6xx? */
+ if (so->type == MESA_SHADER_FRAGMENT && so->need_pixlod &&
+ so->compiler->gen >= 6)
+ helper_sched(ctx, ir, so);
+
ir3_count_instructions(ir);
resolve_jumps(ir);