Diffstat (limited to 'src/freedreno/ir3/ir3_legalize.c')
-rw-r--r-- | src/freedreno/ir3/ir3_legalize.c | 1008
1 file changed, 836 insertions, 172 deletions
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c index 8d56efccfc6..b3c34ba5f3b 100644 --- a/src/freedreno/ir3/ir3_legalize.c +++ b/src/freedreno/ir3/ir3_legalize.c @@ -39,7 +39,7 @@ * 1) Iteratively determine where sync ((sy)/(ss)) flags are needed, * based on state flowing out of predecessor blocks until there is * no further change. In some cases this requires inserting nops. - * 2) Mark (ei) on last varying input, and (ul) on last use of a0.x + * 2) Mark (ei) on last varying input * 3) Final nop scheduling for instruction latency * 4) Resolve jumps and schedule blocks, marking potential convergence * points with (jp) @@ -51,19 +51,239 @@ struct ir3_legalize_ctx { gl_shader_stage type; int max_bary; bool early_input_release; + bool has_inputs; +}; + +struct ir3_nop_state { + unsigned full_ready[4 * 48]; + unsigned half_ready[4 * 48]; }; struct ir3_legalize_state { regmask_t needs_ss; regmask_t needs_ss_war; /* write after read */ regmask_t needs_sy; + bool needs_ss_for_const; + + /* Each of these arrays contains the cycle when the corresponding register + * becomes "ready" i.e. does not require any more nops. There is a special + * mechanism to let ALU instructions read compatible (i.e. same halfness) + * destinations of another ALU instruction with less delay, so this can + * depend on what type the consuming instruction is, which is why there are + * multiple arrays. The cycle is counted relative to the start of the block. + */ + + /* When ALU instructions reading the given full/half register will be ready. + */ + struct ir3_nop_state alu_nop; + + /* When non-ALU (e.g. cat5) instructions reading the given full/half register + * will be ready. + */ + struct ir3_nop_state non_alu_nop; + + /* When p0.x-w, a0.x, and a1.x are ready. */ + unsigned pred_ready[4]; + unsigned addr_ready[2]; }; struct ir3_legalize_block_data { bool valid; + struct ir3_legalize_state begin_state; struct ir3_legalize_state state; }; +static inline void +apply_ss(struct ir3_instruction *instr, + struct ir3_legalize_state *state, + bool mergedregs) +{ + instr->flags |= IR3_INSTR_SS; + regmask_init(&state->needs_ss_war, mergedregs); + regmask_init(&state->needs_ss, mergedregs); + state->needs_ss_for_const = false; +} + +static inline void +apply_sy(struct ir3_instruction *instr, + struct ir3_legalize_state *state, + bool mergedregs) +{ + instr->flags |= IR3_INSTR_SY; + regmask_init(&state->needs_sy, mergedregs); +} + +static bool +count_instruction(struct ir3_instruction *n) +{ + /* NOTE: don't count branch/jump since we don't know yet if they will + * be eliminated later in resolve_jumps().. really should do that + * earlier so we don't have this constraint. + */ + return is_alu(n) || + (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) && + (n->opc != OPC_BRAA) && (n->opc != OPC_BRAO)); +} + +static unsigned * +get_ready_slot(struct ir3_legalize_state *state, + struct ir3_register *reg, unsigned num, + bool consumer_alu, bool matching_size) +{ + if (reg->flags & IR3_REG_PREDICATE) { + assert(num == reg->num); + assert(reg_num(reg) == REG_P0); + return &state->pred_ready[reg_comp(reg)]; + } + if (reg->num == regid(REG_A0, 0)) + return &state->addr_ready[0]; + if (reg->num == regid(REG_A0, 1)) + return &state->addr_ready[1]; + struct ir3_nop_state *nop = + consumer_alu ? 
&state->alu_nop : &state->non_alu_nop; + assert(!(reg->flags & IR3_REG_SHARED)); + if (reg->flags & IR3_REG_HALF) { + if (matching_size) + return &nop->half_ready[num]; + else + return &nop->full_ready[num / 2]; + } else { + if (matching_size) + return &nop->full_ready[num]; + /* If "num" is large enough, then it can't alias a half-reg because only + * the first half of the full reg space aliases half regs. Return NULL in + * this case. + */ + else if (num * 2 < ARRAY_SIZE(nop->half_ready)) + return &nop->half_ready[num * 2]; + else + return NULL; + } +} + +static unsigned +delay_calc(struct ir3_legalize_state *state, + struct ir3_instruction *instr, + unsigned cycle) +{ + /* As far as we know, shader outputs don't need any delay. */ + if (instr->opc == OPC_END || instr->opc == OPC_CHMASK) + return 0; + + unsigned delay = 0; + foreach_src_n (src, n, instr) { + if (src->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) + continue; + + unsigned elems = post_ra_reg_elems(src); + unsigned num = post_ra_reg_num(src); + unsigned src_cycle = cycle; + + /* gat and swz have scalar sources and each source is read in a + * subsequent cycle. + */ + if (instr->opc == OPC_GAT || instr->opc == OPC_SWZ) + src_cycle += n; + + /* cat3 instructions consume their last source two cycles later, so they + * only need a delay of 1. + */ + if ((is_mad(instr->opc) || is_madsh(instr->opc)) && n == 2) + src_cycle += 2; + + for (unsigned elem = 0; elem < elems; elem++, num++) { + unsigned ready_cycle = + *get_ready_slot(state, src, num, is_alu(instr), true); + delay = MAX2(delay, MAX2(ready_cycle, src_cycle) - src_cycle); + + /* Increment cycle for ALU instructions with (rptN) where sources are + * read each subsequent cycle. + */ + if (instr->repeat && !(src->flags & IR3_REG_RELATIV)) + src_cycle++; + } + } + + return delay; +} + +static void +delay_update(struct ir3_legalize_state *state, + struct ir3_instruction *instr, + unsigned cycle, + bool mergedregs) +{ + foreach_dst_n (dst, n, instr) { + unsigned elems = post_ra_reg_elems(dst); + unsigned num = post_ra_reg_num(dst); + unsigned dst_cycle = cycle; + + /* sct and swz have scalar destinations and each destination is written in + * a subsequent cycle. + */ + if (instr->opc == OPC_SCT || instr->opc == OPC_SWZ) + dst_cycle += n; + + /* For relative accesses with (rptN), we have no way of knowing which + * component is accessed when, so we have to assume the worst and mark + * every array member as being written at the end. + */ + if (dst->flags & IR3_REG_RELATIV) + dst_cycle += instr->repeat; + + if (dst->flags & IR3_REG_SHARED) + continue; + + for (unsigned elem = 0; elem < elems; elem++, num++) { + for (unsigned consumer_alu = 0; consumer_alu < 2; consumer_alu++) { + for (unsigned matching_size = 0; matching_size < 2; matching_size++) { + unsigned *ready_slot = + get_ready_slot(state, dst, num, consumer_alu, matching_size); + + if (!ready_slot) + continue; + + bool reset_ready_slot = false; + unsigned delay = 0; + if (!is_alu(instr)) { + /* Apparently writes that require (ss) or (sy) are + * synchronized against previous writes, so consumers don't + * have to wait for any previous overlapping ALU instructions + * to complete. + */ + reset_ready_slot = true; + } else if ((dst->flags & IR3_REG_PREDICATE) || + reg_num(dst) == REG_A0) { + delay = 6; + if (!matching_size) + continue; + } else { + delay = (consumer_alu && matching_size) ?
3 : 6; + } + + if (!matching_size) { + for (unsigned i = 0; i < reg_elem_size(dst); i++) { + ready_slot[i] = + reset_ready_slot ? 0 : + MAX2(ready_slot[i], dst_cycle + delay); + } + } else { + *ready_slot = + reset_ready_slot ? 0 : + MAX2(*ready_slot, dst_cycle + delay); + } + } + } + + /* Increment cycle for ALU instructions with (rptN) where destinations + * are written each subsequent cycle. + */ + if (instr->repeat && !(dst->flags & IR3_REG_RELATIV)) + dst_cycle++; + } + } +} + /* We want to evaluate each block from the position of any other * predecessor block, in order that the flags set are the union of * all possible program paths. @@ -87,16 +307,23 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) if (bd->valid) return false; - struct ir3_instruction *last_rel = NULL; struct ir3_instruction *last_n = NULL; struct list_head instr_list; struct ir3_legalize_state prev_state = bd->state; - struct ir3_legalize_state *state = &bd->state; + struct ir3_legalize_state *state = &bd->begin_state; bool last_input_needs_ss = false; bool has_tex_prefetch = false; bool mergedregs = ctx->so->mergedregs; - /* our input state is the OR of all predecessor blocks' state: */ + /* Our input state is the OR of all predecessor blocks' state. + * + * Why don't we just zero the state at the beginning before merging in the + * predecessors? Because otherwise updates may not be a "lattice refinement", + * i.e. needs_ss may go from true to false for some register due to a (ss) we + * inserted the second time around (and the same for (sy)). This means that + * there's no solid guarantee the algorithm will converge, and in theory + * there may be infinite loops where we fight over the placement of an (ss). + */ for (unsigned i = 0; i < block->predecessors_count; i++) { struct ir3_block *predecessor = block->predecessors[i]; struct ir3_legalize_block_data *pbd = predecessor->data; @@ -109,8 +336,38 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) regmask_or(&state->needs_ss_war, &state->needs_ss_war, &pstate->needs_ss_war); regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy); + state->needs_ss_for_const |= pstate->needs_ss_for_const; + + /* Our nop state is the max of the predecessor blocks */ + for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++) + state->pred_ready[i] = MAX2(state->pred_ready[i], + pstate->pred_ready[i]); + for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) { + state->alu_nop.full_ready[i] = MAX2(state->alu_nop.full_ready[i], + pstate->alu_nop.full_ready[i]); + state->alu_nop.half_ready[i] = MAX2(state->alu_nop.half_ready[i], + pstate->alu_nop.half_ready[i]); + state->non_alu_nop.full_ready[i] = MAX2(state->non_alu_nop.full_ready[i], + pstate->non_alu_nop.full_ready[i]); + state->non_alu_nop.half_ready[i] = MAX2(state->non_alu_nop.half_ready[i], + pstate->non_alu_nop.half_ready[i]); + } } + /* We need to take physical-only edges into account when tracking shared + * registers.
+ */ + for (unsigned i = 0; i < block->physical_predecessors_count; i++) { + struct ir3_block *predecessor = block->physical_predecessors[i]; + struct ir3_legalize_block_data *pbd = predecessor->data; + struct ir3_legalize_state *pstate = &pbd->state; + + regmask_or_shared(&state->needs_ss, &state->needs_ss, &pstate->needs_ss); + } + + memcpy(&bd->state, state, sizeof(*state)); + state = &bd->state; + unsigned input_count = 0; foreach_instr (n, &block->instr_list) { @@ -125,7 +382,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) * with the end of the program. */ assert(input_count == 0 || !ctx->early_input_release || - block == ir3_start_block(block->shader)); + block == ir3_after_preamble(block->shader)); /* remove all the instructions from the list, we'll be adding * them back in as we go @@ -133,6 +390,8 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) list_replace(&block->instr_list, &instr_list); list_inithead(&block->instr_list); + unsigned cycle = 0; + foreach_instr_safe (n, &instr_list) { unsigned i; @@ -150,18 +409,14 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val); } - if (last_n && is_barrier(last_n)) { - n->flags |= IR3_INSTR_SS | IR3_INSTR_SY; + if ((last_n && is_barrier(last_n)) || n->opc == OPC_SHPE) { + apply_ss(n, state, mergedregs); + apply_sy(n, state, mergedregs); last_input_needs_ss = false; - regmask_init(&state->needs_ss_war, mergedregs); - regmask_init(&state->needs_ss, mergedregs); - regmask_init(&state->needs_sy, mergedregs); } if (last_n && (last_n->opc == OPC_PREDT)) { - n->flags |= IR3_INSTR_SS; - regmask_init(&state->needs_ss_war, mergedregs); - regmask_init(&state->needs_ss, mergedregs); + apply_ss(n, state, mergedregs); } /* NOTE: consider dst register too.. it could happen that @@ -184,37 +439,25 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) * some tests for both this and (sy).. */ if (regmask_get(&state->needs_ss, reg)) { - n->flags |= IR3_INSTR_SS; + apply_ss(n, state, mergedregs); last_input_needs_ss = false; - regmask_init(&state->needs_ss_war, mergedregs); - regmask_init(&state->needs_ss, mergedregs); } if (regmask_get(&state->needs_sy, reg)) { - n->flags |= IR3_INSTR_SY; - regmask_init(&state->needs_sy, mergedregs); + apply_sy(n, state, mergedregs); + } + } else if ((reg->flags & IR3_REG_CONST)) { + if (state->needs_ss_for_const) { + apply_ss(n, state, mergedregs); + last_input_needs_ss = false; } } - - /* TODO: is it valid to have address reg loaded from a - * relative src (ie. mova a0, c<a0.x+4>)? If so, the - * last_rel check below should be moved ahead of this: - */ - if (reg->flags & IR3_REG_RELATIV) - last_rel = n; } foreach_dst (reg, n) { if (regmask_get(&state->needs_ss_war, reg)) { - n->flags |= IR3_INSTR_SS; + apply_ss(n, state, mergedregs); last_input_needs_ss = false; - regmask_init(&state->needs_ss_war, mergedregs); - regmask_init(&state->needs_ss, mergedregs); - } - - if (last_rel && (reg->num == regid(REG_A0, 0))) { - last_rel->flags |= IR3_INSTR_UL; - last_rel = NULL; } } @@ -228,11 +471,40 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) nop = ir3_NOP(block); nop->flags |= IR3_INSTR_SS; n->flags &= ~IR3_INSTR_SS; + last_n = nop; + cycle++; + } + + unsigned delay = delay_calc(state, n, cycle); + + /* NOTE: I think the nopN encoding works for a5xx and + * probably a4xx, but not a3xx. So far only tested on + * a6xx. 
+ */ + + if ((delay > 0) && (ctx->compiler->gen >= 6) && last_n && + ((opc_cat(last_n->opc) == 2) || (opc_cat(last_n->opc) == 3)) && + (last_n->repeat == 0)) { + /* the previous cat2/cat3 instruction can encode at most 3 nop's: */ + unsigned transfer = MIN2(delay, 3 - last_n->nop); + last_n->nop += transfer; + delay -= transfer; + cycle += transfer; } - /* need to be able to set (ss) on first instruction: */ - if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5)) - ir3_NOP(block); + if ((delay > 0) && last_n && (last_n->opc == OPC_NOP)) { + /* the previous nop can encode at most 5 repeats: */ + unsigned transfer = MIN2(delay, 5 - last_n->repeat); + last_n->repeat += transfer; + delay -= transfer; + cycle += transfer; + } + + if (delay > 0) { + assert(delay <= 6); + ir3_NOP(block)->repeat = delay - 1; + cycle += delay; + } if (ctx->compiler->samgq_workaround && ctx->type != MESA_SHADER_FRAGMENT && @@ -255,6 +527,11 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) if (is_sfu(n)) regmask_set(&state->needs_ss, n->dsts[0]); + foreach_dst (dst, n) { + if (dst->flags & IR3_REG_SHARED) + regmask_set(&state->needs_ss, dst); + } + if (is_tex_or_prefetch(n)) { regmask_set(&state->needs_sy, n->dsts[0]); if (n->opc == OPC_META_TEX_PREFETCH) @@ -264,28 +541,25 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) ir3_NOP(block)->flags |= IR3_INSTR_SS; last_input_needs_ss = false; } else if (is_load(n)) { - /* seems like ldlv needs (ss) bit instead?? which is odd but - * makes a bunch of flat-varying tests start working on a4xx. - */ - if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL) || - (n->opc == OPC_LDLW)) + if (is_local_mem_load(n)) regmask_set(&state->needs_ss, n->dsts[0]); else regmask_set(&state->needs_sy, n->dsts[0]); } else if (is_atomic(n->opc)) { - if (n->flags & IR3_INSTR_G) { - if (ctx->compiler->gen >= 6) { - /* New encoding, returns result via second src: */ - regmask_set(&state->needs_sy, n->srcs[2]); - } else { - regmask_set(&state->needs_sy, n->dsts[0]); - } + if (is_bindless_atomic(n->opc)) { + regmask_set(&state->needs_sy, n->srcs[2]); + } else if (is_global_a3xx_atomic(n->opc) || + is_global_a6xx_atomic(n->opc)) { + regmask_set(&state->needs_sy, n->dsts[0]); } else { regmask_set(&state->needs_ss, n->dsts[0]); } + } else if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) { + state->needs_ss_for_const = true; } - if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G))) + if (is_ssbo(n->opc) || is_global_a3xx_atomic(n->opc) || + is_bindless_atomic(n->opc)) ctx->so->has_ssbo = true; /* both tex/sfu appear to not always immediately consume @@ -293,11 +567,18 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) */ if (is_tex(n) || is_sfu(n) || is_mem(n)) { foreach_src (reg, n) { - if (reg_gpr(reg)) - regmask_set(&state->needs_ss_war, reg); + regmask_set(&state->needs_ss_war, reg); } } + if (count_instruction(n)) + cycle += 1; + + delay_update(state, n, cycle, mergedregs); + + if (count_instruction(n)) + cycle += n->repeat; + if (ctx->early_input_release && is_input(n)) { last_input_needs_ss |= (n->opc == OPC_LDLV); @@ -326,9 +607,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) last_input->dsts[0]->flags |= IR3_REG_EI; if (last_input_needs_ss) { - last_input->flags |= IR3_INSTR_SS; - regmask_init(&state->needs_ss_war, mergedregs); - regmask_init(&state->needs_ss, mergedregs); + apply_ss(last_input, state, mergedregs); } } } @@ -338,7 +617,7 @@ legalize_block(struct ir3_legalize_ctx 
*ctx, struct ir3_block *block) assert(inputs_remaining == 0 || !ctx->early_input_release); - if (has_tex_prefetch && input_count == 0) { + if (has_tex_prefetch && !ctx->has_inputs) { /* texture prefetch, but *no* inputs.. we need to insert a * dummy bary.f at the top of the shader to unblock varying * storage: @@ -356,8 +635,23 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) list_add(&baryf->node, &block->instr_list); } - if (last_rel) - last_rel->flags |= IR3_INSTR_UL; + /* Currently our nop state contains the cycle offset from the start of this + * block when each register becomes ready. But successor blocks need the + * cycle offset from their start, which is this block's end. Translate the + * cycle offset. + */ + for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++) + state->pred_ready[i] = MAX2(state->pred_ready[i], cycle) - cycle; + for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) { + state->alu_nop.full_ready[i] = + MAX2(state->alu_nop.full_ready[i], cycle) - cycle; + state->alu_nop.half_ready[i] = + MAX2(state->alu_nop.half_ready[i], cycle) - cycle; + state->non_alu_nop.full_ready[i] = + MAX2(state->non_alu_nop.full_ready[i], cycle) - cycle; + state->non_alu_nop.half_ready[i] = + MAX2(state->non_alu_nop.half_ready[i], cycle) - cycle; + } bd->valid = true; @@ -382,8 +676,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) * dsxpp.1.p dst, src * * We apply this after flags syncing, as we don't want to sync in between the - * two (which might happen if dst == src). We do it before nop scheduling - * because that needs to count actual instructions. + * two (which might happen if dst == src). */ static bool apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block) @@ -405,13 +698,43 @@ apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block) struct ir3_instruction *op_p = ir3_instr_clone(n); op_p->flags = IR3_INSTR_P; - ctx->so->need_fine_derivatives = true; + ctx->so->need_full_quad = true; } } return true; } +static void +apply_push_consts_load_macro(struct ir3_legalize_ctx *ctx, + struct ir3_block *block) +{ + foreach_instr (n, &block->instr_list) { + if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) { + struct ir3_instruction *stsc = ir3_instr_create(block, OPC_STSC, 0, 2); + ir3_instr_move_after(stsc, n); + ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val = + n->push_consts.dst_base; + ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val = + n->push_consts.src_base; + stsc->cat6.iim_val = n->push_consts.src_size; + stsc->cat6.type = TYPE_U32; + + if (ctx->compiler->stsc_duplication_quirk) { + struct ir3_instruction *nop = ir3_NOP(block); + ir3_instr_move_after(nop, stsc); + nop->flags |= IR3_INSTR_SS; + ir3_instr_move_after(ir3_instr_clone(stsc), nop); + } + + list_delinit(&n->node); + break; + } else if (!is_meta(n)) { + break; + } + } +} + /* NOTE: branch instructions are always the last instruction(s) * in the block. We take advantage of this as we resolve the * branches, since "if (foo) break;" constructs turn into @@ -507,26 +830,21 @@ retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target) if (cur_block->successors[0] == old_target) { cur_block->successors[0] = new_target; } else { - debug_assert(cur_block->successors[1] == old_target); + assert(cur_block->successors[1] == old_target); cur_block->successors[1] = new_target; } - /* also update physical_successors.. 
we don't really need them at - * this stage, but it keeps ir3_validate happy: - */ - if (cur_block->physical_successors[0] == old_target) { - cur_block->physical_successors[0] = new_target; - } else { - debug_assert(cur_block->physical_successors[1] == old_target); - cur_block->physical_successors[1] = new_target; - } - /* update new target's predecessors: */ ir3_block_add_predecessor(new_target, cur_block); /* and remove old_target's predecessor: */ ir3_block_remove_predecessor(old_target, cur_block); + /* If we reconverged at the old target, we'll reconverge at the new target + * too: + */ + new_target->reconvergence_point |= old_target->reconvergence_point; + instr->cat0.target = new_target; if (old_target->predecessors_count == 0) { @@ -538,6 +856,21 @@ retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target) } static bool +is_invertible_branch(struct ir3_instruction *instr) +{ + switch (instr->opc) { + case OPC_BR: + case OPC_BRAA: + case OPC_BRAO: + case OPC_BANY: + case OPC_BALL: + return true; + default: + return false; + } +} + +static bool opt_jump(struct ir3 *ir) { bool progress = false; @@ -547,6 +880,12 @@ opt_jump(struct ir3 *ir) block->index = index++; foreach_block (block, &ir->block_list) { + /* This pass destroys the physical CFG so don't keep it around to avoid + * validation errors. + */ + block->physical_successors_count = 0; + block->physical_predecessors_count = 0; + foreach_instr (instr, &block->instr_list) { if (!is_flow(instr) || !instr->cat0.target) continue; @@ -581,13 +920,14 @@ opt_jump(struct ir3 *ir) if (jumps[0]->opc == OPC_JUMP) jumps[1] = NULL; - else if (jumps[0]->opc != OPC_B || !jumps[1] || jumps[1]->opc != OPC_B) + else if (!is_invertible_branch(jumps[0]) || !jumps[1] || + !is_invertible_branch(jumps[1])) { continue; + } for (unsigned i = 0; i < 2; i++) { if (!jumps[i]) continue; - struct ir3_block *tblock = jumps[i]->cat0.target; if (&tblock->node == block->node.next) { list_delinit(&jumps[i]->node); @@ -616,109 +956,214 @@ resolve_jumps(struct ir3 *ir) static void mark_jp(struct ir3_block *block) { + /* We only call this on the end block (in kill_sched) or after retargeting + * all jumps to empty blocks (in mark_xvergence_points) so there's no need to + * worry about empty blocks. + */ + assert(!list_is_empty(&block->instr_list)); + struct ir3_instruction *target = list_first_entry(&block->instr_list, struct ir3_instruction, node); target->flags |= IR3_INSTR_JP; } -/* Mark points where control flow converges or diverges. +/* Mark points where control flow reconverges. * - * Divergence points could actually be re-convergence points where - * "parked" threads are recoverged with threads that took the opposite - * path last time around. Possibly it is easier to think of (jp) as - * "the execution mask might have changed". + * Re-convergence points are where "parked" threads are reconverged with threads + * that took the opposite path last time around. We already calculated them, we + * just need to mark them with (jp). */ static void mark_xvergence_points(struct ir3 *ir) { foreach_block (block, &ir->block_list) { - if (block->predecessors_count > 1) { - /* if a block has more than one possible predecessor, then - * the first instruction is a convergence point. - */ + if (block->reconvergence_point) mark_jp(block); - } else if (block->predecessors_count == 1) { - /* If a block has one predecessor, which has multiple possible - * successors, it is a divergence point. 
- */ - for (unsigned i = 0; i < block->predecessors_count; i++) { - struct ir3_block *predecessor = block->predecessors[i]; - if (predecessor->successors[1]) { - mark_jp(block); - } - } - } } } +static void +invert_branch(struct ir3_instruction *branch) +{ + switch (branch->opc) { + case OPC_BR: + break; + case OPC_BALL: + branch->opc = OPC_BANY; + break; + case OPC_BANY: + branch->opc = OPC_BALL; + break; + case OPC_BRAA: + branch->opc = OPC_BRAO; + break; + case OPC_BRAO: + branch->opc = OPC_BRAA; + break; + default: + unreachable("can't get here"); + } + + branch->cat0.inv1 = !branch->cat0.inv1; + branch->cat0.inv2 = !branch->cat0.inv2; + branch->cat0.target = branch->block->successors[1]; +} + /* Insert the branch/jump instructions for flow control between blocks. * Initially this is done naively, without considering if the successor * block immediately follows the current block (ie. so no jump required), * but that is cleaned up in opt_jump(). - * - * TODO what ensures that the last write to p0.x in a block is the - * branch condition? Have we been getting lucky all this time? */ static void block_sched(struct ir3 *ir) { foreach_block (block, &ir->block_list) { + struct ir3_instruction *terminator = ir3_block_get_terminator(block); + if (block->successors[1]) { /* if/else, conditional branches to "then" or "else": */ struct ir3_instruction *br1, *br2; - if (block->brtype == IR3_BRANCH_GETONE) { - /* getone can't be inverted, and it wouldn't even make sense + assert(terminator); + unsigned opc = terminator->opc; + + if (opc == OPC_GETONE || opc == OPC_SHPS || opc == OPC_GETLAST) { + /* getone/shps can't be inverted, and it wouldn't even make sense * to follow it with an inverted branch, so follow it by an * unconditional branch. */ - debug_assert(!block->condition); - br1 = ir3_GETONE(block); + assert(terminator->srcs_count == 0); + br1 = terminator; br1->cat0.target = block->successors[1]; br2 = ir3_JUMP(block); br2->cat0.target = block->successors[0]; - } else { - debug_assert(block->condition); - + } else if (opc == OPC_BR || opc == OPC_BRAA || opc == OPC_BRAO || + opc == OPC_BALL || opc == OPC_BANY) { /* create "else" branch first (since "then" block should * frequently/always end up being a fall-thru): */ - br1 = ir3_instr_create(block, OPC_B, 0, 1); - ir3_src_create(br1, regid(REG_P0, 0), 0)->def = - block->condition->dsts[0]; - br1->cat0.inv1 = true; - br1->cat0.target = block->successors[1]; - - /* "then" branch: */ - br2 = ir3_instr_create(block, OPC_B, 0, 1); - ir3_src_create(br2, regid(REG_P0, 0), 0)->def = - block->condition->dsts[0]; + br1 = terminator; + br2 = ir3_instr_clone(br1); + invert_branch(br1); br2->cat0.target = block->successors[0]; + } else { + assert(opc == OPC_PREDT || opc == OPC_PREDF); - switch (block->brtype) { - case IR3_BRANCH_COND: - br1->cat0.brtype = br2->cat0.brtype = BRANCH_PLAIN; - break; - case IR3_BRANCH_ALL: - br1->cat0.brtype = BRANCH_ANY; - br2->cat0.brtype = BRANCH_ALL; - break; - case IR3_BRANCH_ANY: - br1->cat0.brtype = BRANCH_ALL; - br2->cat0.brtype = BRANCH_ANY; - break; - case IR3_BRANCH_GETONE: - unreachable("can't get here"); - } + /* Handled by prede_sched. */ + terminator->cat0.target = block->successors[0]; + continue; } + + /* Creating br2 caused it to be moved before the terminator br1, move it + * back.
+ */ + ir3_instr_move_after(br2, br1); } else if (block->successors[0]) { - /* otherwise unconditional jump to next block: */ - struct ir3_instruction *jmp; + /* otherwise unconditional jump or predt/predf to next block which + * should already have been inserted. + */ + assert(terminator); + assert(terminator->opc == OPC_JUMP || terminator->opc == OPC_PREDT || + terminator->opc == OPC_PREDF); + terminator->cat0.target = block->successors[0]; + } + } +} + +static void +prede_sched(struct ir3 *ir) +{ + unsigned index = 0; + foreach_block (block, &ir->block_list) + block->index = index++; - jmp = ir3_JUMP(block); - jmp->cat0.target = block->successors[0]; + foreach_block (block, &ir->block_list) { + /* Look for the following pattern generated by NIR lowering. The numbers + * at the top of blocks are their index. + * |--- i ----| + * | ... | + * | pred[tf] | + * |----------| + * succ0 / \ succ1 + * |-- i+1 ---| |-- i+2 ---| + * | ... | | ... | + * | pred[ft] | | ... | + * |----------| |----------| + * succ0 \ / succ0 + * |--- j ----| + * | ... | + * |----------| + */ + struct ir3_block *succ0 = block->successors[0]; + struct ir3_block *succ1 = block->successors[1]; + + if (!succ1) + continue; + + struct ir3_instruction *terminator = ir3_block_get_terminator(block); + if (!terminator) + continue; + if (terminator->opc != OPC_PREDT && terminator->opc != OPC_PREDF) + continue; + + assert(!succ0->successors[1] && !succ1->successors[1]); + assert(succ0->successors[0] == succ1->successors[0]); + assert(succ0->predecessors_count == 1 && succ1->predecessors_count == 1); + assert(succ0->index == (block->index + 1)); + assert(succ1->index == (block->index + 2)); + + struct ir3_instruction *succ0_terminator = + ir3_block_get_terminator(succ0); + assert(succ0_terminator); + assert(succ0_terminator->opc == + (terminator->opc == OPC_PREDT ? OPC_PREDF : OPC_PREDT)); + + ASSERTED struct ir3_instruction *succ1_terminator = + ir3_block_get_terminator(succ1); + assert(!succ1_terminator || (succ1_terminator->opc == OPC_JUMP)); + + /* Simple case: both successors contain instructions. Keep both blocks and + * insert prede before the second successor's terminator: + * |--- i ----| + * | ... | + * | pred[tf] | + * |----------| + * succ0 / \ succ1 + * |-- i+1 ---| |-- i+2 ---| + * | ... | | ... | + * | pred[ft] | | prede | + * |----------| |----------| + * succ0 \ / succ0 + * |--- j ----| + * | ... | + * |----------| + */ + if (!list_is_empty(&succ1->instr_list)) { + ir3_PREDE(succ1); + continue; } + + /* Second successor is empty so we can remove it: + * |--- i ----| + * | ... | + * | pred[tf] | + * |----------| + * succ0 / \ succ1 + * |-- i+1 ---| | + * | ... | | + * | prede | | + * |----------| | + * succ0 \ / + * |--- j ----| + * | ... 
| + * |----------| + */ + list_delinit(&succ0_terminator->node); + ir3_PREDE(succ0); + remove_unused_block(succ1); + block->successors[1] = succ0->successors[0]; + ir3_block_add_predecessor(succ0->successors[0], block); } } @@ -742,6 +1187,8 @@ block_sched(struct ir3 *ir) static void kill_sched(struct ir3 *ir, struct ir3_shader_variant *so) { + ir3_count_instructions(ir); + /* True if we know that this block will always eventually lead to the end * block: */ @@ -763,7 +1210,7 @@ kill_sched(struct ir3 *ir, struct ir3_shader_variant *so) if (instr->opc != OPC_KILL) continue; - struct ir3_instruction *br = ir3_instr_create(block, OPC_B, 0, 1); + struct ir3_instruction *br = ir3_instr_create(block, OPC_BR, 0, 1); ir3_src_create(br, instr->srcs[0]->num, instr->srcs[0]->flags)->wrmask = 1; br->cat0.target = @@ -790,51 +1237,243 @@ kill_sched(struct ir3 *ir, struct ir3_shader_variant *so) } } -/* Insert nop's required to make this a legal/valid shader program: */ static void -nop_sched(struct ir3 *ir, struct ir3_shader_variant *so) +dbg_sync_sched(struct ir3 *ir, struct ir3_shader_variant *so) { foreach_block (block, &ir->block_list) { - struct ir3_instruction *last = NULL; - struct list_head instr_list; + foreach_instr_safe (instr, &block->instr_list) { + if (opc_cat(instr->opc) == 4 || opc_cat(instr->opc) == 5 || + opc_cat(instr->opc) == 6) { + struct ir3_instruction *nop = ir3_NOP(block); + nop->flags |= IR3_INSTR_SS | IR3_INSTR_SY; + ir3_instr_move_after(nop, instr); + } + } + } +} - /* remove all the instructions from the list, we'll be adding - * them back in as we go - */ - list_replace(&block->instr_list, &instr_list); - list_inithead(&block->instr_list); +static void +dbg_nop_sched(struct ir3 *ir, struct ir3_shader_variant *so) +{ + foreach_block (block, &ir->block_list) { + foreach_instr_safe (instr, &block->instr_list) { + struct ir3_instruction *nop = ir3_NOP(block); + nop->repeat = 5; + ir3_instr_move_before(nop, instr); + } + } +} + +struct ir3_helper_block_data { + /* Whether helper invocations may be used on any path starting at the + * beginning of the block. + */ + bool uses_helpers_beginning; - foreach_instr_safe (instr, &instr_list) { - unsigned delay = ir3_delay_calc_exact(block, instr, so->mergedregs); + /* Whether helper invocations may be used by the end of the block. Branch + * instructions are considered to be "between" blocks, because (eq) has to be + * inserted after them in the successor blocks, so branch instructions using + * helpers will result in uses_helpers_end = true for their block. + */ + bool uses_helpers_end; +}; - /* NOTE: I think the nopN encoding works for a5xx and - * probably a4xx, but not a3xx. So far only tested on - * a6xx. - */ +/* Insert (eq) after the last instruction using the results of helper + * invocations. Use a backwards dataflow analysis to determine at which points + * in the program helper invocations are definitely never used, and then insert + * (eq) at the point where we cross from a point where they may be used to a + * point where they are never used. 
+ */ +static void +helper_sched(struct ir3_legalize_ctx *ctx, struct ir3 *ir, + struct ir3_shader_variant *so) +{ + bool non_prefetch_helpers = false; + + foreach_block (block, &ir->block_list) { + struct ir3_helper_block_data *bd = + rzalloc(ctx, struct ir3_helper_block_data); + foreach_instr (instr, &block->instr_list) { + if (uses_helpers(instr)) { + bd->uses_helpers_beginning = true; + if (instr->opc != OPC_META_TEX_PREFETCH) { + non_prefetch_helpers = true; + break; + } + } - if ((delay > 0) && (ir->compiler->gen >= 6) && last && - ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3)) && - (last->repeat == 0)) { - /* the previous cat2/cat3 instruction can encode at most 3 nop's: */ - unsigned transfer = MIN2(delay, 3 - last->nop); - last->nop += transfer; - delay -= transfer; + if (instr->opc == OPC_SHPE) { + /* (eq) is not allowed in preambles, mark the whole preamble as + * requiring helpers to avoid putting it there. + */ + bd->uses_helpers_beginning = true; + bd->uses_helpers_end = true; } + } - if ((delay > 0) && last && (last->opc == OPC_NOP)) { - /* the previous nop can encode at most 5 repeats: */ - unsigned transfer = MIN2(delay, 5 - last->repeat); - last->repeat += transfer; - delay -= transfer; + struct ir3_instruction *terminator = ir3_block_get_terminator(block); + if (terminator) { + if (terminator->opc == OPC_BALL || terminator->opc == OPC_BANY || + terminator->opc == OPC_GETONE) { + bd->uses_helpers_beginning = true; + bd->uses_helpers_end = true; } + } - if (delay > 0) { - debug_assert(delay <= 6); - ir3_NOP(block)->repeat = delay - 1; + block->data = bd; + } + + /* If only prefetches use helpers then we can disable them in the shader via + * a register setting. + */ + if (!non_prefetch_helpers) { + so->prefetch_end_of_quad = true; + return; + } + + bool progress; + do { + progress = false; + foreach_block_rev (block, &ir->block_list) { + struct ir3_helper_block_data *bd = block->data; + + if (!bd->uses_helpers_beginning) + continue; + + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ir3_helper_block_data *pred_bd = pred->data; + if (!pred_bd->uses_helpers_end) { + pred_bd->uses_helpers_end = true; + } + if (!pred_bd->uses_helpers_beginning) { + pred_bd->uses_helpers_beginning = true; + progress = true; + } } + } + } while (progress); - list_addtail(&instr->node, &block->instr_list); - last = instr; + /* Now, we need to determine the points where helper invocations become + * unused. + */ + foreach_block (block, &ir->block_list) { + struct ir3_helper_block_data *bd = block->data; + if (bd->uses_helpers_end) + continue; + + /* We need to check the predecessors because of situations with critical + * edges like this that can occur after optimizing jumps: + * + * br p0.x, #endif + * ... + * sam ... + * ... + * endif: + * ... + * end + * + * The endif block will have uses_helpers_beginning = false and + * uses_helpers_end = false, but because we jump to there from the + * beginning of the if where uses_helpers_end = true, we still want to + * add an (eq) at the beginning of the block: + * + * br p0.x, #endif + * ... + * sam ... + * (eq)nop + * ... + * endif: + * (eq)nop + * ... + * end + * + * This is an extra nop in the case where the branch isn't taken, but that's + * probably preferable to adding an extra jump instruction which is what + * would happen if we ran this pass before optimizing jumps: + * + * br p0.x, #else + * ... + * sam ... + * (eq)nop + * ...
* jump #endif + * else: + * (eq)nop + * endif: + * ... + * end + * + * We also need this to make sure we insert (eq) after branches which use + * helper invocations. + */ + bool pred_uses_helpers = bd->uses_helpers_beginning; + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ir3_helper_block_data *pred_bd = pred->data; + if (pred_bd->uses_helpers_end) { + pred_uses_helpers = true; + break; + } + } + + if (!pred_uses_helpers) + continue; + + /* The last use of helpers is somewhere between the beginning and the + * end. first_instr will be the first instruction where helpers are no + * longer required, or NULL if helpers are not required just at the end. + */ + struct ir3_instruction *first_instr = NULL; + foreach_instr_rev (instr, &block->instr_list) { + /* Skip prefetches because they actually execute before the block + * starts and at this stage they aren't guaranteed to be at the start + * of the block. + */ + if (uses_helpers(instr) && instr->opc != OPC_META_TEX_PREFETCH) + break; + first_instr = instr; + } + + bool killed = false; + bool expensive_instruction_in_block = false; + if (first_instr) { + foreach_instr_from (instr, first_instr, &block->instr_list) { + /* If there's already a nop, we don't have to worry about whether to + * insert one. + */ + if (instr->opc == OPC_NOP) { + instr->flags |= IR3_INSTR_EQ; + killed = true; + break; + } + + /* ALU and SFU instructions probably aren't going to benefit much + * from killing helper invocations, because they complete at least + * an entire quad in a cycle and don't access any quad-divergent + * memory, so delay emitting (eq) in the hopes that we find a nop + * afterwards. + */ + if (is_alu(instr) || is_sfu(instr)) + continue; + if (instr->opc == OPC_PREDE) + continue; + + expensive_instruction_in_block = true; + break; + } + } + + /* If this block isn't the last block before the end instruction, assume + * that there may be expensive instructions in later blocks so it's worth + * it to insert a nop. + */ + if (!killed && (expensive_instruction_in_block || + block->successors[0] != ir3_end_block(ir))) { + struct ir3_instruction *nop = ir3_NOP(block); + nop->flags |= IR3_INSTR_EQ; + if (first_instr) + ir3_instr_move_before(nop, first_instr); + } + } +} @@ -859,24 +1498,28 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary) regmask_init(&bd->state.needs_ss_war, mergedregs); regmask_init(&bd->state.needs_ss, mergedregs); regmask_init(&bd->state.needs_sy, mergedregs); + regmask_init(&bd->begin_state.needs_ss_war, mergedregs); + regmask_init(&bd->begin_state.needs_ss, mergedregs); + regmask_init(&bd->begin_state.needs_sy, mergedregs); block->data = bd; } - ir3_remove_nops(ir); - /* We may have failed to pull all input loads into the first block. * In such case at the moment we aren't able to find a better place * for (ei) than the end of the program. * a5xx and a6xx do automatically release varying storage at the end.
*/ ctx->early_input_release = true; - struct ir3_block *start_block = ir3_start_block(ir); + struct ir3_block *start_block = ir3_after_preamble(ir); foreach_block (block, &ir->block_list) { foreach_instr (instr, &block->instr_list) { - if (is_input(instr) && block != start_block) { - ctx->early_input_release = false; - break; + if (is_input(instr)) { + ctx->has_inputs = true; + if (block != start_block) { + ctx->early_input_release = false; + break; + } } } } @@ -893,6 +1536,14 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary) *max_bary = ctx->max_bary; + foreach_block (block, &ir->block_list) { + struct ir3_instruction *terminator = ir3_block_get_terminator(block); + if (terminator && terminator->opc == OPC_GETONE) { + apply_push_consts_load_macro(ctx, block->successors[0]); + break; + } + } + block_sched(ir); if (so->type == MESA_SHADER_FRAGMENT) kill_sched(ir, so); @@ -901,11 +1552,24 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary) progress |= apply_fine_deriv_macro(ctx, block); } - nop_sched(ir, so); + if (ir3_shader_debug & IR3_DBG_FULLSYNC) { + dbg_sync_sched(ir, so); + } + + if (ir3_shader_debug & IR3_DBG_FULLNOP) { + dbg_nop_sched(ir, so); + } while (opt_jump(ir)) ; + prede_sched(ir); + + /* TODO: does (eq) exist before a6xx? */ + if (so->type == MESA_SHADER_FRAGMENT && so->need_pixlod && + so->compiler->gen >= 6) + helper_sched(ctx, ir, so); + ir3_count_instructions(ir); resolve_jumps(ir);
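
A note on the mechanism above: the nop scheduling folded into legalize_block() revolves around the per-register ready-cycle tables consulted by delay_calc() and updated by delay_update(). Below is a minimal standalone sketch of that bookkeeping (an illustration, not driver code: registers are simplified to full regs written once, and only the 3-cycle ALU-to-ALU and 6-cycle general latencies from the constants above are modelled):

#include <stdbool.h>
#include <stdio.h>

#define NUM_REGS 192 /* 4 * 48 slots, as in ir3_nop_state */
#define MAX2(a, b) ((a) > (b) ? (a) : (b))

/* Cycle at which each full register becomes readable, split by consumer
 * type, mirroring alu_nop/non_alu_nop in ir3_legalize_state. */
struct ready_state {
   unsigned alu_ready[NUM_REGS];
   unsigned non_alu_ready[NUM_REGS];
};

/* How many nop cycles a consumer issuing at `cycle` needs before it may
 * read `reg` (compare delay_calc() above). */
static unsigned
ready_delay(const struct ready_state *st, unsigned reg, bool consumer_alu,
            unsigned cycle)
{
   unsigned ready = consumer_alu ? st->alu_ready[reg] : st->non_alu_ready[reg];
   return MAX2(ready, cycle) - cycle;
}

/* Record a write of `reg` at `cycle`: an ALU consumer of a same-size ALU
 * result may read it after 3 cycles, anything else after 6 (the same
 * constants delay_update() uses above). */
static void
ready_update(struct ready_state *st, unsigned reg, unsigned cycle)
{
   st->alu_ready[reg] = MAX2(st->alu_ready[reg], cycle + 3);
   st->non_alu_ready[reg] = MAX2(st->non_alu_ready[reg], cycle + 6);
}

int main(void)
{
   struct ready_state st = {{0}, {0}};
   unsigned cycle = 0;

   ready_update(&st, 5, cycle); /* r1.y (reg #5) written by an ALU op */
   cycle++;                     /* one instruction issues in between */

   /* An ALU consumer one cycle later needs 2 nops, a cat5 needs 5. */
   printf("alu consumer delay: %u\n", ready_delay(&st, 5, true, cycle));
   printf("tex consumer delay: %u\n", ready_delay(&st, 5, false, cycle));
   return 0;
}

Compiled standalone, this prints a 2-cycle delay for the ALU consumer and a 5-cycle delay for the tex consumer; that remainder is what the pass above folds into a previous cat2/cat3 instruction's nopN field, a previous nop's repeat count, or a fresh nop.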
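Similarly, helper_sched() places (eq) using a backwards may-analysis over uses_helpers_beginning/uses_helpers_end. The toy fixpoint below runs the same propagation rule on a hypothetical four-block if/else CFG (the CFG and its contents are invented for the example; only the propagation logic mirrors the pass):

#include <stdbool.h>
#include <stdio.h>

#define NBLOCKS 4

/* Toy CFG for an if/else: block 0 branches to 1 (then, contains a sam)
 * and 2 (else); both fall through to 3 (end). preds[b] lists the
 * predecessors of block b, -1 terminated. */
static const int preds[NBLOCKS][3] = {
   { -1 },       /* 0: entry */
   { 0, -1 },    /* 1: then, uses helpers (sam) */
   { 0, -1 },    /* 2: else */
   { 1, 2, -1 }, /* 3: merge/end */
};

int main(void)
{
   bool uses_beginning[NBLOCKS] = { false, true, false, false };
   bool uses_end[NBLOCKS] = { false };

   /* Backwards may-analysis to a fixpoint: if helpers may be used at a
    * block's beginning, they may be used at every predecessor's end, and
    * therefore at that predecessor's beginning too. */
   bool progress;
   do {
      progress = false;
      for (int b = NBLOCKS - 1; b >= 0; b--) {
         if (!uses_beginning[b])
            continue;
         for (int i = 0; preds[b][i] >= 0; i++) {
            int p = preds[b][i];
            uses_end[p] = true;
            if (!uses_beginning[p]) {
               uses_beginning[p] = true;
               progress = true;
            }
         }
      }
   } while (progress);

   /* Result: block 1 ends with uses_helpers_beginning set but
    * uses_helpers_end clear, so the may-use/never-used crossing is inside
    * block 1 and (eq) goes after its last helper-using instruction.
    * Block 2 starts false while its predecessor's end-state is true,
    * which is the critical-edge case described above: the pass also puts
    * an (eq) at its top. */
   for (int b = 0; b < NBLOCKS; b++)
      printf("block %d: beginning=%d end=%d\n", b, uses_beginning[b],
             uses_end[b]);
   return 0;
}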