/*
 * Copyright (c) 2012 Rob Clark
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3.h"

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "util/bitscan.h"
#include "util/ralloc.h"
#include "util/u_math.h"

#include "instr-a3xx.h"
#include "ir3_shader.h"

/* simple allocator to carve allocations out of an up-front allocated heap,
 * so that we can free everything easily in one shot.
 */
void *
ir3_alloc(struct ir3 *shader, int sz)
{
   return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
}

struct ir3 *
ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
{
   struct ir3 *shader = rzalloc(v, struct ir3);

   shader->compiler = compiler;
   shader->type = v->type;

   list_inithead(&shader->block_list);
   list_inithead(&shader->array_list);

   return shader;
}

void
ir3_destroy(struct ir3 *shader)
{
   ralloc_free(shader);
}

static void
collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
                 struct ir3_info *info)
{
   struct ir3_shader_variant *v = info->data;
   unsigned repeat = instr->repeat;

   if (reg->flags & IR3_REG_IMMED) {
      /* nothing to do */
      return;
   }

   if (!(reg->flags & IR3_REG_R)) {
      repeat = 0;
   }

   unsigned components;
   int16_t max;

   if (reg->flags & IR3_REG_RELATIV) {
      components = reg->size;
      max = (reg->array.base + components - 1);
   } else {
      components = util_last_bit(reg->wrmask);
      max = (reg->num + repeat + components - 1);
   }

   if (reg->flags & IR3_REG_CONST) {
      info->max_const = MAX2(info->max_const, max >> 2);
   } else if (max < regid(48, 0)) {
      if (reg->flags & IR3_REG_HALF) {
         if (v->mergedregs) {
            /* starting w/ a6xx, half regs conflict with full regs: */
            info->max_reg = MAX2(info->max_reg, max >> 3);
         } else {
            info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
         }
      } else {
         info->max_reg = MAX2(info->max_reg, max >> 2);
      }
   }
}
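/* Note on the shifts above: ir3 register numbers pack the component into the
 * low two bits (as implied by the regid(48, 0) bound), so "max >> 2" converts
 * a scalar register number into a vec4 register index.  For example, a write
 * reaching r1.z has num == 1 * 4 + 2 == 6, and 6 >> 2 == 1, i.e. vec4 reg r1.
 * With a merged half/full register file, two half vec4 regs alias one full
 * vec4 reg, hence "max >> 3" for half registers in that case.
 */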
bool
ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
{
   const struct ir3_compiler *compiler = v->shader->compiler;

   /* We can't support more than compiler->branchstack_size diverging threads
    * in a wave. Thus, doubling the threadsize is only possible if we don't
    * exceed the branchstack size limit.
    */
   if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
       compiler->branchstack_size) {
      return false;
   }

   switch (v->type) {
   case MESA_SHADER_COMPUTE: {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];

      /* For a5xx, if the workgroup size is greater than the maximum number
       * of threads per core with 32 threads per wave (512) then we have to
       * use the doubled threadsize because otherwise the workgroup wouldn't
       * fit. For smaller workgroup sizes, we follow the blob and use the
       * smaller threadsize.
       */
      if (compiler->gpu_id < 600) {
         return v->local_size_variable ||
                threads_per_wg >
                   compiler->threadsize_base * compiler->max_waves;
      }

      /* On a6xx, we prefer the larger threadsize unless the workgroup is
       * small enough that it would be useless. Note that because
       * threadsize_base is bumped to 64, we don't have to worry about the
       * workgroup fitting, unlike the a5xx case.
       */
      if (!v->local_size_variable) {
         if (threads_per_wg <= compiler->threadsize_base)
            return false;
      }
   }
      FALLTHROUGH;
   case MESA_SHADER_FRAGMENT: {
      /* Check that doubling the threadsize wouldn't exceed the regfile size */
      return regs_count * 2 <= compiler->reg_size_vec4;
   }

   default:
      /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
       * stages - the bit doesn't exist. The blob never used it for the VS
       * on earlier gen's anyway.
       */
      return false;
   }
}

/* Get the maximum number of waves that could be used even if this shader
 * didn't use any registers.
 */
unsigned
ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                  bool double_threadsize)
{
   const struct ir3_compiler *compiler = v->shader->compiler;
   unsigned max_waves = compiler->max_waves;

   /* If this is a compute shader, compute the limit based on shared size */
   if (v->type == MESA_SHADER_COMPUTE) {
      /* Shared is allocated in chunks of 1k */
      unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
      if (shared_per_wg > 0 && !v->local_size_variable) {
         unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
         unsigned threads_per_wg =
            v->local_size[0] * v->local_size[1] * v->local_size[2];
         unsigned waves_per_wg = DIV_ROUND_UP(
            threads_per_wg, compiler->threadsize_base *
                               (double_threadsize ? 2 : 1) *
                               compiler->wave_granularity);
         max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
                                        compiler->wave_granularity);
      }
   }

   /* Compute the limit based on branchstack */
   if (v->branchstack > 0) {
      unsigned branchstack_max_waves = compiler->branchstack_size /
                                       v->branchstack *
                                       compiler->wave_granularity;
      max_waves = MIN2(max_waves, branchstack_max_waves);
   }

   return max_waves;
}

/* Get the maximum number of waves that could be launched limited by reg size.
 */
unsigned
ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                unsigned reg_count, bool double_threadsize)
{
   return reg_count ? (compiler->reg_size_vec4 /
                       (reg_count * (double_threadsize ? 2 : 1)) *
                       compiler->wave_granularity)
                    : compiler->max_waves;
}
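/* Worked example of the register-dependent limit (hypothetical numbers, not
 * tied to any particular GPU): with reg_size_vec4 = 96, wave_granularity = 2
 * and a shader using 16 vec4 registers at single threadsize, the register
 * file alone allows (96 / 16) * 2 = 12 waves.  ir3_collect_info() below then
 * takes the minimum of this and the register-independent limit above.
 */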
void
ir3_collect_info(struct ir3_shader_variant *v)
{
   struct ir3_info *info = &v->info;
   struct ir3 *shader = v->ir;
   const struct ir3_compiler *compiler = v->shader->compiler;

   memset(info, 0, sizeof(*info));
   info->data = v;
   info->max_reg = -1;
   info->max_half_reg = -1;
   info->max_const = -1;
   info->multi_dword_ldp_stp = false;

   uint32_t instr_count = 0;
   foreach_block (block, &shader->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr_count++;
      }
   }

   v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);

   /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
    * doesn't try to decode the following data as instructions (such as the
    * next stage's shader in turnip)
    */
   info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
   info->sizedwords = info->size / 4;

   foreach_block (block, &shader->block_list) {
      int sfu_delay = 0;

      foreach_instr (instr, &block->instr_list) {

         foreach_src (reg, instr) {
            collect_reg_info(instr, reg, info);
         }

         if (writes_gpr(instr)) {
            collect_reg_info(instr, instr->regs[0], info);
         }

         if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
            struct ir3_register *base =
               (instr->opc == OPC_STP) ? instr->regs[3] : instr->regs[2];
            if (base->iim_val * type_size(instr->cat6.type) > 32) {
               info->multi_dword_ldp_stp = true;
            }
         }

         if ((instr->opc == OPC_BARY_F) && (instr->regs[0]->flags & IR3_REG_EI))
            info->last_baryf = info->instrs_count;

         unsigned instrs_count = 1 + instr->repeat + instr->nop;
         unsigned nops_count = instr->nop;

         if (instr->opc == OPC_NOP) {
            nops_count = 1 + instr->repeat;
            info->instrs_per_cat[0] += nops_count;
         } else {
            info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
            info->instrs_per_cat[0] += nops_count;
         }

         if (instr->opc == OPC_MOV) {
            if (instr->cat1.src_type == instr->cat1.dst_type) {
               info->mov_count += 1 + instr->repeat;
            } else {
               info->cov_count += 1 + instr->repeat;
            }
         }

         info->instrs_count += instrs_count;
         info->nops_count += nops_count;

         if (instr->flags & IR3_INSTR_SS) {
            info->ss++;
            info->sstall += sfu_delay;
            sfu_delay = 0;
         }

         if (instr->flags & IR3_INSTR_SY)
            info->sy++;

         if (is_sfu(instr)) {
            sfu_delay = 10;
         } else {
            int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
            sfu_delay -= n;
         }
      }
   }

   /* TODO: for a5xx and below, is there a separate regfile for
    * half-registers?
    */
   unsigned regs_count =
      info->max_reg + 1 +
      (compiler->gpu_id >= 600 ? ((info->max_half_reg + 2) / 2) : 0);

   info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
   unsigned reg_independent_max_waves =
      ir3_get_reg_independent_max_waves(v, info->double_threadsize);
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
      compiler, regs_count, info->double_threadsize);
   info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
   assert(info->max_waves <= v->shader->compiler->max_waves);
}
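/* Sizing example for the padding above (with a hypothetical instr_align of
 * 16): a shader with 21 instructions gets instrlen = DIV_ROUND_UP(21, 16) = 2,
 * and size = MAX2(2 * 16, 21 + 4) * 8 = 256 bytes (64 dwords), since each
 * encoded instruction is 64 bits wide.
 */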
static struct ir3_register *
reg_create(struct ir3 *shader, int num, int flags)
{
   struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
   reg->wrmask = 1;
   reg->flags = flags;
   reg->num = num;
   return reg;
}

static void
insert_instr(struct ir3_block *block, struct ir3_instruction *instr)
{
   struct ir3 *shader = block->shader;

   instr->serialno = ++shader->instr_count;

   list_addtail(&instr->node, &block->instr_list);

   if (is_input(instr))
      array_insert(shader, shader->baryfs, instr);
}

struct ir3_block *
ir3_block_create(struct ir3 *shader)
{
   struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
#ifdef DEBUG
   block->serialno = ++shader->block_count;
#endif
   block->shader = shader;
   list_inithead(&block->node);
   list_inithead(&block->instr_list);
   return block;
}

void
ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   array_insert(block, block->predecessors, pred);
}

void
ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         if (i < block->predecessors_count - 1) {
            block->predecessors[i] =
               block->predecessors[block->predecessors_count - 1];
         }

         block->predecessors_count--;
         return;
      }
   }
}

unsigned
ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         return i;
      }
   }

   unreachable("ir3_block_get_pred_index() invalid predecessor");
}

static struct ir3_instruction *
instr_create(struct ir3_block *block, opc_t opc, int nreg)
{
   /* Add an extra source for array destinations */
   if (1 <= opc_cat(opc))
      nreg++;

   struct ir3_instruction *instr;
   unsigned sz = sizeof(*instr) + (nreg * sizeof(instr->regs[0]));
   char *ptr = ir3_alloc(block->shader, sz);

   instr = (struct ir3_instruction *)ptr;
   ptr += sizeof(*instr);
   instr->regs = (struct ir3_register **)ptr;

#ifdef DEBUG
   instr->regs_max = nreg;
#endif

   return instr;
}

struct ir3_instruction *
ir3_instr_create(struct ir3_block *block, opc_t opc, int nreg)
{
   struct ir3_instruction *instr = instr_create(block, opc, nreg);
   instr->block = block;
   instr->opc = opc;
   insert_instr(block, instr);
   return instr;
}

struct ir3_instruction *
ir3_instr_clone(struct ir3_instruction *instr)
{
   struct ir3_instruction *new_instr =
      instr_create(instr->block, instr->opc, instr->regs_count);
   struct ir3_register **regs;
   unsigned i;

   regs = new_instr->regs;
   *new_instr = *instr;
   new_instr->regs = regs;

   insert_instr(instr->block, new_instr);

   /* clone registers: */
   new_instr->regs_count = 0;
   for (i = 0; i < instr->regs_count; i++) {
      struct ir3_register *reg = instr->regs[i];
      struct ir3_register *new_reg =
         ir3_reg_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
      if ((new_reg->flags & IR3_REG_DEST) && new_reg->instr)
         new_reg->instr = new_instr;
   }

   return new_instr;
}

/* Add a false dependency to instruction, to ensure it is scheduled first: */
void
ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
{
   for (unsigned i = 0; i < instr->deps_count; i++) {
      if (instr->deps[i] == dep)
         return;
   }

   array_insert(instr, instr->deps, dep);
}
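/* Illustrative usage of the builders above (a sketch, not code used by any
 * caller in this file): an instruction is created with room for its dst and
 * src registers, which are then appended in order, destination first, with
 * ir3_reg_create():
 *
 *    struct ir3_instruction *add = ir3_instr_create(block, OPC_ADD_F, 3);
 *    ir3_reg_create(add, regid(0, 0), 0);                  // dst:  r0.x
 *    ir3_reg_create(add, regid(1, 0), 0);                  // src1: r1.x
 *    ir3_reg_create(add, 0, IR3_REG_IMMED)->iim_val = 0;   // src2: immediate
 */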
struct ir3_register *
ir3_reg_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
   struct ir3_register *reg = reg_create(shader, num, flags);
#ifdef DEBUG
   debug_assert(instr->regs_count < instr->regs_max);
#endif
   instr->regs[instr->regs_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
{
   struct ir3_register *new_reg = reg_create(shader, 0, 0);
   *new_reg = *reg;
   return new_reg;
}

void
ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
                       struct ir3_register *last_write)
{
   assert(reg->flags & IR3_REG_ARRAY);
   assert(reg->flags & IR3_REG_DEST);
   struct ir3_register *new_reg = ir3_reg_create(instr, 0, 0);
   *new_reg = *reg;
   new_reg->flags &= ~IR3_REG_DEST;
   new_reg->def = last_write;
   ir3_reg_tie(reg, new_reg);
}

void
ir3_instr_set_address(struct ir3_instruction *instr,
                      struct ir3_instruction *addr)
{
   if (instr->address != addr) {
      struct ir3 *ir = instr->block->shader;

      debug_assert(!instr->address);
      debug_assert(instr->block == addr->block);

      instr->address = addr;
      debug_assert(reg_num(addr->regs[0]) == REG_A0);
      unsigned comp = reg_comp(addr->regs[0]);
      if (comp == 0) {
         array_insert(ir, ir->a0_users, instr);
      } else {
         debug_assert(comp == 1);
         array_insert(ir, ir->a1_users, instr);
      }
   }
}

void
ir3_block_clear_mark(struct ir3_block *block)
{
   foreach_instr (instr, &block->instr_list)
      instr->flags &= ~IR3_INSTR_MARK;
}

void
ir3_clear_mark(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      ir3_block_clear_mark(block);
   }
}

unsigned
ir3_count_instructions(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt;
   }
   return cnt;
}

/* When counting instructions for RA, we insert extra fake instructions at the
 * beginning of each block, where values become live, and at the end where
 * values die. This prevents problems where values live-in at the beginning or
 * live-out at the end of a block from being treated as if they were
 * live-in/live-out at the first/last instruction, which would be incorrect.
 * In ir3_legalize these ip's are assumed to be actual ip's of the final
 * program, so it would be incorrect to use this everywhere.
 */
unsigned
ir3_count_instructions_ra(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt++;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt++;
   }
   return cnt;
}

struct ir3_array *
ir3_lookup_array(struct ir3 *ir, unsigned id)
{
   foreach_array (arr, &ir->array_list)
      if (arr->id == id)
         return arr;
   return NULL;
}

void
ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
{
   /* We could do this in a single pass if we can assume instructions
    * are always sorted. Which currently might not always be true.
    * (In particular after ir3_group pass, but maybe other places.)
    */
   foreach_block (block, &ir->block_list)
      foreach_instr (instr, &block->instr_list)
         instr->uses = NULL;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         foreach_ssa_src_n (src, n, instr) {
            if (__is_false_dep(instr, n) && !falsedeps)
               continue;
            if (!src->uses)
               src->uses = _mesa_pointer_set_create(mem_ctx);
            _mesa_set_add(src->uses, instr);
         }
      }
   }
}
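/* The per-def 'uses' sets built above are typically walked with the util/set
 * iterator, e.g. (sketch):
 *
 *    set_foreach (def->uses, entry) {
 *       struct ir3_instruction *use = (void *)entry->key;
 *       ...
 *    }
 */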
/**
 * Set the destination type of an instruction, for example if a
 * conversion is folded in, handling the special cases where the
 * instruction's dest type or opcode needs to be fixed up.
 */
void
ir3_set_dst_type(struct ir3_instruction *instr, bool half)
{
   if (half) {
      instr->regs[0]->flags |= IR3_REG_HALF;
   } else {
      instr->regs[0]->flags &= ~IR3_REG_HALF;
   }

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (half) {
         instr->cat1.dst_type = half_type(instr->cat1.dst_type);
      } else {
         instr->cat1.dst_type = full_type(instr->cat1.dst_type);
      }
      break;
   case 4:
      if (half) {
         instr->opc = cat4_half_opc(instr->opc);
      } else {
         instr->opc = cat4_full_opc(instr->opc);
      }
      break;
   case 5:
      if (half) {
         instr->cat5.type = half_type(instr->cat5.type);
      } else {
         instr->cat5.type = full_type(instr->cat5.type);
      }
      break;
   }
}

/**
 * One-time fixup for instruction src-types. Other than cov's that
 * are folded, an instruction's src type does not change.
 */
void
ir3_fixup_src_type(struct ir3_instruction *instr)
{
   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (instr->regs[1]->flags & IR3_REG_HALF) {
         instr->cat1.src_type = half_type(instr->cat1.src_type);
      } else {
         instr->cat1.src_type = full_type(instr->cat1.src_type);
      }
      break;
   case 3:
      if (instr->regs[1]->flags & IR3_REG_HALF) {
         instr->opc = cat3_half_opc(instr->opc);
      } else {
         instr->opc = cat3_full_opc(instr->opc);
      }
      break;
   }
}

static unsigned
cp_flags(unsigned flags)
{
   /* only considering these flags (at least for now): */
   flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
             IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV);
   return flags;
}
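/* ir3_valid_flags() below answers whether source number 'n' of 'instr' could
 * carry the register flags in 'flags' (const, immed, abs/neg, relative, ...),
 * which is what copy-propagation needs to know before folding a value into
 * one of an instruction's sources.  For example, folding an immediate into a
 * cat2 ALU instruction is only legal if the instruction takes integer sources
 * and the other source is not already an immediate.
 */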
bool
ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
{
   struct ir3_compiler *compiler = instr->block->shader->compiler;
   unsigned valid_flags;

   if ((flags & IR3_REG_SHARED) && opc_cat(instr->opc) > 3)
      return false;

   flags = cp_flags(flags);

   /* If destination is indirect, then source cannot be.. at least
    * I don't think so..
    */
   if ((instr->regs[0]->flags & IR3_REG_RELATIV) && (flags & IR3_REG_RELATIV))
      return false;

   if (flags & IR3_REG_RELATIV) {
      /* TODO need to test on earlier gens.. pretty sure the earlier
       * problem was just that we didn't check that the src was from
       * same block (since we can't propagate address register values
       * across blocks currently)
       */
      if (compiler->gpu_id < 600)
         return false;

      /* NOTE in the special try_swap_mad_two_srcs() case we can be
       * called on a src that has already had an indirect load folded
       * in, in which case ssa() returns NULL
       */
      if (instr->regs[n + 1]->flags & IR3_REG_SSA) {
         struct ir3_instruction *src = ssa(instr->regs[n + 1]);
         if (src->address->block != instr->block)
            return false;
      }
   }

   if (is_meta(instr)) {
      /* collect and phi nodes support const/immed sources, which will be
       * turned into move instructions, but not anything else.
       */
      if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST))
         return false;

      return true;
   }

   switch (opc_cat(instr->opc)) {
   case 0: /* end, chmask */
      return flags == 0;
   case 1:
      valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV;
      if (flags & ~valid_flags)
         return false;
      break;
   case 2:
      valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
                    IR3_REG_RELATIV;

      if (ir3_cat2_int(instr->opc))
         valid_flags |= IR3_REG_IMMED;

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
         unsigned m = (n ^ 1) + 1;
         /* cannot deal w/ const in both srcs:
          * (note that some cat2 actually only have a single src)
          */
         if (m < instr->regs_count) {
            struct ir3_register *reg = instr->regs[m];
            if ((flags & IR3_REG_CONST) && (reg->flags & IR3_REG_CONST))
               return false;
            if ((flags & IR3_REG_IMMED) && (reg->flags & IR3_REG_IMMED))
               return false;
         }
      }
      break;
   case 3:
      valid_flags = ir3_cat3_absneg(instr->opc) | IR3_REG_CONST |
                    IR3_REG_RELATIV;

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_RELATIV)) {
         /* cannot deal w/ const/relativ in 2nd src: */
         if (n == 1)
            return false;
      }
      break;
   case 4:
      /* seems like blob compiler avoids const as src.. */
      /* TODO double check if this is still the case on a4xx */
      if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
         return false;
      if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
         return false;
      break;
   case 5:
      /* no flags allowed */
      if (flags)
         return false;
      break;
   case 6:
      valid_flags = IR3_REG_IMMED;
      if (flags & ~valid_flags)
         return false;

      if (flags & IR3_REG_IMMED) {
         /* doesn't seem like we can have immediate src for store
          * instructions:
          *
          * TODO this restriction could also apply to load instructions,
          * but for load instructions this arg is the address (and not
          * really sure any good way to test a hard-coded immed addr src)
          */
         if (is_store(instr) && (n == 1))
            return false;

         if ((instr->opc == OPC_LDL) && (n == 0))
            return false;

         if ((instr->opc == OPC_STL) && (n != 2))
            return false;

         if ((instr->opc == OPC_LDP) && (n == 0))
            return false;

         if ((instr->opc == OPC_STP) && (n != 2))
            return false;

         if (instr->opc == OPC_STLW && n == 0)
            return false;

         if (instr->opc == OPC_LDLW && n == 0)
            return false;

         /* disallow immediates in anything but the SSBO slot argument for
          * cat6 instructions:
          */
         if (is_atomic(instr->opc) && (n != 0))
            return false;

         if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
            return false;

         if (instr->opc == OPC_STG && (instr->flags & IR3_INSTR_G) && (n != 2))
            return false;

         /* as with atomics, these cat6 instrs can only have an immediate
          * for SSBO/IBO slot argument
          */
         switch (instr->opc) {
         case OPC_LDIB:
         case OPC_STIB:
         case OPC_LDC:
         case OPC_RESINFO:
            if (n != 0)
               return false;
            break;
         default:
            break;
         }
      }

      break;
   }

   return true;
}
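/* Example of the cat3 restriction above: for a three-source instruction such
 * as mad.f32, a const or relative source can only be propagated into src0 or
 * src2 (n == 0 or 2); if the value would land in src1, copy-prop has to leave
 * it in a GPR.
 */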